<para>
<programlisting>
Size
-amestimateparallelscan (void);
+amestimateparallelscan (int nkeys,
+ int norderbys);
</programlisting>
Estimate and return the number of bytes of dynamic shared memory which
the access method will need to perform a parallel scan. (This number
is in addition to the amount of space needed for
AM-independent data in <structname>ParallelIndexScanDescData</structname>.)
</para>
+ <para>
+ The <literal>nkeys</literal> and <literal>norderbys</literal>
+ parameters indicate the number of quals and ordering operators that will be
+ used in the scan; the same values will be passed to <function>amrescan</function>.
+ Note that the actual values of the scan keys aren't provided yet.
+ </para>
+
<para>
It is not necessary to implement this function for access methods which
do not support parallel scans or for which the number of additional bytes
</para>
</note>
+ <note>
+ <para>
+ Queries that use certain <acronym>SQL</acronym> constructs to search for
+ rows matching any value out of a list or array of multiple scalar values
+ (see <xref linkend="functions-comparisons"/>) perform multiple
+ <quote>primitive</quote> index scans (up to one primitive scan per scalar
+ value) during query execution. Each internal primitive index scan
+ increments <structname>pg_stat_all_indexes</structname>.<structfield>idx_scan</structfield>,
+ so it's possible for the count of index scans to significantly exceed the
+ total number of index scan executor node executions.
+ </para>
+ </note>
+
</sect2>
<sect2 id="monitoring-pg-statio-all-tables-view">
/*
* index_parallelscan_estimate - estimate shared memory for parallel scan
- *
- * Currently, we don't pass any information to the AM-specific estimator,
- * so it can probably only return a constant. In the future, we might need
- * to pass more information.
*/
Size
-index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot)
+index_parallelscan_estimate(Relation indexRelation, int nkeys, int norderbys,
+ Snapshot snapshot)
{
Size nbytes;
*/
if (indexRelation->rd_indam->amestimateparallelscan != NULL)
nbytes = add_size(nbytes,
- indexRelation->rd_indam->amestimateparallelscan());
+ indexRelation->rd_indam->amestimateparallelscan(nkeys,
+ norderbys));
return nbytes;
}
/*
* BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
*
+ * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
+ * scan to advance it via another call to _bt_first.
+ *
* BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
* a new page; others must wait.
*
* to a new page; some process can start doing that.
*
* BTPARALLEL_DONE indicates that the scan is complete (including error exit).
- * We reach this state once for every distinct combination of array keys.
*/
typedef enum
{
BTPARALLEL_NOT_INITIALIZED,
+ BTPARALLEL_NEED_PRIMSCAN,
BTPARALLEL_ADVANCING,
BTPARALLEL_IDLE,
BTPARALLEL_DONE,
BTPS_State btps_pageStatus; /* indicates whether next page is
* available for scan. see above for
* possible states of parallel scan. */
- int btps_arrayKeyCount; /* count indicating number of array scan
- * keys processed by parallel scan */
- slock_t btps_mutex; /* protects above variables */
+ slock_t btps_mutex; /* protects above variables, btps_arrElems */
ConditionVariable btps_cv; /* used to synchronize parallel scan */
+
+ /*
+ * btps_arrElems is used when scans need to schedule another primitive
+ * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
+ */
+ int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
} BTParallelScanDescData;
typedef struct BTParallelScanDescData *BTParallelScanDesc;
/* btree indexes are never lossy */
scan->xs_recheck = false;
- /*
- * If we have any array keys, initialize them during first call for a
- * scan. We can't do this in btrescan because we don't know the scan
- * direction at that time.
- */
- if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
- {
- /* punt if we have any unsatisfiable array keys */
- if (so->numArrayKeys < 0)
- return false;
-
- _bt_start_array_keys(scan, dir);
- }
-
- /* This loop handles advancing to the next array elements, if any */
+ /* Each loop iteration performs another primitive index scan */
do
{
/*
/* If we have a tuple, return it ... */
if (res)
break;
- /* ... otherwise see if we have more array keys to deal with */
- } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
+ /* ... otherwise see if we need another primitive index scan */
+ } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
return res;
}
int64 ntids = 0;
ItemPointer heapTid;
- /*
- * If we have any array keys, initialize them.
- */
- if (so->numArrayKeys)
- {
- /* punt if we have any unsatisfiable array keys */
- if (so->numArrayKeys < 0)
- return ntids;
-
- _bt_start_array_keys(scan, ForwardScanDirection);
- }
-
- /* This loop handles advancing to the next array elements, if any */
+ /* Each loop iteration performs another primitive index scan */
do
{
/* Fetch the first page & tuple */
ntids++;
}
}
- /* Now see if we have more array keys to deal with */
- } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
+ /* Now see if we need another primitive index scan */
+ } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
return ntids;
}
else
so->keyData = NULL;
- so->arrayKeyData = NULL; /* assume no array keys for now */
- so->arraysStarted = false;
- so->numArrayKeys = 0;
+ so->needPrimScan = false;
+ so->scanBehind = false;
so->arrayKeys = NULL;
+ so->orderProcs = NULL;
so->arrayContext = NULL;
so->killedItems = NULL; /* until needed */
}
so->markItemIndex = -1;
- so->arrayKeyCount = 0;
+ so->needPrimScan = false;
+ so->scanBehind = false;
BTScanPosUnpinIfPinned(so->markPos);
BTScanPosInvalidate(so->markPos);
scankey,
scan->numberOfKeys * sizeof(ScanKeyData));
so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
-
- /* If any keys are SK_SEARCHARRAY type, set up array-key info */
- _bt_preprocess_array_keys(scan);
+ so->numArrayKeys = 0; /* ditto */
}
/*
/* Release storage */
if (so->keyData != NULL)
pfree(so->keyData);
- /* so->arrayKeyData and so->arrayKeys are in arrayContext */
+ /* so->arrayKeys and so->orderProcs are in arrayContext */
if (so->arrayContext != NULL)
MemoryContextDelete(so->arrayContext);
if (so->killedItems != NULL)
BTScanPosInvalidate(so->markPos);
so->markItemIndex = -1;
}
-
- /* Also record the current positions of any array keys */
- if (so->numArrayKeys)
- _bt_mark_array_keys(scan);
}
/*
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- /* Restore the marked positions of any array keys */
- if (so->numArrayKeys)
- _bt_restore_array_keys(scan);
-
if (so->markItemIndex >= 0)
{
/*
if (so->currTuples)
memcpy(so->currTuples, so->markTuples,
so->markPos.nextTupleOffset);
+ /* Reset the scan's array keys (see _bt_steppage for why) */
+ if (so->numArrayKeys)
+ {
+ _bt_start_array_keys(scan, so->currPos.dir);
+ so->needPrimScan = false;
+ }
}
else
BTScanPosInvalidate(so->currPos);
* btestimateparallelscan -- estimate storage for BTParallelScanDescData
*/
Size
-btestimateparallelscan(void)
+btestimateparallelscan(int nkeys, int norderbys)
{
-	return sizeof(BTParallelScanDescData);
+	/* Pessimistically assume all input scankeys will be output with arrays */
+	return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
+
+	/*
+	 * Note: norderbys is accepted for interface symmetry with the revised
+	 * amestimateparallelscan signature; the estimate above uses only nkeys.
+	 */
}
/*
SpinLockInit(&bt_target->btps_mutex);
bt_target->btps_scanPage = InvalidBlockNumber;
bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- bt_target->btps_arrayKeyCount = 0;
ConditionVariableInit(&bt_target->btps_cv);
}
SpinLockAcquire(&btscan->btps_mutex);
btscan->btps_scanPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- btscan->btps_arrayKeyCount = 0;
SpinLockRelease(&btscan->btps_mutex);
}
* or _bt_parallel_done().
*
* The return value is true if we successfully seized the scan and false
- * if we did not. The latter case occurs if no pages remain for the current
- * set of scankeys.
+ * if we did not. The latter case occurs if no pages remain.
*
* If the return value is true, *pageno returns the next or current page
* of the scan (depending on the scan direction). An invalid block number
- * means the scan hasn't yet started, and P_NONE means we've reached the end.
+ * means the scan hasn't yet started, or that caller needs to start the next
+ * primitive index scan (if it's the latter case we'll set so.needPrimScan).
* The first time a participating process reaches the last page, it will return
* true and set *pageno to P_NONE; after that, further attempts to seize the
* scan will return false.
*
* Callers should ignore the value of pageno if the return value is false.
+ *
+ * Callers that are in a position to start a new primitive index scan must
+ * pass first=true (all other callers pass first=false). We just return false
+ * for first=false callers that require another primitive index scan.
*/
bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- BTPS_State pageStatus;
bool exit_loop = false;
bool status = true;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
*pageno = P_NONE;
+ if (first)
+ {
+ /*
+ * Initialize array related state when called from _bt_first, assuming
+ * that this will either be the first primitive index scan for the
+ * scan, or a previous explicitly scheduled primitive scan.
+ *
+ * Note: so->needPrimScan is only set when a scheduled primitive index
+ * scan is set to be performed in caller's worker process. It should
+ * not be set here by us for the first primitive scan, nor should we
+ * ever set it for a parallel scan that has no array keys.
+ */
+ so->needPrimScan = false;
+ so->scanBehind = false;
+ }
+ else
+ {
+ /*
+ * Don't attempt to seize the scan when backend requires another
+ * primitive index scan unless we're in a position to start it now
+ */
+ if (so->needPrimScan)
+ return false;
+ }
+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
while (1)
{
SpinLockAcquire(&btscan->btps_mutex);
- pageStatus = btscan->btps_pageStatus;
- if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
+ if (btscan->btps_pageStatus == BTPARALLEL_DONE)
{
- /* Parallel scan has already advanced to a new set of scankeys. */
+ /* We're done with this parallel index scan */
status = false;
}
- else if (pageStatus == BTPARALLEL_DONE)
+ else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
{
+ Assert(so->numArrayKeys);
+
/*
- * We're done with this set of scankeys. This may be the end, or
- * there could be more sets to try.
+ * If we can start another primitive scan right away, do so.
+ * Otherwise just wait.
*/
- status = false;
+ if (first)
+ {
+ btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
+ for (int i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *array = &so->arrayKeys[i];
+ ScanKey skey = &so->keyData[array->scan_key];
+
+ array->cur_elem = btscan->btps_arrElems[i];
+ skey->sk_argument = array->elem_values[array->cur_elem];
+ }
+ so->needPrimScan = true;
+ so->scanBehind = false;
+ *pageno = InvalidBlockNumber;
+ exit_loop = true;
+ }
}
- else if (pageStatus != BTPARALLEL_ADVANCING)
+ else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
{
/*
* We have successfully seized control of the scan for the purpose
* _bt_parallel_release() -- Complete the process of advancing the scan to a
* new page. We now have the new value btps_scanPage; some other backend
* can now begin advancing the scan.
+ *
+ * Callers whose scan uses array keys must save their scan_page argument so
+ * that it can be passed to _bt_parallel_primscan_schedule, should caller
+ * determine that another primitive index scan is required. If that happens,
+ * scan_page won't be scanned by any backend (unless the next primitive index
+ * scan lands on scan_page).
*/
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
void
_bt_parallel_done(IndexScanDesc scan)
{
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
bool status_changed = false;
parallel_scan->ps_offset);
/*
- * Mark the parallel scan as done for this combination of scan keys,
- * unless some other process already did so. See also
- * _bt_advance_array_keys.
+ * Mark the parallel scan as done, unless some other process did so
+ * already
*/
SpinLockAcquire(&btscan->btps_mutex);
- if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
- btscan->btps_pageStatus != BTPARALLEL_DONE)
+ if (btscan->btps_pageStatus != BTPARALLEL_DONE)
{
btscan->btps_pageStatus = BTPARALLEL_DONE;
status_changed = true;
}
/*
- * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
- * keys.
+ * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
*
- * Updates the count of array keys processed for both local and parallel
- * scans.
+ * Caller passes the block number most recently passed to _bt_parallel_release
+ * by its backend. Caller successfully schedules the next primitive index scan
+ * if the shared parallel state hasn't been seized since caller's backend last
+ * advanced the scan.
*/
void
-_bt_parallel_advance_array_keys(IndexScanDesc scan)
+_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;
+	/* Only scans with array keys ever schedule another primitive scan */
+	Assert(so->numArrayKeys);
+
	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
											  parallel_scan->ps_offset);
-	so->arrayKeyCount++;
	SpinLockAcquire(&btscan->btps_mutex);
-	if (btscan->btps_pageStatus == BTPARALLEL_DONE)
+	/*
+	 * Scheduling only succeeds when the shared state still shows the page
+	 * this backend last released (i.e. no other backend has seized the scan
+	 * since then); otherwise we silently do nothing, per the header comment.
+	 */
+	if (btscan->btps_scanPage == prev_scan_page &&
+		btscan->btps_pageStatus == BTPARALLEL_IDLE)
	{
		btscan->btps_scanPage = InvalidBlockNumber;
-		btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
-		btscan->btps_arrayKeyCount++;
+		btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
+
+		/* Serialize scan's current array keys */
+		for (int i = 0; i < so->numArrayKeys; i++)
+		{
+			BTArrayKeyInfo *array = &so->arrayKeys[i];
+
+			btscan->btps_arrElems[i] = array->cur_elem;
+		}
	}
	SpinLockRelease(&btscan->btps_mutex);
}
*/
if (!so->qual_ok)
{
- /* Notify any other workers that we're done with this scan key. */
_bt_parallel_done(scan);
return false;
}
* scan has not started, proceed to find out first leaf page in the usual
* way while keeping other participating processes waiting. If the scan
* has already begun, use the page number from the shared structure.
+ *
+ * When a parallel scan has another primitive index scan scheduled, a
+ * parallel worker will seize the scan for that purpose now. This is
+ * similar to the case where the top-level scan hasn't started.
*/
if (scan->parallel_scan != NULL)
{
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, true);
+
+ /*
+ * Initialize arrays (when _bt_parallel_seize didn't already set up
+ * the next primitive index scan)
+ */
+ if (so->numArrayKeys && !so->needPrimScan)
+ _bt_start_array_keys(scan, dir);
+
if (!status)
return false;
else if (blkno == P_NONE)
goto readcomplete;
}
}
+ else if (so->numArrayKeys && !so->needPrimScan)
+ {
+ /*
+ * First _bt_first call (for current btrescan) without parallelism.
+ *
+ * Initialize arrays, and the corresponding scan keys that were just
+ * output by _bt_preprocess_keys.
+ */
+ _bt_start_array_keys(scan, dir);
+ }
/*----------
* Examine the scan keys to discover where we need to start the scan.
*
* The selected scan keys (at most one per index column) are remembered by
* storing their addresses into the local startKeys[] array.
+ *
+ * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
+ * the next primitive index scan (for scans with array keys) based in part
+ * on an understanding of how it'll enable us to reposition the scan.
+ * They're directly aware of how we'll sometimes cons up an explicit
+ * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
+ * symmetric "deduce NOT NULL" rule of their own. This allows top-level
+ * scans to skip large groups of NULLs through repeated deductions about
+ * key strictness (for a required inequality key) and whether NULLs in the
+ * key's index column are stored last or first (relative to non-NULLs).
+ * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
+ * need to be kept in sync.
*----------
*/
strat_total = BTEqualStrategyNumber;
* We scan the current page starting at offnum and moving in the indicated
* direction. All items matching the scan keys are loaded into currPos.items.
* moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
- * that there can be no more matching tuples in the current scan direction.
+ * that there can be no more matching tuples in the current scan direction
+ * (could just be for the current primitive index scan when scan has arrays).
*
* _bt_first caller passes us an offnum returned by _bt_binsrch, which might
* be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
- int itemIndex;
- bool continuescan;
- int indnatts;
- bool continuescanPrechecked;
- bool haveFirstMatch = false;
+ BTReadPageState pstate;
+ bool arrayKeys;
+ int itemIndex,
+ indnatts;
/*
* We must have the buffer pinned and locked, but the usual macro can't be
if (scan->parallel_scan)
{
if (ScanDirectionIsForward(dir))
- _bt_parallel_release(scan, opaque->btpo_next);
+ pstate.prev_scan_page = opaque->btpo_next;
else
- _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+ pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
+
+ _bt_parallel_release(scan, pstate.prev_scan_page);
}
- continuescan = true; /* default assumption */
indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
+ arrayKeys = so->numArrayKeys != 0;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
+ /* initialize page-level state that we'll pass to _bt_checkkeys */
+ pstate.dir = dir;
+ pstate.minoff = minoff;
+ pstate.maxoff = maxoff;
+ pstate.finaltup = NULL;
+ pstate.page = page;
+ pstate.offnum = InvalidOffsetNumber;
+ pstate.skip = InvalidOffsetNumber;
+ pstate.continuescan = true; /* default assumption */
+ pstate.prechecked = false;
+ pstate.firstmatch = false;
+ pstate.rechecks = 0;
+ pstate.targetdistance = 0;
+
/*
* We note the buffer's block number so that we can release the pin later.
* This allows us to re-read the buffer if it is needed again for hinting.
* corresponding value from the last item on the page. So checking with
* the last item on the page would give a more precise answer.
*
- * We skip this for the first page in the scan to evade the possible
- * slowdown of the point queries.
+ * We skip this for the first page read by each (primitive) scan, to avoid
+ * slowing down point queries. They typically don't stand to gain much
+ * when the optimization can be applied, and are more likely to notice the
+ * overhead of the precheck.
+ *
+ * The optimization is unsafe and must be avoided whenever _bt_checkkeys
+ * just set a low-order required array's key to the best available match
+ * for a truncated -inf attribute value from the prior page's high key
+ * (array element 0 is always the best available match in this scenario).
+ * It's quite likely that matches for array element 0 begin on this page,
+ * but the start of matches won't necessarily align with page boundaries.
+ * When the start of matches is somewhere in the middle of this page, it
+ * would be wrong to treat page's final non-pivot tuple as representative.
+ * Doing so might lead us to treat some of the page's earlier tuples as
+ * being part of a group of tuples thought to satisfy the required keys.
+ *
+ * Note: Conversely, in the case where the scan's arrays just advanced
+ * using the prior page's HIKEY _without_ advancement setting scanBehind,
+ * the start of matches must be aligned with page boundaries, which makes
+ * it safe to attempt the optimization here now. It's also safe when the
+ * prior page's HIKEY simply didn't need to advance any required array. In
+ * both cases we can safely assume that the _first_ tuple from this page
+ * must be >= the current set of array keys/equality constraints. And so
+ * if the final tuple is == those same keys (and also satisfies any
+ * required < or <= strategy scan keys) during the precheck, we can safely
+ * assume that this must also be true of all earlier tuples from the page.
*/
- if (!firstPage && minoff < maxoff)
+ if (!firstPage && !so->scanBehind && minoff < maxoff)
{
ItemId iid;
IndexTuple itup;
iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
itup = (IndexTuple) PageGetItem(page, iid);
- /*
- * Do the precheck. Note that we pass the pointer to the
- * 'continuescanPrechecked' to the 'continuescan' argument. That will
- * set flag to true if all required keys are satisfied and false
- * otherwise.
- */
- (void) _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescanPrechecked, false, false);
- }
- else
- {
- continuescanPrechecked = false;
+ /* Call with arrayKeys=false to avoid undesirable side-effects */
+ _bt_checkkeys(scan, &pstate, false, itup, indnatts);
+ pstate.prechecked = pstate.continuescan;
+ pstate.continuescan = true; /* reset */
}
if (ScanDirectionIsForward(dir))
{
+ /* SK_SEARCHARRAY forward scans must provide high key up front */
+ if (arrayKeys && !P_RIGHTMOST(opaque))
+ {
+ ItemId iid = PageGetItemId(page, P_HIKEY);
+
+ pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+ }
+
/* load items[] in ascending order */
itemIndex = 0;
itup = (IndexTuple) PageGetItem(page, iid);
Assert(!BTreeTupleIsPivot(itup));
- passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan,
- continuescanPrechecked,
- haveFirstMatch);
+ pstate.offnum = offnum;
+ passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
+ itup, indnatts);
/*
- * If the result of prechecking required keys was true, then in
- * assert-enabled builds we also recheck that the _bt_checkkeys()
- * result is the same.
+ * Check if we need to skip ahead to a later tuple (only possible
+ * when the scan uses array keys)
*/
- Assert((!continuescanPrechecked && haveFirstMatch) ||
- passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan, false, false));
+ if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+ {
+ Assert(!passes_quals && pstate.continuescan);
+ Assert(offnum < pstate.skip);
+
+ offnum = pstate.skip;
+ pstate.skip = InvalidOffsetNumber;
+ continue;
+ }
+
if (passes_quals)
{
/* tuple passes all scan key conditions */
- haveFirstMatch = true;
+ pstate.firstmatch = true;
if (!BTreeTupleIsPosting(itup))
{
/* Remember it */
}
}
/* When !continuescan, there can't be any more matches, so stop */
- if (!continuescan)
+ if (!pstate.continuescan)
break;
offnum = OffsetNumberNext(offnum);
* only appear on non-pivot tuples on the right sibling page are
* common.
*/
- if (continuescan && !P_RIGHTMOST(opaque))
+ if (pstate.continuescan && !P_RIGHTMOST(opaque))
{
ItemId iid = PageGetItemId(page, P_HIKEY);
IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
int truncatt;
truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
- _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false, false);
+ pstate.prechecked = false; /* precheck didn't cover HIKEY */
+ _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
}
- if (!continuescan)
+ if (!pstate.continuescan)
so->currPos.moreRight = false;
Assert(itemIndex <= MaxTIDsPerBTreePage);
}
else
{
+ /* SK_SEARCHARRAY backward scans must provide final tuple up front */
+ if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
+ {
+ ItemId iid = PageGetItemId(page, minoff);
+
+ pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+ }
+
/* load items[] in descending order */
itemIndex = MaxTIDsPerBTreePage;
itup = (IndexTuple) PageGetItem(page, iid);
Assert(!BTreeTupleIsPivot(itup));
- passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan,
- continuescanPrechecked,
- haveFirstMatch);
+ pstate.offnum = offnum;
+ passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
+ itup, indnatts);
/*
- * If the result of prechecking required keys was true, then in
- * assert-enabled builds we also recheck that the _bt_checkkeys()
- * result is the same.
+ * Check if we need to skip ahead to a later tuple (only possible
+ * when the scan uses array keys)
*/
- Assert((!continuescanPrechecked && !haveFirstMatch) ||
- passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan, false, false));
+ if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+ {
+ Assert(!passes_quals && pstate.continuescan);
+ Assert(offnum > pstate.skip);
+
+ offnum = pstate.skip;
+ pstate.skip = InvalidOffsetNumber;
+ continue;
+ }
+
if (passes_quals && tuple_alive)
{
/* tuple passes all scan key conditions */
- haveFirstMatch = true;
+ pstate.firstmatch = true;
if (!BTreeTupleIsPosting(itup))
{
/* Remember it */
}
}
}
- if (!continuescan)
+ if (!pstate.continuescan)
{
/* there can't be any more matches, so stop */
so->currPos.moreLeft = false;
so->currPos.nextTupleOffset);
so->markPos.itemIndex = so->markItemIndex;
so->markItemIndex = -1;
+
+ /*
+ * If we're just about to start the next primitive index scan
+	 * (possible with a scan that has array keys, and needs to skip to
+ * continue in the current scan direction), moreLeft/moreRight only
+ * indicate the end of the current primitive index scan. They must
+ * never be taken to indicate that the top-level index scan has ended
+ * (that would be wrong).
+ *
+ * We could handle this case by treating the current array keys as
+ * markPos state. But depending on the current array state like this
+ * would add complexity. Instead, we just unset markPos's copy of
+ * moreRight or moreLeft (whichever might be affected), while making
+ * btrestpos reset the scan's arrays to their initial scan positions.
+ * In effect, btrestpos leaves advancing the arrays up to the first
+ * _bt_readpage call (that takes place after it has restored markPos).
+ */
+ Assert(so->markPos.dir == dir);
+ if (so->needPrimScan)
+ {
+ if (ScanDirectionIsForward(dir))
+ so->markPos.moreRight = true;
+ else
+ so->markPos.moreLeft = true;
+ }
}
if (ScanDirectionIsForward(dir))
* Seize the scan to get the next block number; if the scan has
* ended already, bail out.
*/
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
if (!status)
{
/* release the previous buffer, if pinned */
* Seize the scan to get the current block number; if the scan has
* ended already, bail out.
*/
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
BTScanPosUnpinIfPinned(so->currPos);
if (!status)
{
if (scan->parallel_scan != NULL)
{
_bt_relbuf(rel, so->currPos.buf);
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
if (!status)
{
BTScanPosInvalidate(so->currPos);
if (scan->parallel_scan != NULL)
{
_bt_relbuf(rel, so->currPos.buf);
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
if (!status)
{
BTScanPosInvalidate(so->currPos);
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Assert(!so->needPrimScan);
+
_bt_initialize_more_data(so, dir);
if (!_bt_readnextpage(scan, blkno, dir))
}
/*
- * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
- * for scan direction
+ * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir
+ * from currPos
*/
static inline void
_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
{
- /* initialize moreLeft/moreRight appropriately for scan direction */
- if (ScanDirectionIsForward(dir))
+ so->currPos.dir = dir;
+ if (so->needPrimScan)
+ {
+ Assert(so->numArrayKeys);
+
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = true;
+ so->needPrimScan = false;
+ }
+ else if (ScanDirectionIsForward(dir))
{
so->currPos.moreLeft = false;
so->currPos.moreRight = true;
#include "utils/memutils.h"
#include "utils/rel.h"
+#define LOOK_AHEAD_REQUIRED_RECHECKS 3
+#define LOOK_AHEAD_DEFAULT_DISTANCE 5
typedef struct BTSortArrayContext
{
- FmgrInfo flinfo;
+ FmgrInfo *sortproc;
Oid collation;
bool reverse;
} BTSortArrayContext;
+typedef struct BTScanKeyPreproc
+{
+ ScanKey skey;
+ int ikey;
+ int arrayidx;
+} BTScanKeyPreproc;
+
+static void _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype,
+ FmgrInfo *orderproc, FmgrInfo **sortprocp);
static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
- StrategyNumber strat,
+ Oid elemtype, StrategyNumber strat,
Datum *elems, int nelems);
-static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
- bool reverse,
- Datum *elems, int nelems);
+static int _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc,
+ bool reverse, Datum *elems, int nelems);
+static bool _bt_merge_arrays(IndexScanDesc scan, ScanKey skey,
+ FmgrInfo *sortproc, bool reverse,
+ Oid origelemtype, Oid nextelemtype,
+ Datum *elems_orig, int *nelems_orig,
+ Datum *elems_next, int nelems_next);
+static bool _bt_compare_array_scankey_args(IndexScanDesc scan,
+ ScanKey arraysk, ScanKey skey,
+ FmgrInfo *orderproc, BTArrayKeyInfo *array,
+ bool *qual_ok);
+static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan);
+static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
static int _bt_compare_array_elements(const void *a, const void *b, void *arg);
+static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc,
+ Datum tupdatum, bool tupnull,
+ Datum arrdatum, ScanKey cur);
+static int _bt_binsrch_array_skey(FmgrInfo *orderproc,
+ bool cur_elem_trig, ScanDirection dir,
+ Datum tupdatum, bool tupnull,
+ BTArrayKeyInfo *array, ScanKey cur,
+ int32 *set_elem_result);
+static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir);
+static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
+ IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
+ bool readpagetup, int sktrig, bool *scanBehind);
+static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
+ IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+ int sktrig, bool sktrig_required);
+#ifdef USE_ASSERT_CHECKING
+static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
+#endif
static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
ScanKey leftarg, ScanKey rightarg,
+ BTArrayKeyInfo *array, FmgrInfo *orderproc,
bool *result);
static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
static void _bt_mark_scankey_required(ScanKey skey);
+static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
+ IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+ bool advancenonrequired, bool prechecked, bool firstmatch,
+ bool *continuescan, int *ikey);
static bool _bt_check_rowcompare(ScanKey skey,
IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
ScanDirection dir, bool *continuescan);
+static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
+ int tupnatts, TupleDesc tupdesc);
static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
IndexTuple firstright, BTScanInsert itup_key);
*
* If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
* set up BTArrayKeyInfo info for each one that is an equality-type key.
- * Prepare modified scan keys in so->arrayKeyData, which will hold the current
- * array elements during each primitive indexscan operation. For inequality
- * array keys, it's sufficient to find the extreme element value and replace
- * the whole array with that scalar value.
- *
- * Note: the reason we need so->arrayKeyData, rather than just scribbling
- * on scan->keyData, is that callers are permitted to call btrescan without
- * supplying a new set of scankey data.
+ * Returns modified scan keys as input for further, standard preprocessing.
+ *
+ * Currently we perform two kinds of preprocessing to deal with redundancies.
+ * For inequality array keys, it's sufficient to find the extreme element
+ * value and replace the whole array with that scalar value. This eliminates
+ * all but one array element as redundant. Similarly, we are capable of
+ * "merging together" multiple equality array keys (from two or more input
+ * scan keys) into a single output scan key containing only the intersecting
+ * array elements. This can eliminate many redundant array elements, as well
+ * as eliminating whole array scan keys as redundant. It can also allow us to
+ * detect contradictory quals.
+ *
+ * It is convenient for _bt_preprocess_keys's caller to have to deal with no
+ * more than one equality strategy array scan key per index attribute. We'll
+ * always be able to set things up that way when complete opfamilies are used.
+ * Eliminated array scan keys can be recognized as those that have had their
+ * sk_strategy field set to InvalidStrategy here by us. Caller should avoid
+ * including these in the scan's so->keyData[] output array.
+ *
+ * We set the scan key references from the scan's BTArrayKeyInfo info array to
+ * offsets into the temp modified input array returned to caller. Scans that
+ * have array keys should call _bt_preprocess_array_keys_final when standard
+ * preprocessing steps are complete. This will convert the scan key offset
+ * references into references to the scan's so->keyData[] output scan keys.
+ *
+ * Note: the reason we need to return a temp scan key array, rather than just
+ * scribbling on scan->keyData, is that callers are permitted to call btrescan
+ * without supplying a new set of scankey data.
*/
-void
+static ScanKey
_bt_preprocess_array_keys(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel = scan->indexRelation;
int numberOfKeys = scan->numberOfKeys;
- int16 *indoption = scan->indexRelation->rd_indoption;
+ int16 *indoption = rel->rd_indoption;
int numArrayKeys;
+ int origarrayatt = InvalidAttrNumber,
+ origarraykey = -1;
+ Oid origelemtype = InvalidOid;
ScanKey cur;
- int i;
MemoryContext oldContext;
+ ScanKey arrayKeyData; /* modified copy of scan->keyData */
+
+ Assert(numberOfKeys);
/* Quick check to see if there are any array keys */
numArrayKeys = 0;
- for (i = 0; i < numberOfKeys; i++)
+ for (int i = 0; i < numberOfKeys; i++)
{
cur = &scan->keyData[i];
if (cur->sk_flags & SK_SEARCHARRAY)
/* If any arrays are null as a whole, we can quit right now. */
if (cur->sk_flags & SK_ISNULL)
{
- so->numArrayKeys = -1;
- so->arrayKeyData = NULL;
- return;
+ so->qual_ok = false;
+ return NULL;
}
}
}
/* Quit if nothing to do. */
if (numArrayKeys == 0)
- {
- so->numArrayKeys = 0;
- so->arrayKeyData = NULL;
- return;
- }
+ return NULL;
/*
* Make a scan-lifespan context to hold array-associated data, or reset it
oldContext = MemoryContextSwitchTo(so->arrayContext);
/* Create modifiable copy of scan->keyData in the workspace context */
- so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
- memcpy(so->arrayKeyData,
- scan->keyData,
- scan->numberOfKeys * sizeof(ScanKeyData));
+ arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData));
+ memcpy(arrayKeyData, scan->keyData, numberOfKeys * sizeof(ScanKeyData));
/* Allocate space for per-array data in the workspace context */
- so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));
+ so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo));
+
+ /* Allocate space for ORDER procs used to help _bt_checkkeys */
+ so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo));
/* Now process each array key */
numArrayKeys = 0;
- for (i = 0; i < numberOfKeys; i++)
+ for (int i = 0; i < numberOfKeys; i++)
{
+ FmgrInfo sortproc;
+ FmgrInfo *sortprocp = &sortproc;
+ Oid elemtype;
+ bool reverse;
ArrayType *arrayval;
int16 elmlen;
bool elmbyval;
int num_nonnulls;
int j;
- cur = &so->arrayKeyData[i];
+ cur = &arrayKeyData[i];
if (!(cur->sk_flags & SK_SEARCHARRAY))
continue;
/* If there's no non-nulls, the scan qual is unsatisfiable */
if (num_nonnulls == 0)
{
- numArrayKeys = -1;
+ so->qual_ok = false;
break;
}
+ /*
+ * Determine the nominal datatype of the array elements. We have to
+ * support the convention that sk_subtype == InvalidOid means the
+ * opclass input type; this is a hack to simplify life for
+ * ScanKeyInit().
+ */
+ elemtype = cur->sk_subtype;
+ if (elemtype == InvalidOid)
+ elemtype = rel->rd_opcintype[cur->sk_attno - 1];
+ Assert(elemtype == ARR_ELEMTYPE(arrayval));
+
/*
* If the comparison operator is not equality, then the array qual
* degenerates to a simple comparison against the smallest or largest
case BTLessStrategyNumber:
case BTLessEqualStrategyNumber:
cur->sk_argument =
- _bt_find_extreme_element(scan, cur,
+ _bt_find_extreme_element(scan, cur, elemtype,
BTGreaterStrategyNumber,
elem_values, num_nonnulls);
continue;
case BTGreaterEqualStrategyNumber:
case BTGreaterStrategyNumber:
cur->sk_argument =
- _bt_find_extreme_element(scan, cur,
+ _bt_find_extreme_element(scan, cur, elemtype,
BTLessStrategyNumber,
elem_values, num_nonnulls);
continue;
break;
}
+ /*
+ * We'll need a 3-way ORDER proc to perform binary searches for the
+ * next matching array element. Set that up now.
+ *
+ * Array scan keys with cross-type equality operators will require a
+ * separate same-type ORDER proc for sorting their array. Otherwise,
+ * sortproc just points to the same proc used during binary searches.
+ */
+ _bt_setup_array_cmp(scan, cur, elemtype,
+ &so->orderProcs[i], &sortprocp);
+
/*
* Sort the non-null elements and eliminate any duplicates. We must
* sort in the same ordering used by the index column, so that the
- * successive primitive indexscans produce data in index order.
+ * arrays can be advanced in lockstep with the scan's progress through
+ * the index's key space.
*/
- num_elems = _bt_sort_array_elements(scan, cur,
- (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
+ reverse = (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0;
+ num_elems = _bt_sort_array_elements(cur, sortprocp, reverse,
elem_values, num_nonnulls);
+ if (origarrayatt == cur->sk_attno)
+ {
+ BTArrayKeyInfo *orig = &so->arrayKeys[origarraykey];
+
+ /*
+ * This array scan key is redundant with a previous equality
+ * operator array scan key. Merge the two arrays together to
+ * eliminate contradictory non-intersecting elements (or try to).
+ *
+ * We merge this next array back into the attribute's original array.
+ */
+ Assert(arrayKeyData[orig->scan_key].sk_attno == cur->sk_attno);
+ Assert(arrayKeyData[orig->scan_key].sk_collation ==
+ cur->sk_collation);
+ if (_bt_merge_arrays(scan, cur, sortprocp, reverse,
+ origelemtype, elemtype,
+ orig->elem_values, &orig->num_elems,
+ elem_values, num_elems))
+ {
+ /* Successfully eliminated this array */
+ pfree(elem_values);
+
+