From 79fa7b3b1a449680d9e51624834fbb4b32208659 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Wed, 16 Oct 2024 12:17:49 -0400
Subject: [PATCH] Normalize nbtree truncated high key array behavior.

Commit 5bf748b8 taught nbtree ScalarArrayOp index scans to decide when and
how to start the next primitive index scan based on physical index
characteristics.  This included rules for deciding whether to start a new
primitive index scan (or whether to move onto the right sibling leaf page
instead) that specifically consider truncated lower-order columns (-inf
columns) from leaf page high keys.

These omitted columns were treated as satisfying the scan's required scan
keys, though only for scan keys marked required in the current scan
direction (forward).  Scan keys that didn't get this behavior (those marked
required in the backwards direction only) usually didn't give the scan
reasonable cause to reposition itself to a later leaf page (via another
descent of the index in _bt_first), but _bt_advance_array_keys would
nevertheless always give up by forcing another call to _bt_first.

_bt_advance_array_keys was unwilling to allow the scan to continue onto the
next leaf page, to reconsider whether we really should start another
primitive scan based on the details of the sibling page's tuples.  This
didn't match its behavior with similar cases involving keys required in the
current scan direction (forward), which seems unprincipled.  It led to an
excessive number of primitive scans/index descents for queries with a
higher-order = array scan key (with dense, contiguous values) mixed with a
lower-order required > or >= scan key.

Bring > and >= strategy scan keys in line with other required scan key
types: treat truncated -inf scan keys as having satisfied scan keys
required in either scan direction (forwards and backwards alike) during
array advancement.  That way affected scans can continue to the right
sibling leaf page.  Advancement must now schedule an explicit recheck of
the right sibling page's high key in cases involving > or >= scan keys.
The recheck gives the scan a way to back out and start another primitive
index scan (we can't just rely on _bt_checkkeys with > or >= scan keys).

This work can be considered a standalone optimization on top of the work
from commit 5bf748b8.  But it was written in preparation for an upcoming
patch that will add skip scan to nbtree.  In practice, scans that use
"skip arrays" will tend to be much more sensitive to any implementation
deficiencies in this area.
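
For illustration, a query of the affected shape might look like the
following (hypothetical table, index, and values; not taken from this
patch or from the regression tests):

    -- assumes a composite index along the lines of:
    --   CREATE INDEX ON tab (a, b);
    SELECT *
    FROM tab
    WHERE a = ANY ('{1, 2, 3, 4, 5}'::int[])  -- higher-order = array key
      AND b >= 500000;                        -- lower-order required >= key

Previously, a scan like this could end up performing a separate primitive
index scan (another descent of the index in _bt_first) for each array
element, even when all of the matching tuples were located together on a
small number of neighboring leaf pages.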

Author: Peter Geoghegan
Reviewed-By: Tomas Vondra
Discussion: https://postgr.es/m/CAH2-Wz=9A_UtM7HzUThSkQ+BcrQsQZuNhWOvQWK06PRkEp=SKQ@mail.gmail.com
---
 src/backend/access/nbtree/nbtree.c    |   4 +
 src/backend/access/nbtree/nbtsearch.c |  25 ++++++
 src/backend/access/nbtree/nbtutils.c  | 119 ++++++++++++++------------
 src/include/access/nbtree.h           |   3 +
 4 files changed, 95 insertions(+), 56 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 56e502c4fc..9b968aa0f2 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -332,6 +332,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 
 	so->needPrimScan = false;
 	so->scanBehind = false;
+	so->oppositeDirCheck = false;
 	so->arrayKeys = NULL;
 	so->orderProcs = NULL;
 	so->arrayContext = NULL;
@@ -375,6 +376,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	so->markItemIndex = -1;
 	so->needPrimScan = false;
 	so->scanBehind = false;
+	so->oppositeDirCheck = false;
 	BTScanPosUnpinIfPinned(so->markPos);
 	BTScanPosInvalidate(so->markPos);
 
@@ -618,6 +620,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 		 */
 		so->needPrimScan = false;
 		so->scanBehind = false;
+		so->oppositeDirCheck = false;
 	}
 	else
 	{
@@ -676,6 +679,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 			 */
 			so->needPrimScan = true;
 			so->scanBehind = false;
+			so->oppositeDirCheck = false;
 		}
 		else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
 		{
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fff7c89ead..91ac6533f6 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1703,6 +1703,31 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 		ItemId		iid = PageGetItemId(page, P_HIKEY);
 
 		pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+
+		if (unlikely(so->oppositeDirCheck))
+		{
+			Assert(so->scanBehind);
+
+			/*
+			 * Last _bt_readpage call scheduled a recheck of finaltup for
+			 * required scan keys up to and including a > or >= scan key.
+			 *
+			 * _bt_checkkeys won't consider the scanBehind flag unless the
+			 * scan is stopped by a scan key required in the current scan
+			 * direction.  We need this recheck so that we'll notice when
+			 * all tuples on this page are still before the _bt_first-wise
+			 * start of matches for the current set of array keys.
+			 */
+			if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
+			{
+				/* Schedule another primitive index scan after all */
+				so->currPos.moreRight = false;
+				so->needPrimScan = true;
+				return false;
+			}
+
+			/* Deliberately don't unset scanBehind flag just yet */
+		}
 	}
 
 	/* load items[] in ascending order */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index b4ba51357a..61ded00ca2 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1371,7 +1371,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
 			curArrayKey->cur_elem = 0;
 		skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
 	}
-	so->scanBehind = false;
+	so->scanBehind = so->oppositeDirCheck = false;	/* reset */
 }
 
 /*
@@ -1680,8 +1680,7 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
 
 	Assert(so->numArrayKeys);
 
-	/* scanBehind flag doesn't persist across primitive index scans - reset */
-	so->scanBehind = false;
+	so->scanBehind = so->oppositeDirCheck = false;	/* reset */
 
 	/*
 	 * Array keys are advanced within _bt_checkkeys when the scan reaches the
@@ -1817,7 +1816,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 		Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
 											 tupnatts, false, 0, NULL));
 
-		so->scanBehind = false; /* reset */
+		so->scanBehind = so->oppositeDirCheck = false;	/* reset */
 
 		/*
 		 * Required scan key wasn't satisfied, so required arrays will have to
@@ -2302,19 +2301,22 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	if (so->scanBehind && has_required_opposite_direction_only)
 	{
 		/*
-		 * However, we avoid this behavior whenever the scan involves a scan
+		 * However, we need to work harder whenever the scan involves a scan
 		 * key required in the opposite direction to the scan only, along with
 		 * a finaltup with at least one truncated attribute that's associated
 		 * with a scan key marked required (required in either direction).
 		 *
 		 * _bt_check_compare simply won't stop the scan for a scan key that's
 		 * marked required in the opposite scan direction only.  That leaves
-		 * us without any reliable way of reconsidering any opposite-direction
+		 * us without an automatic way of reconsidering any opposite-direction
 		 * inequalities if it turns out that starting a new primitive index
-		 * scan will allow _bt_first to skip ahead by a great many leaf pages
-		 * (see next section for details of how that works).
+		 * scan will allow _bt_first to skip ahead by a great many leaf pages.
+		 *
+		 * We deal with this by explicitly scheduling a finaltup recheck on
+		 * the right sibling page.  _bt_readpage calls _bt_oppodir_checkkeys
+		 * for next page's finaltup (and we skip it for this page's finaltup).
 		 */
-		goto new_prim_scan;
+		so->oppositeDirCheck = true;	/* recheck next page's high key */
 	}
 
 	/*
@@ -2345,61 +2347,24 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	 * similar "deduce NOT NULL" rule, where it finishes its insertion scan
 	 * key by consing up an explicit SK_SEARCHNOTNULL key.)
 	 *
-	 * Apply a test against finaltup to detect and recover from these problem:
+	 * Apply a test against finaltup to detect and recover from the problem:
 	 * if even finaltup doesn't satisfy such an inequality, we just skip by
 	 * starting a new primitive index scan.  When we skip, we know for sure
 	 * that all of the tuples on the current page following caller's tuple are
 	 * also before the _bt_first-wise start of tuples for our new qual.  That
 	 * at least suggests many more skippable pages beyond the current page.
+	 * (When so->oppositeDirCheck was set, this'll happen on the next page.)
 	 */
-	if (has_required_opposite_direction_only && pstate->finaltup &&
-		(all_required_satisfied || oppodir_inequality_sktrig))
+	else if (has_required_opposite_direction_only && pstate->finaltup &&
+			 (all_required_satisfied || oppodir_inequality_sktrig) &&
+			 unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
 	{
-		int			nfinaltupatts = BTreeTupleGetNAtts(pstate->finaltup, rel);
-		ScanDirection flipped;
-		bool		continuescanflip;
-		int			opsktrig;
-
 		/*
-		 * We're checking finaltup (which is usually not caller's tuple), so
-		 * cannot reuse work from caller's earlier _bt_check_compare call.
-		 *
-		 * Flip the scan direction when calling _bt_check_compare this time,
-		 * so that it will set continuescanflip=false when it encounters an
-		 * inequality required in the opposite scan direction.
+		 * Make sure that any non-required arrays are set to the first array
+		 * element for the current scan direction
 		 */
-		Assert(!so->scanBehind);
-		opsktrig = 0;
-		flipped = -dir;
-		_bt_check_compare(scan, flipped,
-						  pstate->finaltup, nfinaltupatts, tupdesc,
-						  false, false, false,
-						  &continuescanflip, &opsktrig);
-
-		/*
-		 * Only start a new primitive index scan when finaltup has a required
-		 * unsatisfied inequality (unsatisfied in the opposite direction)
-		 */
-		Assert(all_required_satisfied != oppodir_inequality_sktrig);
-		if (unlikely(!continuescanflip &&
-					 so->keyData[opsktrig].sk_strategy != BTEqualStrategyNumber))
-		{
-			/*
-			 * It's possible for the same inequality to be unsatisfied by both
-			 * caller's tuple (in scan's direction) and finaltup (in the
-			 * opposite direction) due to _bt_check_compare's behavior with
-			 * NULLs
-			 */
-			Assert(opsktrig >= sktrig); /* not opsktrig > sktrig due to NULLs */
-
-			/*
-			 * Make sure that any non-required arrays are set to the first
-			 * array element for the current scan direction
-			 */
-			_bt_rewind_nonrequired_arrays(scan, dir);
-
-			goto new_prim_scan;
-		}
+		_bt_rewind_nonrequired_arrays(scan, dir);
+		goto new_prim_scan;
 	}
 
 	/*
@@ -3511,7 +3476,8 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 		 *
 		 * Assert that the scan isn't in danger of becoming confused.
 		 */
-		Assert(!so->scanBehind && !pstate->prechecked && !pstate->firstmatch);
+		Assert(!so->scanBehind && !so->oppositeDirCheck);
+		Assert(!pstate->prechecked && !pstate->firstmatch);
 		Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
 											 tupnatts, false, 0, NULL));
 	}
@@ -3623,6 +3589,47 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 							 ikey, true);
 }
 
+/*
+ * Test whether an indextuple fails to satisfy an inequality required in the
+ * opposite direction only.
+ *
+ * Caller's finaltup tuple is the page high key (for forwards scans), or the
+ * first non-pivot tuple (for backwards scans).  Called during scans with
+ * required array keys and required opposite-direction inequalities.
+ *
+ * Returns false if an inequality scan key required in the opposite direction
+ * only isn't satisfied (and any earlier required scan keys are satisfied).
+ * Otherwise returns true.
+ *
+ * An unsatisfied inequality required in the opposite direction only might
+ * well enable skipping over many leaf pages, provided another _bt_first call
+ * takes place.  This type of unsatisfied inequality won't usually cause
+ * _bt_checkkeys to stop the scan to consider array advancement/starting a new
+ * primitive index scan.
+ */
+bool
+_bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
+					  IndexTuple finaltup)
+{
+	Relation	rel = scan->indexRelation;
+	TupleDesc	tupdesc = RelationGetDescr(rel);
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	int			nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel);
+	bool		continuescan;
+	ScanDirection flipped = -dir;
+	int			ikey = 0;
+
+	Assert(so->numArrayKeys);
+
+	_bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc,
+					  false, false, false, &continuescan, &ikey);
+
+	if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber)
+		return false;
+
+	return true;
+}
+
 /*
  * Test whether an indextuple satisfies current scan condition.
  *
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index d64300fb97..90a68375a2 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1048,6 +1048,7 @@ typedef struct BTScanOpaqueData
 	int			numArrayKeys;	/* number of equality-type array keys */
 	bool		needPrimScan;	/* New prim scan to continue in current dir? */
 	bool		scanBehind;		/* Last array advancement matched -inf attr? */
+	bool		oppositeDirCheck;	/* explicit scanBehind recheck needed? */
 	BTArrayKeyInfo *arrayKeys;	/* info about each equality-type array key */
 	FmgrInfo   *orderProcs;		/* ORDER procs for required equality keys */
 	MemoryContext arrayContext; /* scan-lifespan context for array data */
@@ -1289,6 +1290,8 @@ extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 						  IndexTuple tuple, int tupnatts);
+extern bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
+								  IndexTuple finaltup);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
-- 
2.30.2