LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtree.c (source / functions)
Test: PostgreSQL 18devel
Date: 2025-05-02 18:15:31
                 Hit   Total   Coverage
Lines:           484     556     87.1 %
Functions:        26      28     92.9 %
Legend: in the "Line data" column, a number is that line's execution count; 0 marks an executable line that was never hit, and a blank count marks a non-executable line.

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtree.c
       4             :  *    Implementation of Lehman and Yao's btree management algorithm for
       5             :  *    Postgres.
       6             :  *
       7             :  * NOTES
       8             :  *    This file contains only the public interface routines.
       9             :  *
      10             :  *
      11             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      12             :  * Portions Copyright (c) 1994, Regents of the University of California
      13             :  *
      14             :  * IDENTIFICATION
      15             :  *    src/backend/access/nbtree/nbtree.c
      16             :  *
      17             :  *-------------------------------------------------------------------------
      18             :  */
      19             : #include "postgres.h"
      20             : 
      21             : #include "access/nbtree.h"
      22             : #include "access/relscan.h"
      23             : #include "access/stratnum.h"
      24             : #include "commands/progress.h"
      25             : #include "commands/vacuum.h"
      26             : #include "nodes/execnodes.h"
      27             : #include "pgstat.h"
      28             : #include "storage/bulk_write.h"
      29             : #include "storage/condition_variable.h"
      30             : #include "storage/indexfsm.h"
      31             : #include "storage/ipc.h"
      32             : #include "storage/lmgr.h"
      33             : #include "storage/read_stream.h"
      34             : #include "utils/datum.h"
      35             : #include "utils/fmgrprotos.h"
      36             : #include "utils/index_selfuncs.h"
      37             : #include "utils/memutils.h"
      38             : 
      39             : 
      40             : /*
      41             :  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
      42             :  *
      43             :  * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
      44             :  * scan to advance it via another call to _bt_first.
      45             :  *
      46             :  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
      47             :  * a new page; others must wait.
      48             :  *
      49             :  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
      50             :  * to a new page; some process can start doing that.
      51             :  *
      52             :  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
      53             :  */
      54             : typedef enum
      55             : {
      56             :     BTPARALLEL_NOT_INITIALIZED,
      57             :     BTPARALLEL_NEED_PRIMSCAN,
      58             :     BTPARALLEL_ADVANCING,
      59             :     BTPARALLEL_IDLE,
      60             :     BTPARALLEL_DONE,
      61             : } BTPS_State;
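
As a summary of the states above and of the _bt_parallel_* routines later in
this file, the transitions can be sketched as follows (a condensed reading of
the code, not text from nbtree.c itself):

    /*
     * Parallel-scan state machine (sketch):
     *
     *   NOT_INITIALIZED --_bt_parallel_seize--> ADVANCING
     *   ADVANCING --_bt_parallel_release--> IDLE
     *   IDLE --_bt_parallel_seize--> ADVANCING             (repeat per page)
     *   IDLE --_bt_parallel_primscan_schedule--> NEED_PRIMSCAN
     *   NEED_PRIMSCAN --_bt_parallel_seize(first=true)--> ADVANCING
     *   IDLE --_bt_parallel_done--> DONE                   (end of scan)
     */
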
      62             : 
      63             : /*
      64             :  * BTParallelScanDescData contains btree specific shared information required
      65             :  * for parallel scan.
      66             :  */
      67             : typedef struct BTParallelScanDescData
      68             : {
      69             :     BlockNumber btps_nextScanPage;  /* next page to be scanned */
      70             :     BlockNumber btps_lastCurrPage;  /* page whose sibling link was copied into
      71             :                                      * btps_nextScanPage */
      72             :     BTPS_State  btps_pageStatus;    /* indicates whether next page is
      73             :                                      * available for scan. see above for
      74             :                                      * possible states of parallel scan. */
      75             :     LWLock      btps_lock;      /* protects shared parallel state */
      76             :     ConditionVariable btps_cv;  /* used to synchronize parallel scan */
      77             : 
      78             :     /*
      79             :      * btps_arrElems is used when scans need to schedule another primitive
      80             :      * index scan with one or more SAOP arrays.  Holds BTArrayKeyInfo.cur_elem
      81             :      * offsets for each = scan key associated with a ScalarArrayOp array.
      82             :      */
      83             :     int         btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
      84             : 
      85             :     /*
      86             :      * Additional space (at the end of the struct) is used when scans need to
      87             :      * schedule another primitive index scan with one or more skip arrays.
      88             :      * Holds a flattened datum representation for each = scan key associated
      89             :      * with a skip array.
      90             :      */
      91             : }           BTParallelScanDescData;
      92             : 
      93             : typedef struct BTParallelScanDescData *BTParallelScanDesc;
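
Taken together with the struct comments above, the shared area has a fixed
header followed by a variable-size tail; a sketch of the layout assumed by
_bt_parallel_serialize_arrays and _bt_parallel_restore_arrays below:

    /*
     * Shared-memory layout (sketch):
     *
     *   +---------------------------------------------+
     *   | fixed fields: btps_nextScanPage,            |
     *   | btps_lastCurrPage, btps_pageStatus,         |
     *   | btps_lock, btps_cv                          |
     *   +---------------------------------------------+
     *   | btps_arrElems[0 .. numArrayKeys - 1]        |  SAOP cur_elem slots
     *   +---------------------------------------------+
     *   | per skip array: int sk_flags, then the      |  present only when
     *   | datumSerialize'd sk_argument (if any)       |  skip arrays are used
     *   +---------------------------------------------+
     */
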
      94             : 
      95             : 
      96             : static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
      97             :                                           BTScanOpaque so);
      98             : static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
      99             :                                         BTScanOpaque so);
     100             : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     101             :                          IndexBulkDeleteCallback callback, void *callback_state,
     102             :                          BTCycleId cycleid);
     103             : static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
     104             : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
     105             :                                           IndexTuple posting,
     106             :                                           OffsetNumber updatedoffset,
     107             :                                           int *nremaining);
     108             : 
     109             : 
     110             : /*
     111             :  * Btree handler function: return IndexAmRoutine with access method parameters
     112             :  * and callbacks.
     113             :  */
     114             : Datum
     115     3164704 : bthandler(PG_FUNCTION_ARGS)
     116             : {
     117     3164704 :     IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
     118             : 
     119     3164704 :     amroutine->amstrategies = BTMaxStrategyNumber;
     120     3164704 :     amroutine->amsupport = BTNProcs;
     121     3164704 :     amroutine->amoptsprocnum = BTOPTIONS_PROC;
     122     3164704 :     amroutine->amcanorder = true;
     123     3164704 :     amroutine->amcanorderbyop = false;
     124     3164704 :     amroutine->amcanhash = false;
     125     3164704 :     amroutine->amconsistentequality = true;
     126     3164704 :     amroutine->amconsistentordering = true;
     127     3164704 :     amroutine->amcanbackward = true;
     128     3164704 :     amroutine->amcanunique = true;
     129     3164704 :     amroutine->amcanmulticol = true;
     130     3164704 :     amroutine->amoptionalkey = true;
     131     3164704 :     amroutine->amsearcharray = true;
     132     3164704 :     amroutine->amsearchnulls = true;
     133     3164704 :     amroutine->amstorage = false;
     134     3164704 :     amroutine->amclusterable = true;
     135     3164704 :     amroutine->ampredlocks = true;
     136     3164704 :     amroutine->amcanparallel = true;
     137     3164704 :     amroutine->amcanbuildparallel = true;
     138     3164704 :     amroutine->amcaninclude = true;
     139     3164704 :     amroutine->amusemaintenanceworkmem = false;
     140     3164704 :     amroutine->amsummarizing = false;
     141     3164704 :     amroutine->amparallelvacuumoptions =
     142             :         VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
     143     3164704 :     amroutine->amkeytype = InvalidOid;
     144             : 
     145     3164704 :     amroutine->ambuild = btbuild;
     146     3164704 :     amroutine->ambuildempty = btbuildempty;
     147     3164704 :     amroutine->aminsert = btinsert;
     148     3164704 :     amroutine->aminsertcleanup = NULL;
     149     3164704 :     amroutine->ambulkdelete = btbulkdelete;
     150     3164704 :     amroutine->amvacuumcleanup = btvacuumcleanup;
     151     3164704 :     amroutine->amcanreturn = btcanreturn;
     152     3164704 :     amroutine->amcostestimate = btcostestimate;
     153     3164704 :     amroutine->amgettreeheight = btgettreeheight;
     154     3164704 :     amroutine->amoptions = btoptions;
     155     3164704 :     amroutine->amproperty = btproperty;
     156     3164704 :     amroutine->ambuildphasename = btbuildphasename;
     157     3164704 :     amroutine->amvalidate = btvalidate;
     158     3164704 :     amroutine->amadjustmembers = btadjustmembers;
     159     3164704 :     amroutine->ambeginscan = btbeginscan;
     160     3164704 :     amroutine->amrescan = btrescan;
     161     3164704 :     amroutine->amgettuple = btgettuple;
     162     3164704 :     amroutine->amgetbitmap = btgetbitmap;
     163     3164704 :     amroutine->amendscan = btendscan;
     164     3164704 :     amroutine->ammarkpos = btmarkpos;
     165     3164704 :     amroutine->amrestrpos = btrestrpos;
     166     3164704 :     amroutine->amestimateparallelscan = btestimateparallelscan;
     167     3164704 :     amroutine->aminitparallelscan = btinitparallelscan;
     168     3164704 :     amroutine->amparallelrescan = btparallelrescan;
     169     3164704 :     amroutine->amtranslatestrategy = bttranslatestrategy;
     170     3164704 :     amroutine->amtranslatecmptype = bttranslatecmptype;
     171             : 
     172     3164704 :     PG_RETURN_POINTER(amroutine);
     173             : }
     174             : 
     175             : /*
     176             :  *  btbuildempty() -- build an empty btree index in the initialization fork
     177             :  */
     178             : void
     179         164 : btbuildempty(Relation index)
     180             : {
     181         164 :     bool        allequalimage = _bt_allequalimage(index, false);
     182             :     BulkWriteState *bulkstate;
     183             :     BulkWriteBuffer metabuf;
     184             : 
     185         164 :     bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
     186             : 
     187             :     /* Construct metapage. */
     188         164 :     metabuf = smgr_bulk_get_buf(bulkstate);
     189         164 :     _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
     190         164 :     smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
     191             : 
     192         164 :     smgr_bulk_finish(bulkstate);
     193         164 : }
     194             : 
     195             : /*
     196             :  *  btinsert() -- insert an index tuple into a btree.
     197             :  *
     198             :  *      Descend the tree recursively, find the appropriate location for our
     199             :  *      new tuple, and put it there.
     200             :  */
     201             : bool
     202     7218114 : btinsert(Relation rel, Datum *values, bool *isnull,
     203             :          ItemPointer ht_ctid, Relation heapRel,
     204             :          IndexUniqueCheck checkUnique,
     205             :          bool indexUnchanged,
     206             :          IndexInfo *indexInfo)
     207             : {
     208             :     bool        result;
     209             :     IndexTuple  itup;
     210             : 
     211             :     /* generate an index tuple */
     212     7218114 :     itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
     213     7218114 :     itup->t_tid = *ht_ctid;
     214             : 
     215     7218114 :     result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
     216             : 
     217     7217602 :     pfree(itup);
     218             : 
     219     7217602 :     return result;
     220             : }
     221             : 
     222             : /*
     223             :  *  btgettuple() -- Get the next tuple in the scan.
     224             :  */
     225             : bool
     226    34429268 : btgettuple(IndexScanDesc scan, ScanDirection dir)
     227             : {
     228    34429268 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     229             :     bool        res;
     230             : 
     231             :     /* btree indexes are never lossy */
     232    34429268 :     scan->xs_recheck = false;
     233             : 
     234             :     /* Each loop iteration performs another primitive index scan */
     235             :     do
     236             :     {
     237             :         /*
     238             :          * If we've already initialized this scan, we can just advance it in
     239             :          * the appropriate direction.  If we haven't done so yet, we call
     240             :          * _bt_first() to get the first item in the scan.
     241             :          */
     242    34446172 :         if (!BTScanPosIsValid(so->currPos))
     243    15233746 :             res = _bt_first(scan, dir);
     244             :         else
     245             :         {
     246             :             /*
     247             :              * Check to see if we should kill the previously-fetched tuple.
     248             :              */
     249    19212426 :             if (scan->kill_prior_tuple)
     250             :             {
     251             :                 /*
     252             :                  * Yes, remember it for later. (We'll deal with all such
     253             :                  * tuples at once right before leaving the index page.)  The
     254             :                  * test for numKilled overrun is not just paranoia: if the
     255             :                  * caller reverses direction in the indexscan then the same
     256             :                  * item might get entered multiple times. It's not worth
     257             :                  * trying to optimize that, so we don't detect it, but instead
     258             :                  * just forget any excess entries.
     259             :                  */
     260      423542 :                 if (so->killedItems == NULL)
     261      166664 :                     so->killedItems = (int *)
     262      166664 :                         palloc(MaxTIDsPerBTreePage * sizeof(int));
     263      423542 :                 if (so->numKilled < MaxTIDsPerBTreePage)
     264      423542 :                     so->killedItems[so->numKilled++] = so->currPos.itemIndex;
     265             :             }
     266             : 
     267             :             /*
     268             :              * Now continue the scan.
     269             :              */
     270    19212426 :             res = _bt_next(scan, dir);
     271             :         }
     272             : 
     273             :         /* If we have a tuple, return it ... */
     274    34446172 :         if (res)
     275    27515288 :             break;
     276             :         /* ... otherwise see if we need another primitive index scan */
     277     6930884 :     } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
     278             : 
     279    34429268 :     return res;
     280             : }
     281             : 
     282             : /*
     283             :  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
     284             :  */
     285             : int64
     286       13992 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
     287             : {
     288       13992 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     289       13992 :     int64       ntids = 0;
     290             :     ItemPointer heapTid;
     291             : 
     292             :     /* Each loop iteration performs another primitive index scan */
     293             :     do
     294             :     {
     295             :         /* Fetch the first page & tuple */
     296       14986 :         if (_bt_first(scan, ForwardScanDirection))
     297             :         {
     298             :             /* Save tuple ID, and continue scanning */
     299       10512 :             heapTid = &scan->xs_heaptid;
     300       10512 :             tbm_add_tuples(tbm, heapTid, 1, false);
     301       10512 :             ntids++;
     302             : 
     303             :             for (;;)
     304             :             {
     305             :                 /*
     306             :                  * Advance to next tuple within page.  This is the same as the
     307             :                  * easy case in _bt_next().
     308             :                  */
     309     2049984 :                 if (++so->currPos.itemIndex > so->currPos.lastItem)
     310             :                 {
     311             :                     /* let _bt_next do the heavy lifting */
     312       16204 :                     if (!_bt_next(scan, ForwardScanDirection))
     313       10512 :                         break;
     314             :                 }
     315             : 
     316             :                 /* Save tuple ID, and continue scanning */
     317     2039472 :                 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
     318     2039472 :                 tbm_add_tuples(tbm, heapTid, 1, false);
     319     2039472 :                 ntids++;
     320             :             }
     321             :         }
     322             :         /* Now see if we need another primitive index scan */
     323       14986 :     } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
     324             : 
     325       13992 :     return ntids;
     326             : }
     327             : 
     328             : /*
     329             :  *  btbeginscan() -- start a scan on a btree index
     330             :  */
     331             : IndexScanDesc
     332    14526764 : btbeginscan(Relation rel, int nkeys, int norderbys)
     333             : {
     334             :     IndexScanDesc scan;
     335             :     BTScanOpaque so;
     336             : 
     337             :     /* no order by operators allowed */
     338             :     Assert(norderbys == 0);
     339             : 
     340             :     /* get the scan */
     341    14526764 :     scan = RelationGetIndexScan(rel, nkeys, norderbys);
     342             : 
     343             :     /* allocate private workspace */
     344    14526764 :     so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
     345    14526764 :     BTScanPosInvalidate(so->currPos);
     346    14526764 :     BTScanPosInvalidate(so->markPos);
     347    14526764 :     if (scan->numberOfKeys > 0)
     348    14513558 :         so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
     349             :     else
     350       13206 :         so->keyData = NULL;
     351             : 
     352    14526764 :     so->skipScan = false;
     353    14526764 :     so->needPrimScan = false;
     354    14526764 :     so->scanBehind = false;
     355    14526764 :     so->oppositeDirCheck = false;
     356    14526764 :     so->arrayKeys = NULL;
     357    14526764 :     so->orderProcs = NULL;
     358    14526764 :     so->arrayContext = NULL;
     359             : 
     360    14526764 :     so->killedItems = NULL;      /* until needed */
     361    14526764 :     so->numKilled = 0;
     362             : 
     363             :     /*
     364             :      * We don't know yet whether the scan will be index-only, so we do not
     365             :      * allocate the tuple workspace arrays until btrescan.  However, we set up
     366             :      * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     367             :      */
     368    14526764 :     so->currTuples = so->markTuples = NULL;
     369             : 
     370    14526764 :     scan->xs_itupdesc = RelationGetDescr(rel);
     371             : 
     372    14526764 :     scan->opaque = so;
     373             : 
     374    14526764 :     return scan;
     375             : }
     376             : 
     377             : /*
     378             :  *  btrescan() -- rescan an index relation
     379             :  */
     380             : void
     381    15232104 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
     382             :          ScanKey orderbys, int norderbys)
     383             : {
     384    15232104 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     385             : 
     386             :     /* we aren't holding any read locks, but gotta drop the pins */
     387    15232104 :     if (BTScanPosIsValid(so->currPos))
     388             :     {
     389             :         /* Before leaving current page, deal with any killed items */
     390       96294 :         if (so->numKilled > 0)
     391        1064 :             _bt_killitems(scan);
     392       96294 :         BTScanPosUnpinIfPinned(so->currPos);
     393       96294 :         BTScanPosInvalidate(so->currPos);
     394             :     }
     395             : 
     396    15232104 :     so->markItemIndex = -1;
     397    15232104 :     so->needPrimScan = false;
     398    15232104 :     so->scanBehind = false;
     399    15232104 :     so->oppositeDirCheck = false;
     400    15232104 :     BTScanPosUnpinIfPinned(so->markPos);
     401    15232104 :     BTScanPosInvalidate(so->markPos);
     402             : 
     403             :     /*
     404             :      * Allocate tuple workspace arrays, if needed for an index-only scan and
     405             :      * not already done in a previous rescan call.  To save on palloc
     406             :      * overhead, both workspaces are allocated as one palloc block; only this
     407             :      * function and btendscan know that.
     408             :      *
     409             :      * NOTE: this data structure also makes it safe to return data from a
     410             :      * "name" column, even though btree name_ops uses an underlying storage
     411             :      * datatype of cstring.  The risk there is that "name" is supposed to be
     412             :      * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
     413             :      * However, since we only return data out of tuples sitting in the
     414             :      * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
     415             :      * data out of the markTuples array --- running off the end of memory for
     416             :      * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
     417             :      * adding special-case treatment for name_ops elsewhere.
     418             :      */
     419    15232104 :     if (scan->xs_want_itup && so->currTuples == NULL)
     420             :     {
     421      135310 :         so->currTuples = (char *) palloc(BLCKSZ * 2);
     422      135310 :         so->markTuples = so->currTuples + BLCKSZ;
     423             :     }
     424             : 
     425             :     /*
     426             :      * Reset the scan keys
     427             :      */
     428    15232104 :     if (scankey && scan->numberOfKeys > 0)
     429    15218704 :         memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
     430    15232104 :     so->numberOfKeys = 0;        /* until _bt_preprocess_keys sets it */
     431    15232104 :     so->numArrayKeys = 0;        /* ditto */
     432    15232104 : }
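
The single-palloc workspace that btrescan allocates (and that btendscan
frees) has the following shape; a sketch of why markTuples must never be
pfree'd on its own:

    /*
     * Tuple workspace layout (sketch):
     *
     *   so->currTuples --> +----------------+
     *                      |  BLCKSZ bytes  |  tuple copies for currPos
     *   so->markTuples --> +----------------+
     *                      |  BLCKSZ bytes  |  tuple copies for markPos
     *                      +----------------+
     *
     * Both halves live in one palloc chunk, so freeing currTuples releases
     * markTuples too; only btrescan and btendscan rely on this.
     */
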
     433             : 
     434             : /*
     435             :  *  btendscan() -- close down a scan
     436             :  */
     437             : void
     438    14525214 : btendscan(IndexScanDesc scan)
     439             : {
     440    14525214 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     441             : 
     442             :     /* we aren't holding any read locks, but gotta drop the pins */
     443    14525214 :     if (BTScanPosIsValid(so->currPos))
     444             :     {
     445             :         /* Before leaving current page, deal with any killed items */
     446     8205456 :         if (so->numKilled > 0)
     447       88832 :             _bt_killitems(scan);
     448     8205456 :         BTScanPosUnpinIfPinned(so->currPos);
     449             :     }
     450             : 
     451    14525214 :     so->markItemIndex = -1;
     452    14525214 :     BTScanPosUnpinIfPinned(so->markPos);
     453             : 
     454             :     /* No need to invalidate positions, the RAM is about to be freed. */
     455             : 
     456             :     /* Release storage */
     457    14525214 :     if (so->keyData != NULL)
     458    14512038 :         pfree(so->keyData);
     459             :     /* so->arrayKeys and so->orderProcs are in arrayContext */
     460    14525214 :     if (so->arrayContext != NULL)
     461        4848 :         MemoryContextDelete(so->arrayContext);
     462    14525214 :     if (so->killedItems != NULL)
     463      166598 :         pfree(so->killedItems);
     464    14525214 :     if (so->currTuples != NULL)
     465      135266 :         pfree(so->currTuples);
     466             :     /* so->markTuples should not be pfree'd, see btrescan */
     467    14525214 :     pfree(so);
     468    14525214 : }
     469             : 
     470             : /*
     471             :  *  btmarkpos() -- save current scan position
     472             :  */
     473             : void
     474      130074 : btmarkpos(IndexScanDesc scan)
     475             : {
     476      130074 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     477             : 
     478             :     /* There may be an old mark with a pin (but no lock). */
     479      130074 :     BTScanPosUnpinIfPinned(so->markPos);
     480             : 
     481             :     /*
     482             :      * Just record the current itemIndex.  If we later step to next page
     483             :      * before releasing the marked position, _bt_steppage makes a full copy of
     484             :      * the currPos struct in markPos.  If (as often happens) the mark is moved
     485             :      * before we leave the page, we don't have to do that work.
     486             :      */
     487      130074 :     if (BTScanPosIsValid(so->currPos))
     488      130074 :         so->markItemIndex = so->currPos.itemIndex;
     489             :     else
     490             :     {
     491           0 :         BTScanPosInvalidate(so->markPos);
     492           0 :         so->markItemIndex = -1;
     493             :     }
     494      130074 : }
     495             : 
     496             : /*
     497             :  *  btrestrpos() -- restore scan to last saved position
     498             :  */
     499             : void
     500       54018 : btrestrpos(IndexScanDesc scan)
     501             : {
     502       54018 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     503             : 
     504       54018 :     if (so->markItemIndex >= 0)
     505             :     {
     506             :         /*
     507             :          * The scan has never moved to a new page since the last mark.  Just
     508             :          * restore the itemIndex.
     509             :          *
     510             :          * NB: In this case we can't count on anything in so->markPos to be
     511             :          * accurate.
     512             :          */
     513       53910 :         so->currPos.itemIndex = so->markItemIndex;
     514             :     }
     515             :     else
     516             :     {
     517             :         /*
     518             :          * The scan moved to a new page after last mark or restore, and we are
     519             :          * now restoring to the marked page.  We aren't holding any read
     520             :          * locks, but if we're still holding the pin for the current position,
     521             :          * we must drop it.
     522             :          */
     523         108 :         if (BTScanPosIsValid(so->currPos))
     524             :         {
     525             :             /* Before leaving current page, deal with any killed items */
     526         108 :             if (so->numKilled > 0)
     527           0 :                 _bt_killitems(scan);
     528         108 :             BTScanPosUnpinIfPinned(so->currPos);
     529             :         }
     530             : 
     531         108 :         if (BTScanPosIsValid(so->markPos))
     532             :         {
     533             :             /* bump pin on mark buffer for assignment to current buffer */
     534         108 :             if (BTScanPosIsPinned(so->markPos))
     535           0 :                 IncrBufferRefCount(so->markPos.buf);
     536         108 :             memcpy(&so->currPos, &so->markPos,
     537             :                    offsetof(BTScanPosData, items[1]) +
     538         108 :                    so->markPos.lastItem * sizeof(BTScanPosItem));
     539         108 :             if (so->currTuples)
     540           0 :                 memcpy(so->currTuples, so->markTuples,
     541           0 :                        so->markPos.nextTupleOffset);
     542             :             /* Reset the scan's array keys (see _bt_steppage for why) */
     543         108 :             if (so->numArrayKeys)
     544             :             {
     545           0 :                 _bt_start_array_keys(scan, so->currPos.dir);
     546           0 :                 so->needPrimScan = false;
     547             :             }
     548             :         }
     549             :         else
     550           0 :             BTScanPosInvalidate(so->currPos);
     551             :     }
     552       54018 : }
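
A caller-side sketch (assumed): mark/restore is driven through
index_markpos() and index_restrpos() from indexam.c, e.g. by merge join when
it may need to rescan the current group of equal inner-side keys.

    #include "postgres.h"

    #include "access/genam.h"

    static void
    example_mark_then_restore(IndexScanDesc scan)
    {
        if (index_getnext_tid(scan, ForwardScanDirection) == NULL)
            return;             /* nothing to mark */

        index_markpos(scan);    /* dispatches to btmarkpos */

        /* ... caller reads some number of tuples past the mark ... */

        index_restrpos(scan);   /* dispatches to btrestrpos */
    }
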
     553             : 
     554             : /*
     555             :  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
     556             :  */
     557             : Size
     558          64 : btestimateparallelscan(Relation rel, int nkeys, int norderbys)
     559             : {
     560          64 :     int16       nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     561             :     Size        estnbtreeshared,
     562             :                 genericattrspace;
     563             : 
     564             :     /*
     565             :      * Pessimistically assume that every input scan key will be output with
     566             :      * its own SAOP array
     567             :      */
     568          64 :     estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
     569             :         sizeof(int) * nkeys;
     570             : 
     571             :     /* Single column indexes cannot possibly use a skip array */
     572          64 :     if (nkeyatts == 1)
     573          46 :         return estnbtreeshared;
     574             : 
     575             :     /*
     576             :      * Pessimistically assume that all attributes prior to the least
     577             :      * significant attribute require a skip array (and an associated key)
     578             :      */
     579          18 :     genericattrspace = datumEstimateSpace((Datum) 0, false, true,
     580             :                                           sizeof(Datum));
     581          36 :     for (int attnum = 1; attnum < nkeyatts; attnum++)
     582             :     {
     583             :         CompactAttribute *attr;
     584             : 
     585             :         /*
     586             :          * We make the conservative assumption that every index column will
     587             :          * also require a skip array.
     588             :          *
     589             :          * Every skip array must have space to store its scan key's sk_flags.
     590             :          */
     591          18 :         estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
     592             : 
     593             :         /* Consider space required to store a datum of opclass input type */
     594          18 :         attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
     595          18 :         if (attr->attbyval)
     596             :         {
     597             :             /* This index attribute stores pass-by-value datums */
     598          18 :             Size        estfixed = datumEstimateSpace((Datum) 0, false,
     599          18 :                                                       true, attr->attlen);
     600             : 
     601          18 :             estnbtreeshared = add_size(estnbtreeshared, estfixed);
     602          18 :             continue;
     603             :         }
     604             : 
     605             :         /*
     606             :          * This index attribute stores pass-by-reference datums.
     607             :          *
     608             :          * Assume that serializing this array will use just as much space as a
     609             :          * pass-by-value datum, in addition to space for the largest possible
     610             :          * whole index tuple (this is not just a per-datum portion of the
     611             :          * largest possible tuple because that'd be almost as large anyway).
     612             :          *
     613             :          * This is quite conservative, but it's not clear how we could do much
     614             :          * better.  The executor requires an up-front storage request size
     615             :          * that reliably covers the scan's high watermark memory usage.  We
     616             :          * can't be sure of the real high watermark until the scan is over.
     617             :          */
     618           0 :         estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
     619           0 :         estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
     620             :     }
     621             : 
     622          18 :     return estnbtreeshared;
     623             : }
     624             : 
     625             : /*
     626             :  * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
     627             :  *
     628             :  * Caller must have exclusively locked btscan->btps_lock when called.
     629             :  */
     630             : static void
     631          36 : _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
     632             :                               BTScanOpaque so)
     633             : {
     634             :     char       *datumshared;
     635             : 
     636             :     /* Space for serialized datums begins immediately after btps_arrElems[] */
     637          36 :     datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
     638          72 :     for (int i = 0; i < so->numArrayKeys; i++)
     639             :     {
     640          36 :         BTArrayKeyInfo *array = &so->arrayKeys[i];
     641          36 :         ScanKey     skey = &so->keyData[array->scan_key];
     642             : 
     643          36 :         if (array->num_elems != -1)
     644             :         {
     645             :             /* Save SAOP array's cur_elem (no need to copy key/datum) */
     646             :             Assert(!(skey->sk_flags & SK_BT_SKIP));
     647          36 :             btscan->btps_arrElems[i] = array->cur_elem;
     648          36 :             continue;
     649             :         }
     650             : 
     651             :         /* Save all mutable state associated with skip array's key */
     652             :         Assert(skey->sk_flags & SK_BT_SKIP);
     653           0 :         memcpy(datumshared, &skey->sk_flags, sizeof(int));
     654           0 :         datumshared += sizeof(int);
     655             : 
     656           0 :         if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
     657             :         {
     658             :             /* No sk_argument datum to serialize */
     659             :             Assert(skey->sk_argument == 0);
     660           0 :             continue;
     661             :         }
     662             : 
     663           0 :         datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
     664           0 :                        array->attbyval, array->attlen, &datumshared);
     665             :     }
     666          36 : }
     667             : 
     668             : /*
     669             :  * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
     670             :  *
     671             :  * Caller must have exclusively locked btscan->btps_lock when called.
     672             :  */
     673             : static void
     674          36 : _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
     675             :                             BTScanOpaque so)
     676             : {
     677             :     char       *datumshared;
     678             : 
     679             :     /* Space for serialized datums begins immediately after btps_arrElems[] */
     680          36 :     datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
     681          72 :     for (int i = 0; i < so->numArrayKeys; i++)
     682             :     {
     683          36 :         BTArrayKeyInfo *array = &so->arrayKeys[i];
     684          36 :         ScanKey     skey = &so->keyData[array->scan_key];
     685             :         bool        isnull;
     686             : 
     687          36 :         if (array->num_elems != -1)
     688             :         {
     689             :             /* Restore SAOP array using its saved cur_elem */
     690             :             Assert(!(skey->sk_flags & SK_BT_SKIP));
     691          36 :             array->cur_elem = btscan->btps_arrElems[i];
     692          36 :             skey->sk_argument = array->elem_values[array->cur_elem];
     693          36 :             continue;
     694             :         }
     695             : 
     696             :         /* Restore skip array by restoring its key directly */
     697           0 :         if (!array->attbyval && skey->sk_argument)
     698           0 :             pfree(DatumGetPointer(skey->sk_argument));
     699           0 :         skey->sk_argument = (Datum) 0;
     700           0 :         memcpy(&skey->sk_flags, datumshared, sizeof(int));
     701           0 :         datumshared += sizeof(int);
     702             : 
     703             :         Assert(skey->sk_flags & SK_BT_SKIP);
     704             : 
     705           0 :         if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
     706             :         {
     707             :             /* No sk_argument datum to restore */
     708           0 :             continue;
     709             :         }
     710             : 
     711           0 :         skey->sk_argument = datumRestore(&datumshared, &isnull);
     712           0 :         if (isnull)
     713             :         {
     714             :             Assert(skey->sk_argument == 0);
     715             :             Assert(skey->sk_flags & SK_SEARCHNULL);
     716             :             Assert(skey->sk_flags & SK_ISNULL);
     717             :         }
     718             :     }
     719          36 : }
     720             : 
     721             : /*
     722             :  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
     723             :  */
     724             : void
     725          64 : btinitparallelscan(void *target)
     726             : {
     727          64 :     BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
     728             : 
     729          64 :     LWLockInitialize(&bt_target->btps_lock,
     730             :                      LWTRANCHE_PARALLEL_BTREE_SCAN);
     731          64 :     bt_target->btps_nextScanPage = InvalidBlockNumber;
     732          64 :     bt_target->btps_lastCurrPage = InvalidBlockNumber;
     733          64 :     bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     734          64 :     ConditionVariableInit(&bt_target->btps_cv);
     735          64 : }
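
A sketch of the surrounding contract (assumed, per
index_parallelscan_estimate and index_parallelscan_initialize in indexam.c):
core first sizes the AM's shared area, then hands btinitparallelscan a
pointer to that much shared memory.

    /*
     * size = amestimateparallelscan(rel, nkeys, norderbys);
     * ... reserve "size" bytes at parallel_scan->ps_offset_am ...
     * aminitparallelscan(OffsetToPointer(parallel_scan,
     *                                    parallel_scan->ps_offset_am));
     */
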
     736             : 
     737             : /*
     738             :  *  btparallelrescan() -- reset parallel scan
     739             :  */
     740             : void
     741          24 : btparallelrescan(IndexScanDesc scan)
     742             : {
     743             :     BTParallelScanDesc btscan;
     744          24 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     745             : 
     746             :     Assert(parallel_scan);
     747             : 
     748          24 :     btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
     749             :                                                   parallel_scan->ps_offset_am);
     750             : 
     751             :     /*
     752             :      * In theory, we don't need to acquire the LWLock here, because there
     753             :      * shouldn't be any other workers running at this point, but we do so for
     754             :      * consistency.
     755             :      */
     756          24 :     LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
     757          24 :     btscan->btps_nextScanPage = InvalidBlockNumber;
     758          24 :     btscan->btps_lastCurrPage = InvalidBlockNumber;
     759          24 :     btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     760          24 :     LWLockRelease(&btscan->btps_lock);
     761          24 : }
     762             : 
     763             : /*
     764             :  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
     765             :  *      page.  Other scans must wait until we call _bt_parallel_release()
     766             :  *      or _bt_parallel_done().
     767             :  *
     768             :  * The return value is true if we successfully seized the scan and false
     769             :  * if we did not.  The latter case occurs when no pages remain, or when
     770             :  * another primitive index scan is scheduled that caller's backend cannot
     771             :  * start just yet (only backends that call from _bt_first are capable of
     772             :  * starting primitive index scans, which they indicate by passing first=true).
     773             :  *
     774             :  * If the return value is true, *next_scan_page returns the next page of the
     775             :  * scan, and *last_curr_page returns the page that *next_scan_page came from.
     776             :  * An invalid *next_scan_page means the scan hasn't yet started, or that
     777             :  * caller needs to start the next primitive index scan (if it's the latter
     778             :  * case we'll set so.needPrimScan).
     779             :  *
     780             :  * Callers should ignore the value of *next_scan_page and *last_curr_page if
     781             :  * the return value is false.
     782             :  */
     783             : bool
     784        1658 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
     785             :                    BlockNumber *last_curr_page, bool first)
     786             : {
     787        1658 :     Relation    rel = scan->indexRelation;
     788        1658 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     789        1658 :     bool        exit_loop = false,
     790        1658 :                 status = true,
     791        1658 :                 endscan = false;
     792        1658 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     793             :     BTParallelScanDesc btscan;
     794             : 
     795        1658 :     *next_scan_page = InvalidBlockNumber;
     796        1658 :     *last_curr_page = InvalidBlockNumber;
     797             : 
     798             :     /*
     799             :      * Reset so->currPos, and initialize moreLeft/moreRight such that the next
     800             :      * call to _bt_readnextpage treats this backend similarly to a serial
     801             :      * backend that steps from *last_curr_page to *next_scan_page (unless this
     802             :      * backend's so->currPos is initialized by _bt_readfirstpage before then).
     803             :      */
     804        1658 :     BTScanPosInvalidate(so->currPos);
     805        1658 :     so->currPos.moreLeft = so->currPos.moreRight = true;
     806             : 
     807        1658 :     if (first)
     808             :     {
     809             :         /*
     810             :          * Initialize array related state when called from _bt_first, assuming
     811             :          * that this will be the first primitive index scan for the scan
     812             :          */
     813         446 :         so->needPrimScan = false;
     814         446 :         so->scanBehind = false;
     815         446 :         so->oppositeDirCheck = false;
     816             :     }
     817             :     else
     818             :     {
     819             :         /*
     820             :          * Don't attempt to seize the scan when it requires another primitive
     821             :          * index scan, since caller's backend cannot start it right now
     822             :          */
     823        1212 :         if (so->needPrimScan)
     824           0 :             return false;
     825             :     }
     826             : 
     827        1658 :     btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
     828             :                                                   parallel_scan->ps_offset_am);
     829             : 
     830             :     while (1)
     831             :     {
     832        1658 :         LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
     833             : 
     834        1658 :         if (btscan->btps_pageStatus == BTPARALLEL_DONE)
     835             :         {
     836             :             /* We're done with this parallel index scan */
     837         314 :             status = false;
     838             :         }
     839        1344 :         else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
     840        1220 :                  btscan->btps_nextScanPage == P_NONE)
     841             :         {
     842             :             /* End this parallel index scan */
     843           8 :             status = false;
     844           8 :             endscan = true;
     845             :         }
     846        1336 :         else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
     847             :         {
     848             :             Assert(so->numArrayKeys);
     849             : 
     850          36 :             if (first)
     851             :             {
     852             :                 /* Can start scheduled primitive scan right away, so do so */
     853          36 :                 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
     854             : 
     855             :                 /* Restore scan's array keys from serialized values */
     856          36 :                 _bt_parallel_restore_arrays(rel, btscan, so);
     857          36 :                 exit_loop = true;
     858             :             }
     859             :             else
     860             :             {
     861             :                 /*
     862             :                  * Don't attempt to seize the scan when it requires another
     863             :                  * primitive index scan, since caller's backend cannot start
     864             :                  * it right now
     865             :                  */
     866           0 :                 status = false;
     867             :             }
     868             : 
     869             :             /*
     870             :              * Either way, update backend local state to indicate that a
     871             :              * pending primitive scan is required
     872             :              */
     873          36 :             so->needPrimScan = true;
     874          36 :             so->scanBehind = false;
     875          36 :             so->oppositeDirCheck = false;
     876             :         }
     877        1300 :         else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
     878             :         {
     879             :             /*
     880             :              * We have successfully seized control of the scan for the purpose
     881             :              * of advancing it to a new page!
     882             :              */
     883        1300 :             btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
     884             :             Assert(btscan->btps_nextScanPage != P_NONE);
     885        1300 :             *next_scan_page = btscan->btps_nextScanPage;
     886        1300 :             *last_curr_page = btscan->btps_lastCurrPage;
     887        1300 :             exit_loop = true;
     888             :         }
     889        1658 :         LWLockRelease(&btscan->btps_lock);
     890        1658 :         if (exit_loop || !status)
     891             :             break;
     892           0 :         ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
     893             :     }
     894        1658 :     ConditionVariableCancelSleep();
     895             : 
     896             :     /* When the scan has reached the rightmost (or leftmost) page, end it */
     897        1658 :     if (endscan)
     898           8 :         _bt_parallel_done(scan);
     899             : 
     900        1658 :     return status;
     901             : }
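
A simplified caller sketch (assumed; the real pairing lives in _bt_first and
_bt_readnextpage in nbtsearch.c).  example_read_page_get_sibling is a
hypothetical helper standing in for reading the page and noting its sibling
link in the scan's direction.

    static bool
    example_advance_parallel_scan(IndexScanDesc scan)
    {
        BlockNumber next_scan_page,
                    last_curr_page,
                    next_link;

        if (!_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, false))
            return false;       /* scan done, or a pending primitive scan
                                 * that only a _bt_first caller may start */

        /* hypothetical: read the page, remember its sibling link */
        next_link = example_read_page_get_sibling(scan, next_scan_page);

        /* publish the link so another backend can advance the scan */
        _bt_parallel_release(scan, next_link, next_scan_page);

        return true;
    }
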
     902             : 
     903             : /*
     904             :  * _bt_parallel_release() -- Complete the process of advancing the scan to a
     905             :  *      new page.  We now have the new value btps_nextScanPage; another backend
     906             :  *      can now begin advancing the scan.
     907             :  *
     908             :  * Callers whose scan uses array keys must save their curr_page argument so
     909             :  * that it can be passed to _bt_parallel_primscan_schedule, should caller
     910             :  * determine that another primitive index scan is required.
     911             :  *
     912             :  * If caller's next_scan_page is P_NONE, the scan has reached the index's
     913             :  * rightmost/leftmost page.  This is treated as reaching the end of the scan
     914             :  * within _bt_parallel_seize.
     915             :  *
     916             :  * Note: unlike the serial case, parallel scans don't need to remember both
     917             :  * sibling links.  next_scan_page is whichever link is next given the scan's
     918             :  * direction.  That's all we'll ever need, since the direction of a parallel
     919             :  * scan can never change.
     920             :  */
     921             : void
     922        1336 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
     923             :                      BlockNumber curr_page)
     924             : {
     925        1336 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     926             :     BTParallelScanDesc btscan;
     927             : 
     928             :     Assert(BlockNumberIsValid(next_scan_page));
     929             : 
     930        1336 :     btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
     931             :                                                   parallel_scan->ps_offset_am);
     932             : 
     933        1336 :     LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
     934        1336 :     btscan->btps_nextScanPage = next_scan_page;
     935        1336 :     btscan->btps_lastCurrPage = curr_page;
     936        1336 :     btscan->btps_pageStatus = BTPARALLEL_IDLE;
     937        1336 :     LWLockRelease(&btscan->btps_lock);
     938        1336 :     ConditionVariableSignal(&btscan->btps_cv);
     939        1336 : }
     940             : 
     941             : /*
     942             :  * _bt_parallel_done() -- Mark the parallel scan as complete.
     943             :  *
     944             :  * When there are no pages left to scan, this function should be called to
     945             :  * notify other workers.  Otherwise, they might wait forever for the scan to
     946             :  * advance to the next page.
     947             :  */
     948             : void
     949     6945586 : _bt_parallel_done(IndexScanDesc scan)
     950             : {
     951     6945586 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     952     6945586 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     953             :     BTParallelScanDesc btscan;
     954     6945586 :     bool        status_changed = false;
     955             : 
     956             :     Assert(!BTScanPosIsValid(so->currPos));
     957             : 
     958             :     /* Do nothing, for non-parallel scans */
     959     6945586 :     if (parallel_scan == NULL)
     960     6945430 :         return;
     961             : 
     962             :     /*
     963             :      * Should not mark parallel scan done when there's still a pending
     964             :      * primitive index scan
     965             :      */
     966         156 :     if (so->needPrimScan)
     967          36 :         return;
     968             : 
     969         120 :     btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
     970             :                                                   parallel_scan->ps_offset_am);
     971             : 
     972             :     /*
     973             :      * Mark the parallel scan as done, unless some other process did so
     974             :      * already
     975             :      */
     976         120 :     LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
     977             :     Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
     978         120 :     if (btscan->btps_pageStatus != BTPARALLEL_DONE)
     979             :     {
     980          88 :         btscan->btps_pageStatus = BTPARALLEL_DONE;
     981          88 :         status_changed = true;
     982             :     }
     983         120 :     LWLockRelease(&btscan->btps_lock);
     984             : 
     985             :     /* wake up all the workers associated with this parallel scan */
     986         120 :     if (status_changed)
     987          88 :         ConditionVariableBroadcast(&btscan->btps_cv);
     988             : }
     989             : 
     990             : /*
     991             :  * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
     992             :  *
     993             :  * Caller passes the curr_page most recently passed to _bt_parallel_release
     994             :  * by its backend.  Caller successfully schedules the next primitive index scan
     995             :  * if the shared parallel state hasn't been seized since caller's backend last
     996             :  * advanced the scan.
     997             :  */
     998             : void
     999          36 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
    1000             : {
    1001          36 :     Relation    rel = scan->indexRelation;
    1002          36 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1003          36 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    1004             :     BTParallelScanDesc btscan;
    1005             : 
    1006             :     Assert(so->numArrayKeys);
    1007             : 
    1008          36 :     btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
    1009             :                                                   parallel_scan->ps_offset_am);
    1010             : 
    1011          36 :     LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    1012          36 :     if (btscan->btps_lastCurrPage == curr_page &&
    1013          36 :         btscan->btps_pageStatus == BTPARALLEL_IDLE)
    1014             :     {
    1015          36 :         btscan->btps_nextScanPage = InvalidBlockNumber;
    1016          36 :         btscan->btps_lastCurrPage = InvalidBlockNumber;
    1017          36 :         btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
    1018             : 
    1019             :         /* Serialize scan's current array keys */
    1020          36 :         _bt_parallel_serialize_arrays(rel, btscan, so);
    1021             :     }
    1022          36 :     LWLockRelease(&btscan->btps_lock);
    1023          36 : }
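
The btps_lastCurrPage/btps_pageStatus test above means scheduling succeeds only if no other backend has seized the scan since this backend last called _bt_parallel_release(). A hedged caller-side sketch (surrounding control flow elided; currPos.currPage is the field this file uses to track the scan's current page):

    /* the page this backend most recently passed to _bt_parallel_release() */
    BlockNumber curr_page = so->currPos.currPage;

    /* ask for another primitive index scan over the scan's array keys */
    _bt_parallel_primscan_schedule(scan, curr_page);
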
    1024             : 
    1025             : /*
    1026             :  * Bulk deletion of all index entries pointing to a set of heap tuples.
    1027             :  * The set of target tuples is specified via a callback routine that tells
    1028             :  * whether any given heap tuple (identified by ItemPointer) is being deleted.
    1029             :  *
    1030             :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
    1031             :  */
    1032             : IndexBulkDeleteResult *
    1033        2868 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
    1034             :              IndexBulkDeleteCallback callback, void *callback_state)
    1035             : {
    1036        2868 :     Relation    rel = info->index;
    1037             :     BTCycleId   cycleid;
    1038             : 
    1039             :     /* allocate stats if first time through, else re-use existing struct */
    1040        2868 :     if (stats == NULL)
    1041        2852 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    1042             : 
    1043             :     /* Establish the vacuum cycle ID to use for this scan */
    1044             :     /* The ENSURE stuff ensures we clean up shared memory on failure */
    1045        2868 :     PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    1046             :     {
    1047        2868 :         cycleid = _bt_start_vacuum(rel);
    1048             : 
    1049        2868 :         btvacuumscan(info, stats, callback, callback_state, cycleid);
    1050             :     }
    1051        2868 :     PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    1052        2868 :     _bt_end_vacuum(rel);
    1053             : 
    1054        2868 :     return stats;
    1055             : }
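
btbulkdelete() never examines table TIDs itself; deciding which entries die is entirely the callback's job. A self-contained sketch of what an IndexBulkDeleteCallback can look like, assuming the caller keeps its dead TIDs in a sorted array (DeadTidState, tid_cmp, and example_tid_reaped are hypothetical names, not PostgreSQL APIs):

#include "postgres.h"
#include "storage/itemptr.h"

typedef struct DeadTidState
{
    ItemPointerData *dead_tids; /* sorted with tid_cmp() */
    int         num_dead;
} DeadTidState;

static int
tid_cmp(const void *a, const void *b)
{
    return ItemPointerCompare((ItemPointer) a, (ItemPointer) b);
}

/* matches IndexBulkDeleteCallback: true => delete this index entry */
static bool
example_tid_reaped(ItemPointer itemptr, void *state)
{
    DeadTidState *ds = (DeadTidState *) state;

    return bsearch(itemptr, ds->dead_tids, ds->num_dead,
                   sizeof(ItemPointerData), tid_cmp) != NULL;
}

A vacuum-like caller would then pass example_tid_reaped as callback and a DeadTidState as callback_state.
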
    1056             : 
    1057             : /*
    1058             :  * Post-VACUUM cleanup.
    1059             :  *
    1060             :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
    1061             :  */
    1062             : IndexBulkDeleteResult *
    1063      177820 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
    1064             : {
    1065             :     BlockNumber num_delpages;
    1066             : 
    1067             :     /* No-op in ANALYZE ONLY mode */
    1068      177820 :     if (info->analyze_only)
    1069       17382 :         return stats;
    1070             : 
    1071             :     /*
    1072             :      * If btbulkdelete was called, we need not do anything (we just maintain
    1073             :      * the information used within _bt_vacuum_needs_cleanup() by calling
    1074             :      * _bt_set_cleanup_info() below).
    1075             :      *
    1076             :      * If btbulkdelete was _not_ called, then we have a choice to make: we
    1077             :      * must decide whether or not a btvacuumscan() call is needed now (i.e.
    1078             :      * whether the ongoing VACUUM operation can entirely avoid a physical scan
    1079             :      * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
    1080             :      * now.
    1081             :      */
    1082      160438 :     if (stats == NULL)
    1083             :     {
    1084             :         /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
    1085      158170 :         if (!_bt_vacuum_needs_cleanup(info->index))
    1086      158156 :             return NULL;
    1087             : 
    1088             :         /*
    1089             :          * Since we aren't going to actually delete any leaf items, there's no
    1090             :          * need to go through all the vacuum-cycle-ID pushups here.
    1091             :          *
    1092             :          * Posting list tuples are a source of inaccuracy for cleanup-only
    1093             :          * scans.  btvacuumscan() will assume that the number of index tuples
    1094             :          * from each page can be used as num_index_tuples, even though
    1095             :          * num_index_tuples is supposed to represent the number of TIDs in the
    1096             :          * index.  This naive approach can underestimate the number of tuples
    1097             :          * in the index significantly.
    1098             :          *
     1099             :          * We handle the problem by making num_index_tuples an estimate in
     1100             :          * the cleanup-only case.
    1101             :          */
    1102          14 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    1103          14 :         btvacuumscan(info, stats, NULL, NULL, 0);
    1104          14 :         stats->estimated_count = true;
    1105             :     }
    1106             : 
    1107             :     /*
    1108             :      * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
    1109             :      *
    1110             :      * num_delpages is the number of deleted pages now in the index that were
    1111             :      * not safe to place in the FSM to be recycled just yet.  num_delpages is
    1112             :      * greater than 0 only when _bt_pagedel() actually deleted pages during
    1113             :      * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
    1114             :      * have failed to place any newly deleted pages in the FSM just moments
    1115             :      * ago.  (Actually, there are edge cases where recycling of the current
    1116             :      * VACUUM's newly deleted pages does not even become safe by the time the
    1117             :      * next VACUUM comes around.  See nbtree/README.)
    1118             :      */
    1119             :     Assert(stats->pages_deleted >= stats->pages_free);
    1120        2282 :     num_delpages = stats->pages_deleted - stats->pages_free;
    1121        2282 :     _bt_set_cleanup_info(info->index, num_delpages);
    1122             : 
    1123             :     /*
    1124             :      * It's quite possible for us to be fooled by concurrent page splits into
    1125             :      * double-counting some index tuples, so disbelieve any total that exceeds
    1126             :      * the underlying heap's count ... if we know that accurately.  Otherwise
    1127             :      * this might just make matters worse.
    1128             :      */
    1129        2282 :     if (!info->estimated_count)
    1130             :     {
    1131        2228 :         if (stats->num_index_tuples > info->num_heap_tuples)
    1132          22 :             stats->num_index_tuples = info->num_heap_tuples;
    1133             :     }
    1134             : 
    1135        2282 :     return stats;
    1136             : }
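
A worked example of the bookkeeping above, with hypothetical numbers: if btvacuumscan() deleted ten pages (stats->pages_deleted = 10) but _bt_pendingfsm_finalize() managed to place only seven of them in the FSM (stats->pages_free = 7), then num_delpages = 10 - 7 = 3. _bt_set_cleanup_info() stores that 3 in the metapage, and a later _bt_vacuum_needs_cleanup() uses it to decide whether a cleanup-only scan is worthwhile.
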
    1137             : 
    1138             : /*
    1139             :  * btvacuumscan --- scan the index for VACUUMing purposes
    1140             :  *
    1141             :  * This combines the functions of looking for leaf tuples that are deletable
    1142             :  * according to the vacuum callback, looking for empty pages that can be
    1143             :  * deleted, and looking for old deleted pages that can be recycled.  Both
    1144             :  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
    1145             :  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
    1146             :  *
    1147             :  * The caller is responsible for initially allocating/zeroing a stats struct
    1148             :  * and for obtaining a vacuum cycle ID if necessary.
    1149             :  */
    1150             : static void
    1151        2882 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
    1152             :              IndexBulkDeleteCallback callback, void *callback_state,
    1153             :              BTCycleId cycleid)
    1154             : {
    1155        2882 :     Relation    rel = info->index;
    1156             :     BTVacState  vstate;
    1157             :     BlockNumber num_pages;
    1158             :     bool        needLock;
    1159             :     BlockRangeReadStreamPrivate p;
    1160        2882 :     ReadStream *stream = NULL;
    1161             : 
    1162             :     /*
    1163             :      * Reset fields that track information about the entire index now.  This
    1164             :      * avoids double-counting in the case where a single VACUUM command
    1165             :      * requires multiple scans of the index.
    1166             :      *
    1167             :      * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
    1168             :      * since they track information about the VACUUM command, and so must last
    1169             :      * across each call to btvacuumscan().
    1170             :      *
    1171             :      * (Note that pages_free is treated as state about the whole index, not
    1172             :      * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
    1173             :      * calls are idempotent, and get repeated for the same deleted pages in
    1174             :      * some scenarios.  The point for us is to track the number of recyclable
    1175             :      * pages in the index at the end of the VACUUM command.)
    1176             :      */
    1177        2882 :     stats->num_pages = 0;
    1178        2882 :     stats->num_index_tuples = 0;
    1179        2882 :     stats->pages_deleted = 0;
    1180        2882 :     stats->pages_free = 0;
    1181             : 
    1182             :     /* Set up info to pass down to btvacuumpage */
    1183        2882 :     vstate.info = info;
    1184        2882 :     vstate.stats = stats;
    1185        2882 :     vstate.callback = callback;
    1186        2882 :     vstate.callback_state = callback_state;
    1187        2882 :     vstate.cycleid = cycleid;
    1188             : 
    1189             :     /* Create a temporary memory context to run _bt_pagedel in */
    1190        2882 :     vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
    1191             :                                                   "_bt_pagedel",
    1192             :                                                   ALLOCSET_DEFAULT_SIZES);
    1193             : 
    1194             :     /* Initialize vstate fields used by _bt_pendingfsm_finalize */
    1195        2882 :     vstate.bufsize = 0;
    1196        2882 :     vstate.maxbufsize = 0;
    1197        2882 :     vstate.pendingpages = NULL;
    1198        2882 :     vstate.npendingpages = 0;
    1199             :     /* Consider applying _bt_pendingfsm_finalize optimization */
    1200        2882 :     _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
    1201             : 
    1202             :     /*
    1203             :      * The outer loop iterates over all index pages except the metapage, in
    1204             :      * physical order (we hope the kernel will cooperate in providing
    1205             :      * read-ahead for speed).  It is critical that we visit all leaf pages,
    1206             :      * including ones added after we start the scan, else we might fail to
    1207             :      * delete some deletable tuples.  Hence, we must repeatedly check the
    1208             :      * relation length.  We must acquire the relation-extension lock while
    1209             :      * doing so to avoid a race condition: if someone else is extending the
    1210             :      * relation, there is a window where bufmgr/smgr have created a new
    1211             :      * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
    1212             :      * we manage to scan such a page here, we'll improperly assume it can be
    1213             :      * recycled.  Taking the lock synchronizes things enough to prevent a
    1214             :      * problem: either num_pages won't include the new page, or _bt_getbuf
    1215             :      * already has write lock on the buffer and it will be fully initialized
    1216             :      * before we can examine it.  Also, we need not worry if a page is added
    1217             :      * immediately after we look; the page splitting code already has
    1218             :      * write-lock on the left page before it adds a right page, so we must
    1219             :      * already have processed any tuples due to be moved into such a page.
    1220             :      *
     1221             :      * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, the
     1222             :      * extension lock no longer seems to be required here.
    1223             :      *
    1224             :      * We can skip locking for new or temp relations, however, since no one
    1225             :      * else could be accessing them.
    1226             :      */
    1227        2882 :     needLock = !RELATION_IS_LOCAL(rel);
    1228             : 
    1229        2882 :     p.current_blocknum = BTREE_METAPAGE + 1;
    1230             : 
    1231             :     /*
     1232             :      * It is safe to use batch mode (READ_STREAM_USE_BATCHING), since
     1233             :      * block_range_read_stream_cb takes no locks.
    1234             :      */
    1235        2882 :     stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
    1236             :                                         READ_STREAM_FULL |
    1237             :                                         READ_STREAM_USE_BATCHING,
    1238             :                                         info->strategy,
    1239             :                                         rel,
    1240             :                                         MAIN_FORKNUM,
    1241             :                                         block_range_read_stream_cb,
    1242             :                                         &p,
    1243             :                                         0);
    1244             :     for (;;)
    1245             :     {
    1246             :         /* Get the current relation length */
    1247        5492 :         if (needLock)
    1248        5488 :             LockRelationForExtension(rel, ExclusiveLock);
    1249        5492 :         num_pages = RelationGetNumberOfBlocks(rel);
    1250        5492 :         if (needLock)
    1251        5488 :             UnlockRelationForExtension(rel, ExclusiveLock);
    1252             : 
    1253        5492 :         if (info->report_progress)
    1254         894 :             pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
    1255             :                                          num_pages);
    1256             : 
    1257             :         /* Quit if we've scanned the whole relation */
    1258        5492 :         if (p.current_blocknum >= num_pages)
    1259        2882 :             break;
    1260             : 
    1261        2610 :         p.last_exclusive = num_pages;
    1262             : 
    1263             :         /* Iterate over pages, then loop back to recheck relation length */
    1264             :         while (true)
    1265       25566 :         {
    1266             :             BlockNumber current_block;
    1267             :             Buffer      buf;
    1268             : 
    1269             :             /* call vacuum_delay_point while not holding any buffer lock */
    1270       28176 :             vacuum_delay_point(false);
    1271             : 
    1272       28176 :             buf = read_stream_next_buffer(stream, NULL);
    1273             : 
    1274       28176 :             if (!BufferIsValid(buf))
    1275        2610 :                 break;
    1276             : 
    1277       25566 :             current_block = btvacuumpage(&vstate, buf);
    1278             : 
    1279       25566 :             if (info->report_progress)
    1280         952 :                 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
    1281             :                                              current_block);
    1282             :         }
    1283             : 
    1284             :         /*
    1285             :          * We have to reset the read stream to use it again. After returning
    1286             :          * InvalidBuffer, the read stream API won't invoke our callback again
    1287             :          * until the stream has been reset.
    1288             :          */
    1289        2610 :         read_stream_reset(stream);
    1290             :     }
    1291             : 
    1292        2882 :     read_stream_end(stream);
    1293             : 
    1294             :     /* Set statistics num_pages field to final size of index */
    1295        2882 :     stats->num_pages = num_pages;
    1296             : 
    1297        2882 :     MemoryContextDelete(vstate.pagedelcontext);
    1298             : 
    1299             :     /*
    1300             :      * If there were any calls to _bt_pagedel() during scan of the index then
    1301             :      * see if any of the resulting pages can be placed in the FSM now.  When
    1302             :      * it's not safe we'll have to leave it up to a future VACUUM operation.
    1303             :      *
    1304             :      * Finally, if we placed any pages in the FSM (either just now or during
    1305             :      * the scan), forcibly update the upper-level FSM pages to ensure that
    1306             :      * searchers can find them.
    1307             :      */
    1308        2882 :     _bt_pendingfsm_finalize(rel, &vstate);
    1309        2882 :     if (stats->pages_free > 0)
    1310          48 :         IndexFreeSpaceMapVacuum(rel);
    1311        2882 : }
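
Reduced to its essentials, the read stream pattern used above looks like the following hedged sketch (example_scan_block_range is a hypothetical name; block_range_read_stream_cb and BlockRangeReadStreamPrivate are the stock callback and state from storage/read_stream.h, exactly as used by btvacuumscan):

static void
example_scan_block_range(Relation rel, BlockNumber start, BlockNumber end)
{
    BlockRangeReadStreamPrivate p;
    ReadStream *stream;
    Buffer      buf;

    p.current_blocknum = start;
    p.last_exclusive = end;

    stream = read_stream_begin_relation(READ_STREAM_FULL,
                                        NULL,   /* default buffer strategy */
                                        rel,
                                        MAIN_FORKNUM,
                                        block_range_read_stream_cb,
                                        &p,
                                        0);     /* no per-buffer data */

    while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
    {
        /* ... examine the page in buf here ... */
        ReleaseBuffer(buf);
    }

    read_stream_end(stream);
}

btvacuumscan additionally resets the stream with read_stream_reset() so that the same stream can cover newly added blocks on the next pass of its outer loop.
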
    1312             : 
    1313             : /*
    1314             :  * btvacuumpage --- VACUUM one page
    1315             :  *
     1316             :  * This processes a single page for btvacuumscan().  In some cases we must
     1317             :  * backtrack to re-examine and VACUUM a page holding tuples that were on
     1318             :  * buf's page during a previous call here.  This is how we handle page
     1319             :  * splits (that happened after our cycleid was acquired) whose right half
     1320             :  * page happened to reuse a block that we might have processed at some
     1321             :  * point before it was recycled (i.e. before the page split).
    1322             :  *
    1323             :  * Returns BlockNumber of a scanned page (not backtracked).
    1324             :  */
    1325             : static BlockNumber
    1326       25566 : btvacuumpage(BTVacState *vstate, Buffer buf)
    1327             : {
    1328       25566 :     IndexVacuumInfo *info = vstate->info;
    1329       25566 :     IndexBulkDeleteResult *stats = vstate->stats;
    1330       25566 :     IndexBulkDeleteCallback callback = vstate->callback;
    1331       25566 :     void       *callback_state = vstate->callback_state;
    1332       25566 :     Relation    rel = info->index;
    1333       25566 :     Relation    heaprel = info->heaprel;
    1334             :     bool        attempt_pagedel;
    1335             :     BlockNumber blkno,
    1336             :                 backtrack_to;
    1337       25566 :     BlockNumber scanblkno = BufferGetBlockNumber(buf);
    1338             :     Page        page;
    1339             :     BTPageOpaque opaque;
    1340             : 
    1341       25566 :     blkno = scanblkno;
    1342             : 
    1343       25566 : backtrack:
    1344             : 
    1345       25566 :     attempt_pagedel = false;
    1346       25566 :     backtrack_to = P_NONE;
    1347             : 
    1348       25566 :     _bt_lockbuf(rel, buf, BT_READ);
    1349       25566 :     page = BufferGetPage(buf);
    1350       25566 :     opaque = NULL;
    1351       25566 :     if (!PageIsNew(page))
    1352             :     {
    1353       25566 :         _bt_checkpage(rel, buf);
    1354       25566 :         opaque = BTPageGetOpaque(page);
    1355             :     }
    1356             : 
    1357             :     Assert(blkno <= scanblkno);
    1358       25566 :     if (blkno != scanblkno)
    1359             :     {
    1360             :         /*
    1361             :          * We're backtracking.
    1362             :          *
    1363             :          * We followed a right link to a sibling leaf page (a page that
    1364             :          * happens to be from a block located before scanblkno).  The only
    1365             :          * case we want to do anything with is a live leaf page having the
    1366             :          * current vacuum cycle ID.
    1367             :          *
    1368             :          * The page had better be in a state that's consistent with what we
    1369             :          * expect.  Check for conditions that imply corruption in passing.  It
    1370             :          * can't be half-dead because only an interrupted VACUUM process can
    1371             :          * leave pages in that state, so we'd definitely have dealt with it
    1372             :          * back when the page was the scanblkno page (half-dead pages are
    1373             :          * always marked fully deleted by _bt_pagedel(), barring corruption).
    1374             :          */
    1375           0 :         if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
    1376             :         {
    1377             :             Assert(false);
    1378           0 :             ereport(LOG,
    1379             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    1380             :                      errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
    1381             :                                      blkno, scanblkno, RelationGetRelationName(rel))));
    1382           0 :             _bt_relbuf(rel, buf);
    1383           0 :             return scanblkno;
    1384             :         }
    1385             : 
    1386             :         /*
    1387             :          * We may have already processed the page in an earlier call, when the
    1388             :          * page was scanblkno.  This happens when the leaf page split occurred
    1389             :          * after the scan began, but before the right sibling page became the
    1390             :          * scanblkno.
    1391             :          *
    1392             :          * Page may also have been deleted by current btvacuumpage() call,
    1393             :          * since _bt_pagedel() sometimes deletes the right sibling page of
    1394             :          * scanblkno in passing (it does so after we decided where to
    1395             :          * backtrack to).  We don't need to process this page as a deleted
    1396             :          * page a second time now (in fact, it would be wrong to count it as a
    1397             :          * deleted page in the bulk delete statistics a second time).
    1398             :          */
    1399           0 :         if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
    1400             :         {
    1401             :             /* Done with current scanblkno (and all lower split pages) */
    1402           0 :             _bt_relbuf(rel, buf);
    1403           0 :             return scanblkno;
    1404             :         }
    1405             :     }
    1406             : 
    1407       25566 :     if (!opaque || BTPageIsRecyclable(page, heaprel))
    1408             :     {
    1409             :         /* Okay to recycle this page (which could be leaf or internal) */
    1410        1248 :         RecordFreeIndexPage(rel, blkno);
    1411        1248 :         stats->pages_deleted++;
    1412        1248 :         stats->pages_free++;
    1413             :     }
    1414       24318 :     else if (P_ISDELETED(opaque))
    1415             :     {
    1416             :         /*
    1417             :          * Already deleted page (which could be leaf or internal).  Can't
    1418             :          * recycle yet.
    1419             :          */
    1420         218 :         stats->pages_deleted++;
    1421             :     }
    1422       24100 :     else if (P_ISHALFDEAD(opaque))
    1423             :     {
    1424             :         /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
    1425           0 :         attempt_pagedel = true;
    1426             : 
    1427             :         /*
    1428             :          * _bt_pagedel() will increment both pages_newly_deleted and
    1429             :          * pages_deleted stats in all cases (barring corruption)
    1430             :          */
    1431             :     }
    1432       24100 :     else if (P_ISLEAF(opaque))
    1433             :     {
    1434             :         OffsetNumber deletable[MaxIndexTuplesPerPage];
    1435             :         int         ndeletable;
    1436             :         BTVacuumPosting updatable[MaxIndexTuplesPerPage];
    1437             :         int         nupdatable;
    1438             :         OffsetNumber offnum,
    1439             :                     minoff,
    1440             :                     maxoff;
    1441             :         int         nhtidsdead,
    1442             :                     nhtidslive;
    1443             : 
    1444             :         /*
    1445             :          * Trade in the initial read lock for a full cleanup lock on this
    1446             :          * page.  We must get such a lock on every leaf page over the course
    1447             :          * of the vacuum scan, whether or not it actually contains any
    1448             :          * deletable tuples --- see nbtree/README.
    1449             :          */
    1450       22550 :         _bt_upgradelockbufcleanup(rel, buf);
    1451             : 
    1452             :         /*
    1453             :          * Check whether we need to backtrack to earlier pages.  What we are
    1454             :          * concerned about is a page split that happened since we started the
    1455             :          * vacuum scan.  If the split moved tuples on the right half of the
    1456             :          * split (i.e. the tuples that sort high) to a block that we already
    1457             :          * passed over, then we might have missed the tuples.  We need to
    1458             :          * backtrack now.  (Must do this before possibly clearing btpo_cycleid
    1459             :          * or deleting scanblkno page below!)
    1460             :          */
    1461       22550 :         if (vstate->cycleid != 0 &&
    1462       22406 :             opaque->btpo_cycleid == vstate->cycleid &&
    1463           8 :             !(opaque->btpo_flags & BTP_SPLIT_END) &&
    1464           4 :             !P_RIGHTMOST(opaque) &&
    1465           4 :             opaque->btpo_next < scanblkno)
    1466           0 :             backtrack_to = opaque->btpo_next;
    1467             : 
    1468       22550 :         ndeletable = 0;
    1469       22550 :         nupdatable = 0;
    1470       22550 :         minoff = P_FIRSTDATAKEY(opaque);
    1471       22550 :         maxoff = PageGetMaxOffsetNumber(page);
    1472       22550 :         nhtidsdead = 0;
    1473       22550 :         nhtidslive = 0;
    1474       22550 :         if (callback)
    1475             :         {
    1476             :             /* btbulkdelete callback tells us what to delete (or update) */
    1477     4428582 :             for (offnum = minoff;
    1478             :                  offnum <= maxoff;
    1479     4406176 :                  offnum = OffsetNumberNext(offnum))
    1480             :             {
    1481             :                 IndexTuple  itup;
    1482             : 
    1483     4406176 :                 itup = (IndexTuple) PageGetItem(page,
    1484     4406176 :                                                 PageGetItemId(page, offnum));
    1485             : 
    1486             :                 Assert(!BTreeTupleIsPivot(itup));
    1487     4406176 :                 if (!BTreeTupleIsPosting(itup))
    1488             :                 {
    1489             :                     /* Regular tuple, standard table TID representation */
    1490     4264400 :                     if (callback(&itup->t_tid, callback_state))
    1491             :                     {
    1492     1605756 :                         deletable[ndeletable++] = offnum;
    1493     1605756 :                         nhtidsdead++;
    1494             :                     }
    1495             :                     else
    1496     2658644 :                         nhtidslive++;
    1497             :                 }
    1498             :                 else
    1499             :                 {
    1500             :                     BTVacuumPosting vacposting;
    1501             :                     int         nremaining;
    1502             : 
    1503             :                     /* Posting list tuple */
    1504      141776 :                     vacposting = btreevacuumposting(vstate, itup, offnum,
    1505             :                                                     &nremaining);
    1506      141776 :                     if (vacposting == NULL)
    1507             :                     {
    1508             :                         /*
    1509             :                          * All table TIDs from the posting tuple remain, so no
    1510             :                          * delete or update required
    1511             :                          */
    1512             :                         Assert(nremaining == BTreeTupleGetNPosting(itup));
    1513             :                     }
    1514       90636 :                     else if (nremaining > 0)
    1515             :                     {
    1517             :                         /*
    1518             :                          * Store metadata about posting list tuple in
    1519             :                          * updatable array for entire page.  Existing tuple
    1520             :                          * will be updated during the later call to
    1521             :                          * _bt_delitems_vacuum().
    1522             :                          */
    1523             :                         Assert(nremaining < BTreeTupleGetNPosting(itup));
    1524       39644 :                         updatable[nupdatable++] = vacposting;
    1525       39644 :                         nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
    1526             :                     }
    1527             :                     else
    1528             :                     {
    1529             :                         /*
    1530             :                          * All table TIDs from the posting list must be
    1531             :                          * deleted.  We'll delete the index tuple completely
    1532             :                          * (no update required).
    1533             :                          */
    1534             :                         Assert(nremaining == 0);
    1535       50992 :                         deletable[ndeletable++] = offnum;
    1536       50992 :                         nhtidsdead += BTreeTupleGetNPosting(itup);
    1537       50992 :                         pfree(vacposting);
    1538             :                     }
    1539             : 
    1540      141776 :                     nhtidslive += nremaining;
    1541             :                 }
    1542             :             }
    1543             :         }
    1544             : 
    1545             :         /*
    1546             :          * Apply any needed deletes or updates.  We issue just one
    1547             :          * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
    1548             :          */
    1549       22550 :         if (ndeletable > 0 || nupdatable > 0)
    1550             :         {
    1551             :             Assert(nhtidsdead >= ndeletable + nupdatable);
    1552       14672 :             _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
    1553             :                                 nupdatable);
    1554             : 
    1555       14672 :             stats->tuples_removed += nhtidsdead;
    1556             :             /* must recompute maxoff */
    1557       14672 :             maxoff = PageGetMaxOffsetNumber(page);
    1558             : 
    1559             :             /* can't leak memory here */
    1560       54316 :             for (int i = 0; i < nupdatable; i++)
    1561       39644 :                 pfree(updatable[i]);
    1562             :         }
    1563             :         else
    1564             :         {
    1565             :             /*
    1566             :              * If the leaf page has been split during this vacuum cycle, it
    1567             :              * seems worth expending a write to clear btpo_cycleid even if we
    1568             :              * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
    1569             :              * takes care of this.)  This ensures we won't process the page
    1570             :              * again.
    1571             :              *
    1572             :              * We treat this like a hint-bit update because there's no need to
    1573             :              * WAL-log it.
    1574             :              */
    1575             :             Assert(nhtidsdead == 0);
    1576        7878 :             if (vstate->cycleid != 0 &&
    1577        7734 :                 opaque->btpo_cycleid == vstate->cycleid)
    1578             :             {
    1579           0 :                 opaque->btpo_cycleid = 0;
    1580           0 :                 MarkBufferDirtyHint(buf, true);
    1581             :             }
    1582             :         }
    1583             : 
    1584             :         /*
    1585             :          * If the leaf page is now empty, try to delete it; else count the
    1586             :          * live tuples (live table TIDs in posting lists are counted as
    1587             :          * separate live tuples).  We don't delete when backtracking, though,
    1588             :          * since that would require teaching _bt_pagedel() about backtracking
    1589             :          * (doesn't seem worth adding more complexity to deal with that).
    1590             :          *
    1591             :          * We don't count the number of live TIDs during cleanup-only calls to
    1592             :          * btvacuumscan (i.e. when callback is not set).  We count the number
    1593             :          * of index tuples directly instead.  This avoids the expense of
    1594             :          * directly examining all of the tuples on each page.  VACUUM will
     1595             :          * treat num_index_tuples as an estimate in the cleanup-only case,
     1596             :          * so it doesn't matter that this underestimates num_index_tuples
     1597             :          * significantly in some cases.
    1598             :          */
    1599       22550 :         if (minoff > maxoff)
    1600        5720 :             attempt_pagedel = (blkno == scanblkno);
    1601       16830 :         else if (callback)
    1602       16694 :             stats->num_index_tuples += nhtidslive;
    1603             :         else
    1604         136 :             stats->num_index_tuples += maxoff - minoff + 1;
    1605             : 
    1606             :         Assert(!attempt_pagedel || nhtidslive == 0);
    1607             :     }
    1608             : 
    1609       25566 :     if (attempt_pagedel)
    1610             :     {
    1611             :         MemoryContext oldcontext;
    1612             : 
    1613             :         /* Run pagedel in a temp context to avoid memory leakage */
    1614        5720 :         MemoryContextReset(vstate->pagedelcontext);
    1615        5720 :         oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
    1616             : 
    1617             :         /*
    1618             :          * _bt_pagedel maintains the bulk delete stats on our behalf;
    1619             :          * pages_newly_deleted and pages_deleted are likely to be incremented
    1620             :          * during call
    1621             :          */
    1622             :         Assert(blkno == scanblkno);
    1623        5720 :         _bt_pagedel(rel, buf, vstate);
    1624             : 
    1625        5720 :         MemoryContextSwitchTo(oldcontext);
    1626             :         /* pagedel released buffer, so we shouldn't */
    1627             :     }
    1628             :     else
    1629       19846 :         _bt_relbuf(rel, buf);
    1630             : 
    1631       25566 :     if (backtrack_to != P_NONE)
    1632             :     {
    1633           0 :         blkno = backtrack_to;
    1634             : 
    1635             :         /* check for vacuum delay while not holding any buffer lock */
    1636           0 :         vacuum_delay_point(false);
    1637             : 
    1638             :         /*
    1639             :          * We can't use _bt_getbuf() here because it always applies
    1640             :          * _bt_checkpage(), which will barf on an all-zero page. We want to
    1641             :          * recycle all-zero pages, not fail.  Also, we want to use a
    1642             :          * nondefault buffer access strategy.
    1643             :          */
    1644           0 :         buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
    1645             :                                  info->strategy);
    1646           0 :         goto backtrack;
    1647             :     }
    1648             : 
    1649       25566 :     return scanblkno;
    1650             : }
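
A concrete backtrack scenario, with hypothetical block numbers: suppose VACUUM's cycle ID is 41 and the scan is processing scanblkno 10 when it finds that the leaf's btpo_cycleid is also 41 and its btpo_next is 4. The page must have split after the scan started, and the right half of the split reused block 4, which the scan already passed over; backtrack_to is therefore set to 4, and once block 10 is finished the goto above re-reads block 4 via ReadBufferExtended() with the vacuum buffer access strategy.
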
    1651             : 
    1652             : /*
    1653             :  * btreevacuumposting --- determine TIDs still needed in posting list
    1654             :  *
    1655             :  * Returns metadata describing how to build replacement tuple without the TIDs
    1656             :  * that VACUUM needs to delete.  Returned value is NULL in the common case
    1657             :  * where no changes are needed to caller's posting list tuple (we avoid
    1658             :  * allocating memory here as an optimization).
    1659             :  *
    1660             :  * The number of TIDs that should remain in the posting list tuple is set for
    1661             :  * caller in *nremaining.
    1662             :  */
    1663             : static BTVacuumPosting
    1664      141776 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
    1665             :                    OffsetNumber updatedoffset, int *nremaining)
    1666             : {
    1667      141776 :     int         live = 0;
    1668      141776 :     int         nitem = BTreeTupleGetNPosting(posting);
    1669      141776 :     ItemPointer items = BTreeTupleGetPosting(posting);
    1670      141776 :     BTVacuumPosting vacposting = NULL;
    1671             : 
    1672      789962 :     for (int i = 0; i < nitem; i++)
    1673             :     {
    1674      648186 :         if (!vstate->callback(items + i, vstate->callback_state))
    1675             :         {
    1676             :             /* Live table TID */
    1677      333286 :             live++;
    1678             :         }
    1679      314900 :         else if (vacposting == NULL)
    1680             :         {
    1681             :             /*
    1682             :              * First dead table TID encountered.
    1683             :              *
    1684             :              * It's now clear that we need to delete one or more dead table
    1685             :              * TIDs, so start maintaining metadata describing how to update
    1686             :              * existing posting list tuple.
    1687             :              */
    1688       90636 :             vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
    1689             :                                 nitem * sizeof(uint16));
    1690             : 
    1691       90636 :             vacposting->itup = posting;
    1692       90636 :             vacposting->updatedoffset = updatedoffset;
    1693       90636 :             vacposting->ndeletedtids = 0;
    1694       90636 :             vacposting->deletetids[vacposting->ndeletedtids++] = i;
    1695             :         }
    1696             :         else
    1697             :         {
    1698             :             /* Second or subsequent dead table TID */
    1699      224264 :             vacposting->deletetids[vacposting->ndeletedtids++] = i;
    1700             :         }
    1701             :     }
    1702             : 
    1703      141776 :     *nremaining = live;
    1704      141776 :     return vacposting;
    1705             : }
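
A worked example with hypothetical numbers: given a posting list tuple with nitem = 5 whose TIDs at i = 1 and i = 3 are reported dead by the callback, this returns *nremaining = 3 and a vacposting with ndeletedtids = 2 and deletetids = {1, 3}. btvacuumpage() then queues that vacposting in its updatable[] array, and _bt_delitems_vacuum() rewrites the tuple with the three surviving TIDs. Had all five TIDs been dead, *nremaining would be 0 and the caller would delete the tuple outright instead.
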
    1706             : 
    1707             : /*
    1708             :  *  btcanreturn() -- Check whether btree indexes support index-only scans.
    1709             :  *
    1710             :  * btrees always do, so this is trivial.
    1711             :  */
    1712             : bool
    1713     1124834 : btcanreturn(Relation index, int attno)
    1714             : {
    1715     1124834 :     return true;
    1716             : }
    1717             : 
    1718             : /*
    1719             :  * btgettreeheight() -- Compute tree height for use by btcostestimate().
    1720             :  */
    1721             : int
    1722      718578 : btgettreeheight(Relation rel)
    1723             : {
    1724      718578 :     return _bt_getrootheight(rel);
    1725             : }
    1726             : 
    1727             : CompareType
    1728           0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
    1729             : {
    1730           0 :     switch (strategy)
    1731             :     {
    1732           0 :         case BTLessStrategyNumber:
    1733           0 :             return COMPARE_LT;
    1734           0 :         case BTLessEqualStrategyNumber:
    1735           0 :             return COMPARE_LE;
    1736           0 :         case BTEqualStrategyNumber:
    1737           0 :             return COMPARE_EQ;
    1738           0 :         case BTGreaterEqualStrategyNumber:
    1739           0 :             return COMPARE_GE;
    1740           0 :         case BTGreaterStrategyNumber:
    1741           0 :             return COMPARE_GT;
    1742           0 :         default:
    1743           0 :             return COMPARE_INVALID;
    1744             :     }
    1745             : }
    1746             : 
    1747             : StrategyNumber
    1748           0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
    1749             : {
    1750           0 :     switch (cmptype)
    1751             :     {
    1752           0 :         case COMPARE_LT:
    1753           0 :             return BTLessStrategyNumber;
    1754           0 :         case COMPARE_LE:
    1755           0 :             return BTLessEqualStrategyNumber;
    1756           0 :         case COMPARE_EQ:
    1757           0 :             return BTEqualStrategyNumber;
    1758           0 :         case COMPARE_GE:
    1759           0 :             return BTGreaterEqualStrategyNumber;
    1760           0 :         case COMPARE_GT:
    1761           0 :             return BTGreaterStrategyNumber;
    1762           0 :         default:
    1763           0 :             return InvalidStrategy;
    1764             :     }
    1765             : }
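
Taken together, the two translators form an exact round trip over the five btree strategies. A hedged usage sketch (the btree implementations above ignore their opfamily argument, so InvalidOid suffices here):

    Assert(bttranslatestrategy(BTLessStrategyNumber, InvalidOid) == COMPARE_LT);
    Assert(bttranslatecmptype(COMPARE_LT, InvalidOid) == BTLessStrategyNumber);

    /* out-of-range inputs map to the invalid sentinels */
    Assert(bttranslatestrategy(InvalidStrategy, InvalidOid) == COMPARE_INVALID);
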

Generated by: LCOV version 1.14