| author | Tom Lane | 2006-05-07 01:21:30 +0000 |
|---|---|---|
| committer | Tom Lane | 2006-05-07 01:21:30 +0000 |
| commit | 09cb5c0e7d6fbc9dee26dc429e4fc0f2a88e5272 (patch) | |
| tree | 650b81f13757ed3c99638c65329475e682337646 | /src/include/access |
| parent | 88d94a11bb2b0d3dac95325d8d6056e4bac8b4b8 (diff) | |
Rewrite btree index scans to work a page at a time in all cases (both
btgettuple and btgetmulti). This eliminates the problem of "re-finding" the
exact stopping point, since the stopping point is effectively always a page
boundary, and index items are never moved across pre-existing page boundaries.
A small penalty is that the keys_are_unique optimization is effectively
disabled (and, therefore, is removed in this patch), causing us to apply
_bt_checkkeys() to at least one more tuple than necessary when looking up a
unique key. However, the advantages for non-unique cases seem great enough to
accept this tradeoff. Aside from simplifying and (sometimes) speeding up the
indexscan code, this will allow us to reimplement btbulkdelete as a largely
sequential scan instead of index-order traversal, thereby significantly
reducing the cost of VACUUM. Those changes will come in a separate patch.
Original patch by Heikki Linnakangas, rework by Tom Lane.
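The control flow described above is easy to see in miniature. The following standalone sketch is illustrative only: `ScanPos`, `read_page`, and the fixed toy page array are stand-ins, not the actual nbtree code. It gathers every match on a page in one pass while the page is (conceptually) locked, then hands the saved items back one at a time, so the stopping point between calls is always a page boundary:

```c
#include <stdbool.h>
#include <stdio.h>

#define MAX_ITEMS_PER_PAGE 8	/* stand-in for MaxIndexTuplesPerPage */
#define NUM_PAGES 3

/* Stand-in for BTScanPosData: everything we saved about one page. */
typedef struct
{
	int		page;						/* which page we scanned */
	int		firstItem;					/* first valid slot in items[] */
	int		lastItem;					/* last valid slot in items[] */
	int		itemIndex;					/* cursor: next slot to return */
	int		items[MAX_ITEMS_PER_PAGE];	/* matching keys found on the page */
} ScanPos;

/* Toy "index": each page holds ascending keys. */
static const int index_pages[NUM_PAGES][MAX_ITEMS_PER_PAGE] = {
	{1, 2, 3, 5, 8, 13, 21, 34},
	{35, 36, 40, 41, 50, 55, 60, 70},
	{71, 72, 80, 90, 91, 92, 93, 99},
};

/*
 * Scan one whole page under a (conceptual) read lock, saving every key in
 * [lo, hi] into *pos.  Returns false if the page had no matches.  In the
 * real code this is where _bt_checkkeys() would be applied to each item,
 * and the read lock would be released before items go back to the caller.
 */
static bool
read_page(int page, int lo, int hi, ScanPos *pos)
{
	int		n = 0;

	pos->page = page;
	for (int off = 0; off < MAX_ITEMS_PER_PAGE; off++)
	{
		int		key = index_pages[page][off];

		if (key >= lo && key <= hi)
			pos->items[n++] = key;
	}
	pos->firstItem = 0;
	pos->lastItem = n - 1;
	pos->itemIndex = 0;
	return n > 0;
}

int
main(void)
{
	ScanPos pos;

	/* Find all keys in [30, 75]: one page at a time, scanning forward. */
	for (int page = 0; page < NUM_PAGES; page++)
	{
		if (!read_page(page, 30, 75, &pos))
			continue;
		/* Return saved items without touching the page again. */
		while (pos.itemIndex <= pos.lastItem)
			printf("match: %d (page %d)\n",
				   pos.items[pos.itemIndex++], pos.page);
	}
	return 0;
}
```

Because the scan only remembers positions in its own saved array, nothing breaks if items on the underlying page move around after the lock is released; that is precisely the "re-finding" problem the commit message says this rewrite eliminates.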
Diffstat (limited to 'src/include/access')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/include/access/itup.h | 13 |
| -rw-r--r-- | src/include/access/nbtree.h | 86 |
| -rw-r--r-- | src/include/access/relscan.h | 18 |
3 files changed, 84 insertions, 33 deletions
diff --git a/src/include/access/itup.h b/src/include/access/itup.h
index 26e489af6b..d93451a28f 100644
--- a/src/include/access/itup.h
+++ b/src/include/access/itup.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/itup.h,v 1.45 2006/03/05 15:58:53 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/itup.h,v 1.46 2006/05/07 01:21:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -126,6 +126,17 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap;
 	) \
 )
+/*
+ * MaxIndexTuplesPerPage is an upper bound on the number of tuples that can
+ * fit on one index page.  An index tuple must have either data or a null
+ * bitmap, so we can safely assume it's at least 1 byte bigger than a bare
+ * IndexTupleData struct.  We arrive at the divisor because each tuple
+ * must be maxaligned, and it must have an associated item pointer.
+ */
+#define MaxIndexTuplesPerPage \
+	((int) ((BLCKSZ - offsetof(PageHeaderData, pd_linp)) / \
+			(MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))))
+
 
 
 /* routines in indextuple.c */
 extern IndexTuple index_form_tuple(TupleDesc tupleDescriptor,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 4b254f1fd1..1831c66216 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.96 2006/04/13 03:53:05 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.97 2006/05/07 01:21:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -336,30 +336,82 @@ typedef struct BTStackData
 typedef BTStackData *BTStack;
 
 /*
- * BTScanOpaqueData is used to remember which buffers we're currently
- * examining in an indexscan.  Between calls to btgettuple or btgetmulti,
- * we keep these buffers pinned (but not locked, see nbtree.c) to avoid
- * doing a ReadBuffer() for every tuple in the index.
+ * BTScanOpaqueData is the btree-private state needed for an indexscan.
+ * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
+ * details of the preprocessing), information about the current location
+ * of the scan, and information about the marked location, if any.  (We use
+ * BTScanPosData to represent the data needed for each of current and marked
+ * locations.)  In addition we can remember some known-killed index entries
+ * that must be marked before we can move off the current page.
  *
- * We also store preprocessed versions of the scan keys in this structure.
- * See _bt_preprocess_keys() for details of the preprocessing.
+ * Index scans work a page at a time: we pin and read-lock the page, identify
+ * all the matching items on the page and save them in BTScanPosData, then
+ * release the read-lock while returning the items to the caller for
+ * processing.  This approach minimizes lock/unlock traffic.  Note that we
+ * keep the pin on the index page until the caller is done with all the items
+ * (this is needed for VACUUM synchronization, see nbtree/README).  When we
+ * are ready to step to the next page, if the caller has told us any of the
+ * items were killed, we re-lock the page to mark them killed, then unlock.
+ * Finally we drop the pin and step to the next page in the appropriate
+ * direction.
  *
- * curHeapIptr & mrkHeapIptr are heap iptr-s from current/marked
- * index tuples: we don't adjust scans on insertions - instead we
- * use these pointers to restore index scan positions...
- *		- vadim 07/29/98
+ * NOTE: in this implementation, btree does not use or set the
+ * currentItemData and currentMarkData fields of IndexScanDesc at all.
  */
 
+typedef struct BTScanPosItem	/* what we remember about each match */
+{
+	ItemPointerData heapTid;	/* TID of referenced heap item */
+	OffsetNumber indexOffset;	/* index item's location within page */
+} BTScanPosItem;
+
+typedef struct BTScanPosData
+{
+	Buffer		buf;			/* if valid, the buffer is pinned */
+
+	BlockNumber nextPage;		/* page's right link when we scanned it */
+
+	/*
+	 * moreLeft and moreRight track whether we think there may be matching
+	 * index entries to the left and right of the current page, respectively.
+	 * We can clear the appropriate one of these flags when _bt_checkkeys()
+	 * returns continuescan = false.
+	 */
+	bool		moreLeft;
+	bool		moreRight;
+
+	/*
+	 * The items array is always ordered in index order (ie, increasing
+	 * indexoffset).  When scanning backwards it is convenient to fill the
+	 * array back-to-front, so we start at the last slot and fill downwards.
+	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
+	 * itemIndex is a cursor showing which entry was last returned to caller.
+	 */
+	int			firstItem;		/* first valid index in items[] */
+	int			lastItem;		/* last valid index in items[] */
+	int			itemIndex;		/* current index in items[] */
+
+	BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+} BTScanPosData;
+
+typedef BTScanPosData *BTScanPos;
+
+#define BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf)
+
 typedef struct BTScanOpaqueData
 {
-	Buffer		btso_curbuf;
-	Buffer		btso_mrkbuf;
-	ItemPointerData curHeapIptr;
-	ItemPointerData mrkHeapIptr;
 	/* these fields are set by _bt_preprocess_keys(): */
 	bool		qual_ok;		/* false if qual can never be satisfied */
 	int			numberOfKeys;	/* number of preprocessed scan keys */
 	ScanKey		keyData;		/* array of preprocessed scan keys */
+
+	/* info about killed items if any (killedItems is NULL if never used) */
+	int		   *killedItems;	/* currPos.items indexes of killed items */
+	int			numKilled;		/* number of currently stored items */
+
+	/* keep these last in struct for efficiency */
+	BTScanPosData currPos;		/* current position data */
+	BTScanPosData markPos;		/* marked position, if any */
 } BTScanOpaqueData;
 
 typedef BTScanOpaqueData *BTScanOpaque;
@@ -424,9 +476,8 @@ extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
 			ScanKey scankey, bool nextkey);
 extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
 			Page page, OffsetNumber offnum);
-extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
-extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
 
 /*
@@ -440,6 +491,7 @@ extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, Page page,
 			  OffsetNumber offnum, ScanDirection dir,
 			  bool *continuescan);
+extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
 
 /*
  * prototypes for functions in nbtsort.c
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 09e977c065..e7c409ed0e 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.44 2006/03/05 15:58:53 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.45 2006/05/07 01:21:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -67,12 +67,9 @@ typedef struct IndexScanDescData
 	bool		kill_prior_tuple;		/* last-returned tuple is dead */
 	bool		ignore_killed_tuples;	/* do not return killed entries */
 
-	/* set by index AM if scan keys satisfy index's uniqueness constraint */
-	bool		keys_are_unique;
-
-	/* scan current state */
-	bool		got_tuple;		/* true after successful index_getnext */
+	/* index access method's private state */
 	void	   *opaque;			/* access-method-specific info */
+
 	/* these fields are used by some but not all AMs: */
 	ItemPointerData currentItemData;	/* current index pointer */
 	ItemPointerData currentMarkData;	/* marked position, if any */
@@ -85,15 +82,6 @@ typedef struct IndexScanDescData
 	Buffer		xs_cbuf;		/* current heap buffer in scan, if any */
 	/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
 
-	/*
-	 * If keys_are_unique and got_tuple are both true, we stop calling the
-	 * index AM; it is then necessary for index_getnext to keep track of the
-	 * logical scan position for itself.  It does that using unique_tuple_pos:
-	 * -1 = before row, 0 = on row, +1 = after row.
-	 */
-	int			unique_tuple_pos;		/* logical position */
-	int			unique_tuple_mark;		/* logical marked position */
-
 	PgStat_Info xs_pgstat_info; /* statistics collector hook */
 } IndexScanDescData;
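For a sense of scale of the new fixed-size `items[]` array in BTScanPosData, one can plug typical numbers into the MaxIndexTuplesPerPage formula added to itup.h. The sketch below mirrors the macro with hard-coded stand-ins; the page-header size, index-tuple size, and alignment are assumptions for a typical 2006-era 64-bit build, not values taken from the headers in this diff:

```c
#include <stdio.h>

/* Illustrative stand-ins; real values come from the PostgreSQL headers. */
#define BLCKSZ				8192	/* default page size */
#define PAGE_HEADER_SIZE	20		/* assumed offsetof(PageHeaderData, pd_linp) */
#define MAXALIGN(x)			(((x) + 7) & ~7)	/* 8-byte maxalign assumed */
#define INDEX_TUPLE_SIZE	8		/* assumed sizeof(IndexTupleData): 6-byte TID + 2-byte t_info */
#define ITEM_ID_SIZE		4		/* sizeof(ItemIdData): one line pointer */

int
main(void)
{
	/* Mirrors the MaxIndexTuplesPerPage formula from the patch. */
	int		max_tuples = (BLCKSZ - PAGE_HEADER_SIZE) /
		(MAXALIGN(INDEX_TUPLE_SIZE + 1) + ITEM_ID_SIZE);

	/* (8192 - 20) / (16 + 4) = 408 slots under these assumptions */
	printf("MaxIndexTuplesPerPage = %d\n", max_tuples);
	return 0;
}
```

At roughly 400 slots of 8 bytes each (a 6-byte heap TID plus a 2-byte offset per BTScanPosItem), each BTScanPosData lands in the 3 KB range; with both currPos and markPos embedded, that is presumably why the patch comments say to keep those fields last in BTScanOpaqueData.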
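The killedItems/numKilled fields support the same batching idea as the scan itself: dead entries reported by the caller are merely noted, and the page is re-locked once, just before the scan leaves it, to mark them all. Here is a toy illustration of that deferral; all names are hypothetical stand-ins, not the _bt_killitems implementation (which, in btrees of this era, sets the LP_DELETE flag on the line pointers):

```c
#include <stdio.h>

#define MAX_ITEMS 8

/* Hypothetical stand-ins for BTScanOpaqueData's kill bookkeeping. */
typedef struct
{
	int		items[MAX_ITEMS];		/* saved matches from the current page */
	int		lastItem;
	int		killedItems[MAX_ITEMS];	/* indexes into items[] reported dead */
	int		numKilled;
} ScanState;

/* Caller reports that the item at index 'idx' turned out to be dead. */
static void
remember_killed(ScanState *s, int idx)
{
	/* Just record it; no page access and no lock taken here. */
	s->killedItems[s->numKilled++] = idx;
}

/* Before stepping off the page: take the lock once, mark all, unlock. */
static void
flush_killed(ScanState *s)
{
	/* re-lock page (conceptually) */
	for (int i = 0; i < s->numKilled; i++)
		printf("marking item %d (key %d) dead\n",
			   s->killedItems[i], s->items[s->killedItems[i]]);
	/* unlock page */
	s->numKilled = 0;
}

int
main(void)
{
	ScanState s = {.items = {10, 20, 30, 40}, .lastItem = 3, .numKilled = 0};

	remember_killed(&s, 1);		/* caller found key 20 dead */
	remember_killed(&s, 3);		/* caller found key 40 dead */
	flush_killed(&s);			/* one relock covers both kills */
	return 0;
}
```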
