author     Tom Lane    2006-05-07 01:21:30 +0000
committer  Tom Lane    2006-05-07 01:21:30 +0000
commit     09cb5c0e7d6fbc9dee26dc429e4fc0f2a88e5272 (patch)
tree       650b81f13757ed3c99638c65329475e682337646 /src/include/access
parent     88d94a11bb2b0d3dac95325d8d6056e4bac8b4b8 (diff)
Rewrite btree index scans to work a page at a time in all cases (both
btgettuple and btgetmulti).  This eliminates the problem of "re-finding"
the exact stopping point, since the stopping point is effectively always
a page boundary, and index items are never moved across pre-existing
page boundaries.  A small penalty is that the keys_are_unique
optimization is effectively disabled (and, therefore, is removed in this
patch), causing us to apply _bt_checkkeys() to at least one more tuple
than necessary when looking up a unique key.  However, the advantages
for non-unique cases seem great enough to accept this tradeoff.

Aside from simplifying and (sometimes) speeding up the indexscan code,
this will allow us to reimplement btbulkdelete as a largely sequential
scan instead of index-order traversal, thereby significantly reducing
the cost of VACUUM.  Those changes will come in a separate patch.

Original patch by Heikki Linnakangas, rework by Tom Lane.
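As a rough illustration of the page-at-a-time approach (a hedged sketch,
not the committed code: the helper names collect_page_matches and
tuple_matches_quals are hypothetical stand-ins, and the real logic lives
in src/backend/access/nbtree/):

#include "postgres.h"
#include "access/itup.h"
#include "storage/bufpage.h"
#include "storage/off.h"

/* hypothetical stand-in for _bt_checkkeys(); assumed for this sketch */
extern bool tuple_matches_quals(IndexTuple itup);

/*
 * Collect every matching item on one read-locked page into items[].
 * Once this returns, the caller can release the read lock (keeping only
 * the pin) and hand back the saved TIDs one at a time; there is no need
 * to "re-find" a position inside the page afterward, because the only
 * position that must survive is the page boundary itself.
 */
static int
collect_page_matches(Page page, ItemPointerData *items, int max_items)
{
	OffsetNumber offnum;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	int          nmatches = 0;

	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff && nmatches < max_items;
		 offnum = OffsetNumberNext(offnum))
	{
		IndexTuple itup = (IndexTuple)
			PageGetItem(page, PageGetItemId(page, offnum));

		if (tuple_matches_quals(itup))
			items[nmatches++] = itup->t_tid;	/* save the heap TID */
	}
	return nmatches;
}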
Diffstat (limited to 'src/include/access')
-rw-r--r--  src/include/access/itup.h     13
-rw-r--r--  src/include/access/nbtree.h   86
-rw-r--r--  src/include/access/relscan.h  18
3 files changed, 84 insertions(+), 33 deletions(-)
diff --git a/src/include/access/itup.h b/src/include/access/itup.h
index 26e489af6b..d93451a28f 100644
--- a/src/include/access/itup.h
+++ b/src/include/access/itup.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/itup.h,v 1.45 2006/03/05 15:58:53 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/itup.h,v 1.46 2006/05/07 01:21:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -126,6 +126,17 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap;
) \
)
+/*
+ * MaxIndexTuplesPerPage is an upper bound on the number of tuples that can
+ * fit on one index page. An index tuple must have either data or a null
+ * bitmap, so we can safely assume it's at least 1 byte bigger than a bare
+ * IndexTupleData struct. We arrive at the divisor because each tuple
+ * must be maxaligned, and it must have an associated item pointer.
+ */
+#define MaxIndexTuplesPerPage \
+ ((int) ((BLCKSZ - offsetof(PageHeaderData, pd_linp)) / \
+ (MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))))
+
/* routines in indextuple.c */
extern IndexTuple index_form_tuple(TupleDesc tupleDescriptor,
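Plugging concrete numbers into the MaxIndexTuplesPerPage bound above
gives a feel for its magnitude.  This is illustrative only: it assumes
an 8192-byte BLCKSZ, 8-byte MAXALIGN, a roughly 24-byte page header, a
4-byte ItemIdData, and an 8-byte IndexTupleData (6-byte TID plus 2-byte
t_info), all of which are build-dependent; the macro's use of sizeof and
offsetof tracks the actual configuration automatically.

    MAXALIGN(sizeof(IndexTupleData) + 1)  =  MAXALIGN(9)  =  16
    16 + sizeof(ItemIdData)               =  16 + 4       =  20
    (8192 - 24) / 20                      =  408 tuples per page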
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 4b254f1fd1..1831c66216 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.96 2006/04/13 03:53:05 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.97 2006/05/07 01:21:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -336,30 +336,82 @@ typedef struct BTStackData
typedef BTStackData *BTStack;
/*
- * BTScanOpaqueData is used to remember which buffers we're currently
- * examining in an indexscan. Between calls to btgettuple or btgetmulti,
- * we keep these buffers pinned (but not locked, see nbtree.c) to avoid
- * doing a ReadBuffer() for every tuple in the index.
+ * BTScanOpaqueData is the btree-private state needed for an indexscan.
+ * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
+ * details of the preprocessing), information about the current location
+ * of the scan, and information about the marked location, if any. (We use
+ * BTScanPosData to represent the data needed for each of current and marked
+ * locations.) In addition we can remember some known-killed index entries
+ * that must be marked before we can move off the current page.
*
- * We also store preprocessed versions of the scan keys in this structure.
- * See _bt_preprocess_keys() for details of the preprocessing.
+ * Index scans work a page at a time: we pin and read-lock the page, identify
+ * all the matching items on the page and save them in BTScanPosData, then
+ * release the read-lock while returning the items to the caller for
+ * processing. This approach minimizes lock/unlock traffic. Note that we
+ * keep the pin on the index page until the caller is done with all the items
+ * (this is needed for VACUUM synchronization, see nbtree/README). When we
+ * are ready to step to the next page, if the caller has told us any of the
+ * items were killed, we re-lock the page to mark them killed, then unlock.
+ * Finally we drop the pin and step to the next page in the appropriate
+ * direction.
*
- * curHeapIptr & mrkHeapIptr are heap iptr-s from current/marked
- * index tuples: we don't adjust scans on insertions - instead we
- * use these pointers to restore index scan positions...
- * - vadim 07/29/98
+ * NOTE: in this implementation, btree does not use or set the
+ * currentItemData and currentMarkData fields of IndexScanDesc at all.
*/
+typedef struct BTScanPosItem /* what we remember about each match */
+{
+ ItemPointerData heapTid; /* TID of referenced heap item */
+ OffsetNumber indexOffset; /* index item's location within page */
+} BTScanPosItem;
+
+typedef struct BTScanPosData
+{
+ Buffer buf; /* if valid, the buffer is pinned */
+
+ BlockNumber nextPage; /* page's right link when we scanned it */
+
+ /*
+ * moreLeft and moreRight track whether we think there may be matching
+ * index entries to the left and right of the current page, respectively.
+ * We can clear the appropriate one of these flags when _bt_checkkeys()
+ * returns continuescan = false.
+ */
+ bool moreLeft;
+ bool moreRight;
+
+ /*
+ * The items array is always ordered in index order (ie, increasing
+ * indexoffset). When scanning backwards it is convenient to fill the
+ * array back-to-front, so we start at the last slot and fill downwards.
+ * Hence we need both a first-valid-entry and a last-valid-entry counter.
+ * itemIndex is a cursor showing which entry was last returned to caller.
+ */
+ int firstItem; /* first valid index in items[] */
+ int lastItem; /* last valid index in items[] */
+ int itemIndex; /* current index in items[] */
+
+ BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+} BTScanPosData;
+
+typedef BTScanPosData *BTScanPos;
+
+#define BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf)
+
typedef struct BTScanOpaqueData
{
- Buffer btso_curbuf;
- Buffer btso_mrkbuf;
- ItemPointerData curHeapIptr;
- ItemPointerData mrkHeapIptr;
/* these fields are set by _bt_preprocess_keys(): */
bool qual_ok; /* false if qual can never be satisfied */
int numberOfKeys; /* number of preprocessed scan keys */
ScanKey keyData; /* array of preprocessed scan keys */
+
+ /* info about killed items if any (killedItems is NULL if never used) */
+ int *killedItems; /* currPos.items indexes of killed items */
+ int numKilled; /* number of currently stored items */
+
+ /* keep these last in struct for efficiency */
+ BTScanPosData currPos; /* current position data */
+ BTScanPosData markPos; /* marked position, if any */
} BTScanOpaqueData;
typedef BTScanOpaqueData *BTScanOpaque;
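One plausible reading of the firstItem/lastItem/itemIndex cursor, as a
hedged sketch only (the real stepping logic is in nbtsearch.c; here
itemIndex is assumed to start one slot outside the valid range in the
scan direction, and ScanDirection comes from access/sdir.h):

static bool
sketch_next_item(BTScanPos pos, ScanDirection dir, BTScanPosItem *out)
{
	if (ScanDirectionIsForward(dir))
	{
		/* assumes itemIndex was initialized to firstItem - 1 */
		if (pos->itemIndex >= pos->lastItem)
			return false;	/* page exhausted: follow nextPage right */
		pos->itemIndex++;
	}
	else
	{
		/* assumes itemIndex was initialized to lastItem + 1 */
		if (pos->itemIndex <= pos->firstItem)
			return false;	/* page exhausted: step left */
		pos->itemIndex--;
	}
	*out = pos->items[pos->itemIndex];
	return true;
}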
@@ -424,9 +476,8 @@ extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
Page page, OffsetNumber offnum);
-extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
-extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
/*
@@ -440,6 +491,7 @@ extern void _bt_preprocess_keys(IndexScanDesc scan);
extern bool _bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum,
ScanDirection dir, bool *continuescan);
+extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
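The killedItems/numKilled fields in BTScanOpaqueData pair with this new
entry point.  The caller-side shape is presumably something like the
following hedged sketch (remember_dead_tuple is a hypothetical name; the
actual call sites are in nbtree.c):

static void
remember_dead_tuple(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* allocate lazily: most scans never kill anything */
	if (so->killedItems == NULL)
		so->killedItems = (int *)
			palloc(MaxIndexTuplesPerPage * sizeof(int));
	if (so->numKilled < MaxIndexTuplesPerPage)
		so->killedItems[so->numKilled++] = so->currPos.itemIndex;
}

Before the scan steps off the current page, a numKilled > 0 check would
then trigger _bt_killitems(scan, false) to re-acquire the read lock on
the still-pinned page and mark the remembered entries, per the comment
added to BTScanOpaqueData above.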
/*
* prototypes for functions in nbtsort.c
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 09e977c065..e7c409ed0e 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.44 2006/03/05 15:58:53 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.45 2006/05/07 01:21:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -67,12 +67,9 @@ typedef struct IndexScanDescData
bool kill_prior_tuple; /* last-returned tuple is dead */
bool ignore_killed_tuples; /* do not return killed entries */
- /* set by index AM if scan keys satisfy index's uniqueness constraint */
- bool keys_are_unique;
-
- /* scan current state */
- bool got_tuple; /* true after successful index_getnext */
+ /* index access method's private state */
void *opaque; /* access-method-specific info */
+ /* these fields are used by some but not all AMs: */
ItemPointerData currentItemData; /* current index pointer */
ItemPointerData currentMarkData; /* marked position, if any */
@@ -85,15 +82,6 @@ typedef struct IndexScanDescData
Buffer xs_cbuf; /* current heap buffer in scan, if any */
/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
- /*
- * If keys_are_unique and got_tuple are both true, we stop calling the
- * index AM; it is then necessary for index_getnext to keep track of the
- * logical scan position for itself. It does that using unique_tuple_pos:
- * -1 = before row, 0 = on row, +1 = after row.
- */
- int unique_tuple_pos; /* logical position */
- int unique_tuple_mark; /* logical marked position */
-
PgStat_Info xs_pgstat_info; /* statistics collector hook */
} IndexScanDescData;