Adjust INCLUDE index truncation comments and code.

author Teodor Sigaev <teodor@sigaev.ru>

Thu, 19 Apr 2018 05:45:58 +0000 (08:45 +0300)

committer Teodor Sigaev <teodor@sigaev.ru>

Thu, 19 Apr 2018 05:45:58 +0000 (08:45 +0300)
author Teodor Sigaev <teodor@sigaev.ru>
Thu, 19 Apr 2018 05:45:58 +0000 (08:45 +0300)
committer Teodor Sigaev <teodor@sigaev.ru>
Thu, 19 Apr 2018 05:45:58 +0000 (08:45 +0300)
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c

index be0206d58ed1658a903374577320d6a874febefe..1a605f9944223107004d0739670b8fce88672616 100644 (file)
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -698,6 +698,9 @@ nextpage:
   *      "real" data item on the page to the right (if such a first item is
   *      available).
   *
+ * - That tuples report that they have the expected number of attributes.
+ *      INCLUDE index pivot tuples should not contain non-key attributes.
+ *
   * Furthermore, when state passed shows ShareLock held, and target page is
   * internal page, function also checks:
   *
@@ -722,43 +725,35 @@ bt_target_page_check(BtreeCheckState *state)
         elog(DEBUG2, "verifying %u items on %s block %u", max,
                  P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);
  
-
-       /* Check the number of attributes in high key if any */
-       if (!P_RIGHTMOST(topaque))
+       /*
+        * Check the number of attributes in high key.  Note that a rightmost page
+        * doesn't contain a high key, so there is nothing to check.
+        */
+       if (!P_RIGHTMOST(topaque) &&
+               !_bt_check_natts(state->rel, state->target, P_HIKEY))
         {
-               if (!_bt_check_natts(state->rel, state->target, P_HIKEY))
-               {
-                       ItemId          itemid;
-                       IndexTuple      itup;
-                       char       *itid,
-                                          *htid;
+               ItemId          itemid;
+               IndexTuple      itup;
  
-                       itemid = PageGetItemId(state->target, P_HIKEY);
-                       itup = (IndexTuple) PageGetItem(state->target, itemid);
-                       itid = psprintf("(%u,%u)", state->targetblock, P_HIKEY);
-                       htid = psprintf("(%u,%u)",
-                                                       ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
-                                                       ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+               itemid = PageGetItemId(state->target, P_HIKEY);
+               itup = (IndexTuple) PageGetItem(state->target, itemid);
  
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_INDEX_CORRUPTED),
-                                        errmsg("wrong number of index tuple attributes for index \"%s\"",
-                                                       RelationGetRelationName(state->rel)),
-                                        errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.",
-                                                                               itid,
-                                                                               BTreeTupGetNAtts(itup, state->rel),
-                                                                               P_ISLEAF(topaque) ? "heap" : "index",
-                                                                               htid,
-                                                                               (uint32) (state->targetlsn >> 32),
-                                                                               (uint32) state->targetlsn)));
-               }
+               ereport(ERROR,
+                               (errcode(ERRCODE_INDEX_CORRUPTED),
+                                errmsg("wrong number of high key index tuple attributes in index \"%s\"",
+                                               RelationGetRelationName(state->rel)),
+                                errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%X.",
+                                                                       state->targetblock,
+                                                                       BTreeTupleGetNAtts(itup, state->rel),
+                                                                       P_ISLEAF(topaque) ? "heap" : "index",
+                                                                       (uint32) (state->targetlsn >> 32),
+                                                                       (uint32) state->targetlsn)));
         }
  
-
         /*
          * Loop over page items, starting from first non-highkey item, not high
-        * key (if any).  Also, immediately skip "negative infinity" real item (if
-        * any).
+        * key (if any).  Most tests are not performed for the "negative infinity"
+        * real item (if any).
          */
         for (offset = P_FIRSTDATAKEY(topaque);
                  offset <= max;
@@ -791,7 +786,7 @@ bt_target_page_check(BtreeCheckState *state)
                                                                                 tupsize, ItemIdGetLength(itemid),
                                                                                 (uint32) (state->targetlsn >> 32),
                                                                                 (uint32) state->targetlsn),
-                                        errhint("This could be a torn page problem")));
+                                        errhint("This could be a torn page problem.")));
  
                 /* Check the number of index tuple attributes */
                 if (!_bt_check_natts(state->rel, state->target, offset))
@@ -806,11 +801,11 @@ bt_target_page_check(BtreeCheckState *state)
  
                         ereport(ERROR,
                                         (errcode(ERRCODE_INDEX_CORRUPTED),
-                                        errmsg("wrong number of index tuple attributes for index \"%s\"",
+                                        errmsg("wrong number of index tuple attributes in index \"%s\"",
                                                         RelationGetRelationName(state->rel)),
                                          errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.",
                                                                                 itid,
-                                                                               BTreeTupGetNAtts(itup, state->rel),
+                                                                               BTreeTupleGetNAtts(itup, state->rel),
                                                                                 P_ISLEAF(topaque) ? "heap" : "index",
                                                                                 htid,
                                                                                 (uint32) (state->targetlsn >> 32),
@@ -818,8 +813,8 @@ bt_target_page_check(BtreeCheckState *state)
                 }
  
                 /*
-                * Don't try to generate scankey using "negative infinity" garbage
-                * data on internal pages
+                * Don't try to generate scankey using "negative infinity" item on
+                * internal pages.  It is always truncated to zero attributes.
                  */
                 if (offset_is_negative_infinity(topaque, offset))
                         continue;
@@ -1430,9 +1425,9 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
          * infinity item is either first or second line item, or there is none
          * within page.
          *
-        * "Negative infinity" tuple is a special corner case of pivot tuples,
-        * it has zero attributes while rest of pivot tuples have nkeyatts number
-        * of attributes.
+        * Negative infinity items are a special case among pivot tuples.  They
+        * always have zero attributes, while all other pivot tuples always have
+        * nkeyatts attributes.
          *
          * Right-most pages don't have a high key, but could be said to
          * conceptually have a "positive infinity" high key.  Thus, there is a
diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c

index 9b3e0a2e6ea29f562e3a0be821fcaa3ae2d0b642..ca690e522f757d0f3fb4835597124eb6e4fa6b58 100644 (file)
--- a/src/backend/access/common/indextuple.c
+++ b/src/backend/access/common/indextuple.c
@@ -19,7 +19,6 @@
  #include "access/heapam.h"
  #include "access/itup.h"
  #include "access/tuptoaster.h"
-#include "utils/rel.h"
  
  
  /* ----------------------------------------------------------------
@@ -32,6 +31,9 @@
   *
   *             This shouldn't leak any memory; otherwise, callers such as
   *             tuplesort_putindextuplevalues() will be very unhappy.
+ *
+ *             This shouldn't perform external table access provided caller
+ *             does not pass values that are stored EXTERNAL.
   * ----------------
   */
  IndexTuple
@@ -448,30 +450,49 @@ CopyIndexTuple(IndexTuple source)
  }
  
  /*
- * Truncate tailing attributes from given index tuple leaving it with
- * new_indnatts number of attributes.
+ * Create a palloc'd copy of an index tuple, leaving only the first
+ * leavenatts attributes remaining.
+ *
+ * Truncation is guaranteed to result in an index tuple that is no
+ * larger than the original.  It is safe to use the IndexTuple with
+ * the original tuple descriptor, but caller must avoid actually
+ * accessing truncated attributes from returned tuple!  In practice
+ * this means that index_getattr() must be called with special care,
+ * and that the truncated tuple should only ever be accessed by code
+ * under caller's direct control.
+ *
+ * It's safe to call this function with a buffer lock held, since it
+ * never performs external table access.  If it ever became possible
+ * for index tuples to contain EXTERNAL TOAST values, then this would
+ * have to be revisited.
   */
  IndexTuple
-index_truncate_tuple(TupleDesc tupleDescriptor, IndexTuple olditup,
-                                        int new_indnatts)
+index_truncate_tuple(TupleDesc sourceDescriptor, IndexTuple source,
+                                        int leavenatts)
  {
-       TupleDesc       itupdesc = CreateTupleDescCopyConstr(tupleDescriptor);
+       TupleDesc       truncdesc;
         Datum           values[INDEX_MAX_KEYS];
         bool            isnull[INDEX_MAX_KEYS];
-       IndexTuple      newitup;
+       IndexTuple      truncated;
  
-       Assert(tupleDescriptor->natts <= INDEX_MAX_KEYS);
-       Assert(new_indnatts > 0);
-       Assert(new_indnatts < tupleDescriptor->natts);
+       Assert(leavenatts < sourceDescriptor->natts);
  
-       index_deform_tuple(olditup, tupleDescriptor, values, isnull);
+       /* Create temporary descriptor to scribble on */
+       truncdesc = palloc(TupleDescSize(sourceDescriptor));
+       TupleDescCopy(truncdesc, sourceDescriptor);
+       truncdesc->natts = leavenatts;
  
-       /* form new tuple that will contain only key attributes */
-       itupdesc->natts = new_indnatts;
-       newitup = index_form_tuple(itupdesc, values, isnull);
-       newitup->t_tid = olditup->t_tid;
+       /* Deform, form copy of tuple with fewer attributes */
+       index_deform_tuple(source, truncdesc, values, isnull);
+       truncated = index_form_tuple(truncdesc, values, isnull);
+       truncated->t_tid = source->t_tid;
+       Assert(IndexTupleSize(truncated) <= IndexTupleSize(source));
+
+       /*
+        * Cannot leak memory here: TupleDescCopy() doesn't allocate any inner
+        * structure, so a plain pfree() releases all allocated memory.
+        */
+       pfree(truncdesc);
  
-       FreeTupleDesc(itupdesc);
-       Assert(IndexTupleSize(newitup) <= IndexTupleSize(olditup));
-       return newitup;
+       return truncated;
  }
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c

index 10509cfe8a55a9bc2071847631e14795f4bfdc03..dbd5c9238c5299ca8a9de5d00721671c1f628528 100644 (file)
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -84,7 +84,7 @@ static void _bt_checksplitloc(FindSplitData *state,
                                   int dataitemstoleft, Size firstoldonrightsz);
  static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
                          OffsetNumber itup_off);
-static bool _bt_isequal(Relation idxrel, Page page, OffsetNumber offnum,
+static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
                         int keysz, ScanKey scankey);
  static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
  
@@ -343,6 +343,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
                                  IndexUniqueCheck checkUnique, bool *is_unique,
                                  uint32 *speculativeToken)
  {
+       TupleDesc       itupdesc = RelationGetDescr(rel);
         int                     indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
         SnapshotData SnapshotDirty;
         OffsetNumber maxoff;
@@ -402,7 +403,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
                                  * in real comparison, but only for ordering/finding items on
                                  * pages. - vadim 03/24/97
                                  */
-                               if (!_bt_isequal(rel, page, offset, indnkeyatts, itup_scankey))
+                               if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey))
                                         break;          /* we're past all the equal tuples */
  
                                 /* okay, we gotta fetch the heap tuple ... */
@@ -566,7 +567,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
                         /* If scankey == hikey we gotta check the next page too */
                         if (P_RIGHTMOST(opaque))
                                 break;
-                       if (!_bt_isequal(rel, page, P_HIKEY,
+                       if (!_bt_isequal(itupdesc, page, P_HIKEY,
                                                          indnkeyatts, itup_scankey))
                                 break;
                         /* Advance to next non-dead page --- there must be one */
@@ -849,6 +850,13 @@ _bt_insertonpg(Relation rel,
  
         /* child buffer must be given iff inserting on an internal page */
         Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
+       /* tuple must have appropriate number of attributes */
+       Assert(!P_ISLEAF(lpageop) ||
+                  BTreeTupleGetNAtts(itup, rel) ==
+                  IndexRelationGetNumberOfAttributes(rel));
+       Assert(P_ISLEAF(lpageop) ||
+                  BTreeTupleGetNAtts(itup, rel) ==
+                  IndexRelationGetNumberOfKeyAttributes(rel));
  
         /* The caller should've finished any incomplete splits already. */
         if (P_INCOMPLETE_SPLIT(lpageop))
@@ -956,6 +964,18 @@ _bt_insertonpg(Relation rel,
                         }
                 }
  
+               /*
+                * Every internal page should have exactly one negative infinity item
+                * at all times.  Only _bt_split() and _bt_newroot() should add items
+                * that become negative infinity items through truncation, since
+                * they're the only routines that allocate new internal pages.  Do not
+                * allow a retail insertion of a new item at the negative infinity
+                * offset.
+                */
+               if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
+                       elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
+                                itup_blkno, RelationGetRelationName(rel));
+
                 /* Do the update.  No ereport(ERROR) until changes are logged */
                 START_CRIT_SECTION();
  
@@ -1002,7 +1022,6 @@ _bt_insertonpg(Relation rel,
                         xl_btree_metadata xlmeta;
                         uint8           xlinfo;
                         XLogRecPtr      recptr;
-                       IndexTupleData trunctuple;
  
                         xlrec.offnum = itup_off;
  
@@ -1038,17 +1057,8 @@ _bt_insertonpg(Relation rel,
                                 xlinfo = XLOG_BTREE_INSERT_META;
                         }
  
-                       /* Read comments in _bt_pgaddtup */
                         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
-                       if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
-                       {
-                               trunctuple = *itup;
-                               trunctuple.t_info = sizeof(IndexTupleData);
-                               XLogRegisterBufData(0, (char *) &trunctuple,
-                                                                       sizeof(IndexTupleData));
-                       }
-                       else
-                               XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
+                       XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
  
                         recptr = XLogInsert(RM_BTREE_ID, xlinfo);
  
@@ -1203,6 +1213,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
                 itemid = PageGetItemId(origpage, P_HIKEY);
                 itemsz = ItemIdGetLength(itemid);
                 item = (IndexTuple) PageGetItem(origpage, itemid);
+               Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
                 if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
                                                 false, false) == InvalidOffsetNumber)
                 {
@@ -1235,20 +1246,25 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
         }
  
         /*
-        * We must truncate included attributes of the "high key" item, before
-        * insert it onto the leaf page.  It's the only point in insertion
-        * process, where we perform truncation.  All other functions work with
-        * this high key and do not change it.
+        * Truncate non-key (INCLUDE) attributes of the high key item before
+        * inserting it on the left page.  This only needs to happen at the leaf
+        * level, since in general all pivot tuple values originate from leaf
+        * level high keys.  This isn't just about avoiding unnecessary work,
+        * though; truncating unneeded key attributes (more aggressive suffix
+        * truncation) can only be performed at the leaf level anyway.  This is
+        * because a pivot tuple in a grandparent page must guide a search not
+        * only to the correct parent page, but also to the correct leaf page.
          */
         if (indnatts != indnkeyatts && isleaf)
         {
-               lefthikey = _bt_truncate_tuple(rel, item);
+               lefthikey = _bt_nonkey_truncate(rel, item);
                 itemsz = IndexTupleSize(lefthikey);
                 itemsz = MAXALIGN(itemsz);
         }
         else
                 lefthikey = item;
  
+       Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
         if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
                                         false, false) == InvalidOffsetNumber)
         {
@@ -1258,6 +1274,9 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
                          origpagenumber, RelationGetRelationName(rel));
         }
         leftoff = OffsetNumberNext(leftoff);
+       /* be tidy */
+       if (lefthikey != item)
+               pfree(lefthikey);
  
         /*
          * Now transfer all the data items to the appropriate page.
@@ -2143,7 +2162,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
         left_item = (IndexTuple) palloc(left_item_sz);
         left_item->t_info = left_item_sz;
         BTreeInnerTupleSetDownLink(left_item, lbkno);
-       BTreeTupSetNAtts(left_item, 0);
+       BTreeTupleSetNAtts(left_item, 0);
  
         /*
          * Create downlink item for right page.  The key for it is obtained from
@@ -2180,6 +2199,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
          * Note: we *must* insert the two items in item-number order, for the
          * benefit of _bt_restore_page().
          */
+       Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
         if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
                                         false, false) == InvalidOffsetNumber)
                 elog(PANIC, "failed to add leftkey to new root page"
@@ -2189,6 +2209,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
         /*
          * insert the right page pointer into the new root page.
          */
+       Assert(BTreeTupleGetNAtts(right_item, rel) ==
+                  IndexRelationGetNumberOfKeyAttributes(rel));
         if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
                                         false, false) == InvalidOffsetNumber)
                 elog(PANIC, "failed to add rightkey to new root page"
@@ -2284,7 +2306,7 @@ _bt_pgaddtup(Page page,
         {
                 trunctuple = *itup;
                 trunctuple.t_info = sizeof(IndexTupleData);
-               BTreeTupSetNAtts(&trunctuple, 0);
+               BTreeTupleSetNAtts(&trunctuple, 0);
                 itup = &trunctuple;
                 itemsize = sizeof(IndexTupleData);
         }
@@ -2303,10 +2325,9 @@ _bt_pgaddtup(Page page,
   * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
   */
  static bool
-_bt_isequal(Relation idxrel, Page page, OffsetNumber offnum,
+_bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
                         int keysz, ScanKey scankey)
  {
-       TupleDesc       itupdesc = RelationGetDescr(idxrel);
         IndexTuple      itup;
         int                     i;
  
@@ -2316,16 +2337,11 @@ _bt_isequal(Relation idxrel, Page page, OffsetNumber offnum,
         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
  
         /*
-        * Index tuple shouldn't be truncated.  Despite we technically could
-        * compare truncated tuple as well, this function should be only called
-        * for regular non-truncated leaf tuples and P_HIKEY tuple on
-        * rightmost leaf page.
+        * It's okay that we might perform a comparison against a truncated page
+        * high key when caller needs to determine if _bt_check_unique scan must
+        * continue on to the next page.  Caller never asks us to compare non-key
+        * attributes within an INCLUDE index.
          */
-       Assert((P_RIGHTMOST((BTPageOpaque) PageGetSpecialPointer(page)) ||
-                               offnum != P_HIKEY)
-                  ?  BTreeTupGetNAtts(itup, idxrel) == itupdesc->natts
-                  : true);
-
         for (i = 1; i <= keysz; i++)
         {
                 AttrNumber      attno;
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c

index ba6892591274b46af2c3f7d824a0b05f3636fb8f..beef089ba86a5777c7ac519dd871543941ce5c77 100644 (file)
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1605,6 +1605,8 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
                 ItemPointerSetBlockNumber(&trunctuple.t_tid, target);
         else
                 ItemPointerSetInvalid(&trunctuple.t_tid);
+       BTreeTupleSetNAtts(&trunctuple, 0);
+
         if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
                                         false, false) == InvalidOffsetNumber)
                 elog(ERROR, "could not add dummy high key to half-dead page");
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c

index 4c6fdcdd8aa79719e7c77f82f3bf73c1e4572278..0bcfa10b8647d66845a60d68094482551275aa8a 100644 (file)
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -154,7 +154,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
                  * We need to save the location of the index entry we chose in the
                  * parent page on a stack. In case we split the tree, we'll use the
                  * stack to work back up to the parent page.  We also save the actual
-                * downlink (TID) to uniquely identify the index entry, in case it
+                * downlink (block) to uniquely identify the index entry, in case it
                  * moves right while we're working lower in the tree.  See the paper
                  * by Lehman and Yao for how this is detected and handled. (We use the
                  * child link to disambiguate duplicate keys in the index -- Lehman
@@ -436,14 +436,7 @@ _bt_compare(Relation rel,
         IndexTuple      itup;
         int                     i;
  
-       /*
-        * Check tuple has correct number of attributes.
-        */
-       if (unlikely(!_bt_check_natts(rel, page, offnum)))
-               ereport(ERROR,
-                               (errcode(ERRCODE_INTERNAL_ERROR),
-                                errmsg("tuple has wrong number of attributes in index \"%s\"",
-                                               RelationGetRelationName(rel))));
+       Assert(_bt_check_natts(rel, page, offnum));
  
         /*
          * Force result ">" if target item is first data item on an internal page
@@ -1968,51 +1961,3 @@ _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
         so->numKilled = 0;                      /* just paranoia */
         so->markItemIndex = -1;         /* ditto */
  }
-
-/*
- * Check if index tuple have appropriate number of attributes.
- */
-bool
-_bt_check_natts(Relation index, Page page, OffsetNumber offnum)
-{
-       int16           natts = IndexRelationGetNumberOfAttributes(index);
-       int16           nkeyatts = IndexRelationGetNumberOfKeyAttributes(index);
-       ItemId          itemid;
-       IndexTuple      itup;
-       BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-       /*
-        * Assert that mask allocated for number of keys in index tuple can fit
-        * maximum number of index keys.
-        */
-       StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS,
-                                        "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
-
-       itemid = PageGetItemId(page, offnum);
-       itup = (IndexTuple) PageGetItem(page, itemid);
-
-       if (P_ISLEAF(opaque) && offnum >= P_FIRSTDATAKEY(opaque))
-       {
-               /*
-                * Regular leaf tuples have as every index attributes
-                */
-               return (BTreeTupGetNAtts(itup, index) == natts);
-       }
-       else if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
-       {
-               /*
-                * Leftmost tuples on non-leaf pages have no attributes, or haven't
-                * INDEX_ALT_TID_MASK set in pg_upgraded indexes.
-                */
-               return (BTreeTupGetNAtts(itup, index) == 0 ||
-                               ((itup->t_info & INDEX_ALT_TID_MASK) == 0));
-       }
-       else
-       {
-               /*
-                * Pivot tuples stored in non-leaf pages and hikeys of leaf pages
-                * contain only key attributes
-                */
-               return (BTreeTupGetNAtts(itup, index) == nkeyatts);
-       }
-}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c

index feba5e1c8fb62fc4f835f7beae51ff1a48251181..7deda9acac928da035bf0443d9cdc8c07833bdfd 100644 (file)
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -752,7 +752,7 @@ _bt_sortaddtup(Page page,
         {
                 trunctuple = *itup;
                 trunctuple.t_info = sizeof(IndexTupleData);
-               BTreeTupSetNAtts(&trunctuple, 0);
+               BTreeTupleSetNAtts(&trunctuple, 0);
                 itup = &trunctuple;
                 itemsize = sizeof(IndexTupleData);
         }
@@ -790,7 +790,9 @@ _bt_sortaddtup(Page page,
   * placeholder for the pointer to the "high key" item; when we have
   * filled up the page, we will set linp0 to point to itemN and clear
   * linpN.  On the other hand, if we find this is the last (rightmost)
- * page, we leave the items alone and slide the linp array over.
+ * page, we leave the items alone and slide the linp array over.  If
+ * the high key is to be truncated, offset 1 is deleted, and we insert
+ * the truncated high key at offset 1.
   *
   * 'last' pointer indicates the last offset added to the page.
   *----------
@@ -803,7 +805,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
         OffsetNumber last_off;
         Size            pgspc;
         Size            itupsz;
-       BTPageOpaque pageop;
         int                     indnatts = IndexRelationGetNumberOfAttributes(wstate->index);
         int                     indnkeyatts = IndexRelationGetNumberOfKeyAttributes(wstate->index);
  
@@ -860,7 +861,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
                 ItemId          ii;
                 ItemId          hii;
                 IndexTuple      oitup;
-               IndexTuple      keytup;
                 BTPageOpaque opageop = (BTPageOpaque) PageGetSpecialPointer(opage);
  
                 /* Create new page of same level */
@@ -891,25 +891,38 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
  
                 if (indnkeyatts != indnatts && P_ISLEAF(opageop))
                 {
+                       IndexTuple      truncated;
+                       Size            truncsz;
+
                         /*
-                        * We truncate included attributes of high key here.  Subsequent
-                        * insertions assume that hikey is already truncated, and so they
-                        * need not worry about it, when copying the high key into the
-                        * parent page as a downlink.
+                        * Truncate any non-key attributes from high key on leaf level
+                        * (i.e. truncate on leaf level if we're building an INCLUDE
+                        * index).  This is only done at the leaf level because
+                        * downlinks in internal pages are either negative infinity
+                        * items, or get their contents from copying from one level
+                        * down.  See also: _bt_split().
+                        *
+                        * Since the truncated tuple is probably smaller than the
+                        * original, it cannot just be copied in place (besides, we want
+                        * to actually save space on the leaf page).  We delete the
+                        * original high key, and add our own truncated high key at the
+                        * same offset.
                          *
-                        * The code above have just rearranged item pointers, but it
-                        * didn't save any space.  In order to save the space on page we
-                        * have to truly shift index tuples on the page.  But that's not
-                        * so bad for performance, because we operating pd_upper and don't
-                        * have to shift much of tuples memory.  Shift of ItemId's is
-                        * rather cheap, because they are small.
+                        * Note that the page layout won't be changed very much.  oitup
+                        * is already located at the physical beginning of tuple space,
+                        * so we only shift the line pointer array back and forth, and
+                        * overwrite the latter portion of the space occupied by the
+                        * original tuple.  This is fairly cheap.
                          */
-                       keytup = _bt_truncate_tuple(wstate->index, oitup);
-
-                       /* delete "wrong" high key, insert keytup as P_HIKEY. */
+                       truncated = _bt_nonkey_truncate(wstate->index, oitup);
+                       truncsz = IndexTupleSize(truncated);
                         PageIndexTupleDelete(opage, P_HIKEY);
+                       _bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
+                       pfree(truncated);
  
-                       _bt_sortaddtup(opage, IndexTupleSize(keytup), keytup, P_HIKEY);
+                       /* oitup should continue to point to the page's high key */
+                       hii = PageGetItemId(opage, P_HIKEY);
+                       oitup = (IndexTuple) PageGetItem(opage, hii);
                 }
  
                 /*
@@ -920,7 +933,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
                 if (state->btps_next == NULL)
                         state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
  
-               Assert(state->btps_minkey != NULL);
+               Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) ==
+                          IndexRelationGetNumberOfKeyAttributes(wstate->index));
                 BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno);
                 _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
                 pfree(state->btps_minkey);
@@ -928,11 +942,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
                 /*
                  * Save a copy of the minimum key for the new page.  We have to copy
                  * it off the old page, not the new one, in case we are not at leaf
-                * level.  Despite oitup is already initialized, it's important to get
-                * high key from the page, since we could have replaced it with
-                * truncated copy.  See comment above.
+                * level.
                  */
-               oitup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, P_HIKEY));
                 state->btps_minkey = CopyIndexTuple(oitup);
  
                 /*
@@ -959,8 +970,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
                 last_off = P_FIRSTKEY;
         }
  
-       pageop = (BTPageOpaque) PageGetSpecialPointer(npage);
-
         /*
          * If the new item is the first for its page, stash a copy for later. Note
          * this will only happen for the first item on a level; on later pages,
@@ -969,14 +978,18 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
          */
         if (last_off == P_HIKEY)
         {
+               BTPageOpaque    npageop;
+
                 Assert(state->btps_minkey == NULL);
  
+               npageop = (BTPageOpaque) PageGetSpecialPointer(npage);
+
                 /*
                  * Truncate included attributes of the tuple that we're going to
                  * insert into the parent page as a downlink
                  */
-               if (indnkeyatts != indnatts && P_ISLEAF(pageop))
-                       state->btps_minkey = _bt_truncate_tuple(wstate->index, itup);
+               if (indnkeyatts != indnatts && P_ISLEAF(npageop))
+                       state->btps_minkey = _bt_nonkey_truncate(wstate->index, itup);
                 else
                         state->btps_minkey = CopyIndexTuple(itup);
         }
@@ -1030,7 +1043,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
                 }
                 else
                 {
-                       Assert(s->btps_minkey != NULL);
+                       Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) ==
+                                  IndexRelationGetNumberOfKeyAttributes(wstate->index));
                         BTreeInnerTupleSetDownLink(s->btps_minkey, blkno);
                         _bt_buildadd(wstate, s->btps_next, s->btps_minkey);
                         pfree(s->btps_minkey);
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c

index 76ffa6b0d47b00f02a8316c0e208ec5ad43ad99f..0cecbf8e389837bbfc1e489df3d2547bac81eb3b 100644 (file)
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -73,14 +73,14 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
         indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
         indoption = rel->rd_indoption;
  
-       Assert(indnkeyatts != 0);
+       Assert(indnkeyatts > 0);
         Assert(indnkeyatts <= indnatts);
-       Assert(BTreeTupGetNAtts(itup, rel) == indnatts ||
-                  BTreeTupGetNAtts(itup, rel) == indnkeyatts);
+       Assert(BTreeTupleGetNAtts(itup, rel) == indnatts ||
+                  BTreeTupleGetNAtts(itup, rel) == indnkeyatts);
  
         /*
-        * We'll execute search using ScanKey constructed on key columns. Non key
-        * (included) columns must be omitted.
+        * We'll execute search using scan key constructed on key columns. Non-key
+        * (INCLUDE index) columns are always omitted from scan keys.
          */
         skey = (ScanKey) palloc(indnkeyatts * sizeof(ScanKeyData));
  
@@ -1427,6 +1427,7 @@ _bt_checkkeys(IndexScanDesc scan,
                 bool            isNull;
                 Datum           test;
  
+               Assert(key->sk_attno <= BTreeTupleGetNAtts(tuple, scan->indexRelation));
                 /* row-comparison keys need special processing */
                 if (key->sk_flags & SK_ROW_HEADER)
                 {
@@ -2082,29 +2083,133 @@ btproperty(Oid index_oid, int attno,
  }
  
  /*
- *     _bt_truncate_tuple() -- remove non-key (INCLUDE) attributes from index
- *                                                     tuple.
+ *     _bt_nonkey_truncate() -- create tuple without non-key suffix attributes.
   *
- *     Transforms an ordinal B-tree leaf index tuple into pivot tuple to be used
- *     as hikey or non-leaf page tuple with downlink.  Note that t_tid offset
- *     will be overwritten in order to represent number of present tuple
- *     attributes.
+ * Returns truncated index tuple allocated in caller's memory context, with key
+ * attributes copied from caller's itup argument.  Currently, suffix truncation
+ * is only performed to create pivot tuples in INCLUDE indexes, but some day it
+ * could be generalized to remove suffix attributes after the first
+ * distinguishing key attribute.
+ *
+ * Truncated tuple is guaranteed to be no larger than the original, which is
+ * important for staying under the 1/3 of a page restriction on tuple size.
+ *
+ * Note that returned tuple's t_tid offset will hold the number of attributes
+ * present, so the original item pointer offset is not represented.  Caller
+ * should only change truncated tuple's downlink.
   */
  IndexTuple
-_bt_truncate_tuple(Relation idxrel, IndexTuple olditup)
+_bt_nonkey_truncate(Relation rel, IndexTuple itup)
  {
-       IndexTuple      newitup;
-       int                     nkeyattrs = IndexRelationGetNumberOfKeyAttributes(idxrel);
+       int                             nkeyattrs = IndexRelationGetNumberOfKeyAttributes(rel);
+       IndexTuple              truncated;
  
         /*
-        * We're assuming to truncate only regular leaf index tuples which have
-        * both key and non-key attributes.
+        * We should only ever truncate leaf index tuples, which must have both key
+        * and non-key attributes.  It's never okay to truncate a second time.
          */
-       Assert(BTreeTupGetNAtts(olditup, idxrel) == IndexRelationGetNumberOfAttributes(idxrel));
+       Assert(BTreeTupleGetNAtts(itup, rel) ==
+                  IndexRelationGetNumberOfAttributes(rel));
+
+       truncated = index_truncate_tuple(RelationGetDescr(rel), itup, nkeyattrs);
+       BTreeTupleSetNAtts(truncated, nkeyattrs);
  
-       newitup = index_truncate_tuple(RelationGetDescr(idxrel),
-                                                                  olditup, nkeyattrs);
-       BTreeTupSetNAtts(newitup, nkeyattrs);
+       return truncated;
+}
+
+/*
+ *  _bt_check_natts() -- Verify tuple has expected number of attributes.
+ *
+ * Returns value indicating if the expected number of attributes were found
+ * for a particular offset on page.  This can be used as a general purpose
+ * sanity check.
+ *
+ * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
+ * preferred to calling here.  That's usually more convenient, and is always
+ * more explicit.  Call here instead when offnum's tuple may be a negative
+ * infinity tuple that uses the pre-v11 on-disk representation, or when a low
+ * context check is appropriate.
+ */
+bool
+_bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
+{
+       int16                   natts = IndexRelationGetNumberOfAttributes(rel);
+       int16                   nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+       BTPageOpaque    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+       IndexTuple              itup;
  
-       return newitup;
+       /*
+        * We cannot reliably test a deleted or half-deleted page, since they have
+        * dummy high keys
+        */
+       if (P_IGNORE(opaque))
+               return true;
+
+       Assert(offnum >= FirstOffsetNumber &&
+                  offnum <= PageGetMaxOffsetNumber(page));
+       /*
+        * Mask allocated for number of keys in index tuple must be able to fit
+        * maximum possible number of index attributes
+        */
+       StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS,
+                                        "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
+
+       itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+
+       if (P_ISLEAF(opaque))
+       {
+               if (offnum >= P_FIRSTDATAKEY(opaque))
+               {
+                       /*
+                        * Leaf tuples that are not the page high key (non-pivot tuples)
+                        * should never be truncated
+                        */
+                       return BTreeTupleGetNAtts(itup, rel) == natts;
+               }
+               else
+               {
+                       /*
+                        * Rightmost page doesn't contain a page high key, so tuple was
+                        * checked above as ordinary leaf tuple
+                        */
+                       Assert(!P_RIGHTMOST(opaque));
+
+                       /* Page high key tuple contains only key attributes */
+                       return BTreeTupleGetNAtts(itup, rel) == nkeyatts;
+               }
+       }
+       else  /* !P_ISLEAF(opaque) */
+       {
+               if (offnum == P_FIRSTDATAKEY(opaque))
+               {
+                       /*
+                        * The first tuple on any internal page (possibly the first after
+                        * its high key) is its negative infinity tuple.  Negative infinity
+                        * tuples are always truncated to zero attributes.  They are a
+                        * particular kind of pivot tuple.
+                        *
+                        * The number of attributes won't be explicitly represented if the
+                        * negative infinity tuple was generated during a page split that
+                        * occurred with a version of Postgres before v11.  There must be a
+                        * problem when there is an explicit representation that is
+                        * non-zero, or when there is no explicit representation and the
+                        * tuple is evidently not a pre-pg_upgrade tuple.
+                        *
+                        * Prior to v11, downlinks always had P_HIKEY as their offset.  Use
+                        * that to decide if the tuple is a pre-v11 tuple.
+                        */
+                       return BTreeTupleGetNAtts(itup, rel) == 0 ||
+                                       ((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
+                                        ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
+               }
+               else
+               {
+                       /*
+                        * Tuple contains only key attributes, regardless of whether it
+                        * is the page high key or not
+                        */
+                       return BTreeTupleGetNAtts(itup, rel) == nkeyatts;
+               }
+
+       }
  }
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c

index 0986ef07cf32a530098f5a6368476d12579cab7a..fb8c769df9ac5fff4c0101f10ef1c0d458f49db6 100644 (file)
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -248,17 +248,16 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record)
  
         _bt_restore_page(rpage, datapos, datalen);
  
-       /* Non-leaf page should always have its high key logged. */
-       Assert(isleaf || lhighkey);
-
         /*
          * When the high key isn't present is the wal record, then we assume it to
-        * be equal to the first key on the right page.
+        * be equal to the first key on the right page.  It must be from the leaf
+        * level.
          */
         if (!lhighkey)
         {
                 ItemId          hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
  
+               Assert(isleaf);
                 left_hikey = (IndexTuple) PageGetItem(rpage, hiItemId);
                 left_hikeysz = ItemIdGetLength(hiItemId);
         }
@@ -620,7 +619,7 @@ btree_xlog_delete_get_latestRemovedXid(XLogReaderState *record)
                  * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
                  * Note that we are not looking at tuple data here, just headers.
                  */
-               hoffnum = ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid));
+               hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
                 hitemid = PageGetItemId(hpage, hoffnum);
  
                 /*
@@ -805,6 +804,8 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
                 ItemPointerSetBlockNumber(&trunctuple.t_tid, xlrec->topparent);
         else
                 ItemPointerSetInvalid(&trunctuple.t_tid);
+       BTreeTupleSetNAtts(&trunctuple, 0);
+
         if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
                                         false, false) == InvalidOffsetNumber)
                 elog(ERROR, "could not add dummy high key to half-dead page");
@@ -915,6 +916,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
                         ItemPointerSetBlockNumber(&trunctuple.t_tid, xlrec->topparent);
                 else
                         ItemPointerSetInvalid(&trunctuple.t_tid);
+               BTreeTupleSetNAtts(&trunctuple, 0);
+
                 if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
                                                 false, false) == InvalidOffsetNumber)
                         elog(ERROR, "could not add dummy high key to half-dead page");
diff --git a/src/include/access/itup.h b/src/include/access/itup.h

index 555434ca803cbef5ad6e2258ac655dd34414a72a..bd3a702380954bc69c9536876094249430cb7c72 100644 (file)
--- a/src/include/access/itup.h
+++ b/src/include/access/itup.h
@@ -155,7 +155,7 @@ extern Datum nocache_index_getattr(IndexTuple tup, int attnum,
  extern void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor,
                                    Datum *values, bool *isnull);
  extern IndexTuple CopyIndexTuple(IndexTuple source);
-extern IndexTuple index_truncate_tuple(TupleDesc tupleDescriptor,
-                                        IndexTuple olditup, int new_indnatts);
+extern IndexTuple index_truncate_tuple(TupleDesc sourceDescriptor,
+                                        IndexTuple source, int leavenatts);
  
  #endif                                                 /* ITUP_H */
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h

index 36619b220f10bc85119c41c922d7b7a6d35cab9f..7aa6afbbb8077a962c04c2418125a07a2425edc2 100644 (file)
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -186,59 +186,51 @@ typedef struct BTMetaPageData
  #define P_FIRSTKEY                     ((OffsetNumber) 2)
  #define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
  
-
  /*
- * B-tree index with INCLUDE clause has non-key (included) attributes, which
- * are used solely in index-only scans.  Those non-key attributes are present
- * in leaf index tuples which point to corresponding heap tuples.  However,
- * tree also contains "pivot" tuples.  Pivot tuples are used for navigation
- * during tree traversal.  Pivot tuples include tuples on non-leaf pages and
- * high key tuples.  Such, tuples don't need to included attributes, because
- * they have no use during tree traversal.  This is why we truncate them in
- * order to save some space.  Therefore, B-tree index with INCLUDE clause
- * contain tuples with variable number of attributes.
- *
- * In order to keep on-disk compatibility with upcoming suffix truncation of
- * pivot tuples, we store number of attributes present inside tuple itself.
- * Thankfully, offset number is always unused in pivot tuple.  So, we use free
- * bit of index tuple flags as sign that offset have alternative meaning: it
- * stores number of keys present in index tuple (12 bit is far enough for that).
- * And we have 4 bits reserved for future usage.
+ * INCLUDE B-Tree indexes have non-key attributes.  These are extra
+ * attributes that may be returned by index-only scans, but do not influence
+ * the order of items in the index (formally, non-key attributes are not
+ * considered to be part of the key space).  Non-key attributes are only
+ * present in leaf index tuples whose item pointers actually point to heap
+ * tuples.  All other types of index tuples (collectively, "pivot" tuples)
+ * only have key attributes, since pivot tuples only ever need to represent
+ * how the key space is separated.  In general, any B-Tree index that has
+ * more than one level (i.e. any index that does not just consist of a
+ * metapage and a single leaf root page) must have some number of pivot
+ * tuples, since pivot tuples are used for traversing the tree.
   *
- * Right now INDEX_ALT_TID_MASK is set only on truncation of non-key
- * attributes of included indexes.  But potentially every pivot index tuple
- * might have INDEX_ALT_TID_MASK set.  Then this tuple should have number of
- * attributes correctly set in BT_N_KEYS_OFFSET_MASK, and in future it might
- * use some bits of BT_RESERVED_OFFSET_MASK.
+ * We store the number of attributes present inside pivot tuples by abusing
+ * their item pointer offset field, since pivot tuples never need to store a
+ * real offset (downlinks only need to store a block number).  The offset
+ * field only stores the number of attributes when the INDEX_ALT_TID_MASK
+ * bit is set (we never assume that pivot tuples must explicitly store the
+ * number of attributes, and currently do not bother storing the number of
+ * attributes unless indnkeyatts actually differs from indnatts).
+ * INDEX_ALT_TID_MASK is only used for pivot tuples at present, though it's
+ * possible that it will be used within non-pivot tuples in the future.  Do
+ * not assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot
+ * tuple.
   *
- * Non-pivot tuples might also use bit of BT_RESERVED_OFFSET_MASK.  Despite
- * they store heap tuple offset, higher bits of offset are always free.
+ * The 12 least significant offset bits are used to represent the number of
+ * attributes in INDEX_ALT_TID_MASK tuples, leaving 4 bits that are reserved
+ * for future use (BT_RESERVED_OFFSET_MASK bits). BT_N_KEYS_OFFSET_MASK should
+ * be large enough to store any number <= INDEX_MAX_KEYS.
   */
-#define INDEX_ALT_TID_MASK             INDEX_AM_RESERVED_BIT   /* flag indicating t_tid
-                                                                                                                * offset has an
-                                                                                                                * alternative meaning */
-#define BT_RESERVED_OFFSET_MASK        0xF000  /* mask of bits in t_tid offset
-                                                                                * reserved for future usage */
-#define BT_N_KEYS_OFFSET_MASK  0x0FFF  /* mask of bits in t_tid offset
-                                                                                * holding number of attributes
-                                                                                * actually present in index tuple */
-
-/* Acess to downlink block number */
+#define INDEX_ALT_TID_MASK                     INDEX_AM_RESERVED_BIT
+#define BT_RESERVED_OFFSET_MASK                0xF000
+#define BT_N_KEYS_OFFSET_MASK          0x0FFF
+
+/* Get/set downlink block number */
  #define BTreeInnerTupleGetDownLink(itup) \
         ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
-
  #define BTreeInnerTupleSetDownLink(itup, blkno) \
         ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))
  
-/* Set number of attributes to B-tree index tuple overriding t_tid offset */
-#define BTreeTupSetNAtts(itup, n) \
-       do { \
-               (itup)->t_info |= INDEX_ALT_TID_MASK; \
-               ItemPointerSetOffsetNumber(&(itup)->t_tid, n); \
-       } while(0)
-
-/* Get number of attributes in B-tree index tuple */
-#define BTreeTupGetNAtts(itup, index)  \
+/*
+ * Get/set number of attributes within B-tree index tuple. Asserts should be
+ * removed once the BT_RESERVED_OFFSET_MASK bits are actually put to use.
+ */
+#define BTreeTupleGetNAtts(itup, rel)  \
         ( \
                 (itup)->t_info & INDEX_ALT_TID_MASK ? \
                 ( \
@@ -246,8 +238,14 @@ typedef struct BTMetaPageData
                         ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
                 ) \
                 : \
-               IndexRelationGetNumberOfAttributes(index) \
+               IndexRelationGetNumberOfAttributes(rel) \
         )
+#define BTreeTupleSetNAtts(itup, n) \
+       do { \
+               (itup)->t_info |= INDEX_ALT_TID_MASK; \
+               Assert(((n) & BT_RESERVED_OFFSET_MASK) == 0); \
+               ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
+       } while(0)
  
  /*
   *     Operator strategy numbers for B-tree have been moved to access/stratnum.h,
@@ -561,7 +559,6 @@ extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
  extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
  extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
                                  Snapshot snapshot);
-extern bool _bt_check_natts(Relation index, Page page, OffsetNumber offnum);
  
  /*
   * prototypes for functions in nbtutils.c
@@ -590,7 +587,8 @@ extern bytea *btoptions(Datum reloptions, bool validate);
  extern bool btproperty(Oid index_oid, int attno,
                    IndexAMProperty prop, const char *propname,
                    bool *res, bool *isnull);
-extern IndexTuple _bt_truncate_tuple(Relation idxrel, IndexTuple olditup);
+extern IndexTuple _bt_nonkey_truncate(Relation rel, IndexTuple itup);
+extern bool _bt_check_natts(Relation rel, Page page, OffsetNumber offnum);
  
  /*
   * prototypes for functions in nbtvalidate.c
diff --git a/src/test/regress/expected/index_including.out b/src/test/regress/expected/index_including.out

index 1d253ee77d636e6fdc4f39bc586eb6810b264a16..b7d1812ec5cd85d548aad927da617d6e362777a0 100644 (file)
--- a/src/test/regress/expected/index_including.out
+++ b/src/test/regress/expected/index_including.out
@@ -1,81 +1,78 @@
  /*
   * 1.test CREATE INDEX
+ *
+ * Deliberately avoid dropping objects in this section, to get some pg_dump
+ * coverage.
   */
  -- Regular index with included columns
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT x, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-CREATE INDEX tbl_idx ON tbl using btree (c1, c2) INCLUDE (c3,c4);
+CREATE TABLE tbl_include_reg (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_reg SELECT x, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+CREATE INDEX tbl_include_reg_idx ON tbl_include_reg using btree (c1, c2) INCLUDE (c3,c4);
  -- must fail because of intersection of key and included columns
-CREATE INDEX tbl_idx ON tbl using btree (c1, c2) INCLUDE (c1,c3);
+CREATE INDEX tbl_include_reg_idx ON tbl_include_reg using btree (c1, c2) INCLUDE (c1,c3);
  ERROR:  included columns must not intersect with key columns
  SELECT pg_get_indexdef(i.indexrelid)
  FROM pg_index i JOIN pg_class c ON i.indexrelid = c.oid
-WHERE i.indrelid = 'tbl'::regclass ORDER BY c.relname;
-                             pg_get_indexdef                              
---------------------------------------------------------------------------
- CREATE INDEX tbl_idx ON public.tbl USING btree (c1, c2) INCLUDE (c3, c4)
+WHERE i.indrelid = 'tbl_include_reg'::regclass ORDER BY c.relname;
+                                         pg_get_indexdef                                          
+--------------------------------------------------------------------------------------------------
+ CREATE INDEX tbl_include_reg_idx ON public.tbl_include_reg USING btree (c1, c2) INCLUDE (c3, c4)
  (1 row)
  
-DROP TABLE tbl;
  -- Unique index and unique constraint
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT x, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-CREATE UNIQUE INDEX tbl_idx_unique ON tbl using btree (c1, c2) INCLUDE (c3, c4);
-ALTER TABLE tbl add UNIQUE USING INDEX tbl_idx_unique;
-ALTER TABLE tbl add UNIQUE (c1, c2) INCLUDE (c3, c4);
+CREATE TABLE tbl_include_unique1 (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_unique1 SELECT x, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+CREATE UNIQUE INDEX tbl_include_unique1_idx_unique ON tbl_include_unique1 using btree (c1, c2) INCLUDE (c3, c4);
+ALTER TABLE tbl_include_unique1 add UNIQUE USING INDEX tbl_include_unique1_idx_unique;
+ALTER TABLE tbl_include_unique1 add UNIQUE (c1, c2) INCLUDE (c3, c4);
  SELECT pg_get_indexdef(i.indexrelid)
  FROM pg_index i JOIN pg_class c ON i.indexrelid = c.oid
-WHERE i.indrelid = 'tbl'::regclass ORDER BY c.relname;
-                                       pg_get_indexdef                                       
----------------------------------------------------------------------------------------------
- CREATE UNIQUE INDEX tbl_c1_c2_c3_c4_key ON public.tbl USING btree (c1, c2) INCLUDE (c3, c4)
- CREATE UNIQUE INDEX tbl_idx_unique ON public.tbl USING btree (c1, c2) INCLUDE (c3, c4)
+WHERE i.indrelid = 'tbl_include_unique1'::regclass ORDER BY c.relname;
+                                                       pg_get_indexdef                                                       
+-----------------------------------------------------------------------------------------------------------------------------
+ CREATE UNIQUE INDEX tbl_include_unique1_c1_c2_c3_c4_key ON public.tbl_include_unique1 USING btree (c1, c2) INCLUDE (c3, c4)
+ CREATE UNIQUE INDEX tbl_include_unique1_idx_unique ON public.tbl_include_unique1 USING btree (c1, c2) INCLUDE (c3, c4)
  (2 rows)
  
-DROP TABLE tbl;
  -- Unique index and unique constraint. Both must fail.
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT 1, 2, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-CREATE UNIQUE INDEX tbl_idx_unique ON tbl using btree (c1, c2) INCLUDE (c3, c4);
-ERROR:  could not create unique index "tbl_idx_unique"
+CREATE TABLE tbl_include_unique2 (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_unique2 SELECT 1, 2, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+CREATE UNIQUE INDEX tbl_include_unique2_idx_unique ON tbl_include_unique2 using btree (c1, c2) INCLUDE (c3, c4);
+ERROR:  could not create unique index "tbl_include_unique2_idx_unique"
  DETAIL:  Key (c1, c2)=(1, 2) is duplicated.
-ALTER TABLE tbl add UNIQUE (c1, c2) INCLUDE (c3, c4);
-ERROR:  could not create unique index "tbl_c1_c2_c3_c4_key"
+ALTER TABLE tbl_include_unique2 add UNIQUE (c1, c2) INCLUDE (c3, c4);
+ERROR:  could not create unique index "tbl_include_unique2_c1_c2_c3_c4_key"
  DETAIL:  Key (c1, c2)=(1, 2) is duplicated.
-DROP TABLE tbl;
  -- PK constraint
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT 1, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-ALTER TABLE tbl add PRIMARY KEY (c1, c2) INCLUDE (c3, c4);
+CREATE TABLE tbl_include_pk (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_pk SELECT 1, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+ALTER TABLE tbl_include_pk add PRIMARY KEY (c1, c2) INCLUDE (c3, c4);
  SELECT pg_get_indexdef(i.indexrelid)
  FROM pg_index i JOIN pg_class c ON i.indexrelid = c.oid
-WHERE i.indrelid = 'tbl'::regclass ORDER BY c.relname;
-                                 pg_get_indexdef                                  
-----------------------------------------------------------------------------------
- CREATE UNIQUE INDEX tbl_pkey ON public.tbl USING btree (c1, c2) INCLUDE (c3, c4)
+WHERE i.indrelid = 'tbl_include_pk'::regclass ORDER BY c.relname;
+                                            pg_get_indexdef                                             
+--------------------------------------------------------------------------------------------------------
+ CREATE UNIQUE INDEX tbl_include_pk_pkey ON public.tbl_include_pk USING btree (c1, c2) INCLUDE (c3, c4)
  (1 row)
  
-DROP TABLE tbl;
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT 1, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-CREATE UNIQUE INDEX tbl_idx_unique ON tbl using btree (c1, c2) INCLUDE (c3, c4);
-ALTER TABLE tbl add PRIMARY KEY USING INDEX tbl_idx_unique;
+CREATE TABLE tbl_include_box (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_box SELECT 1, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+CREATE UNIQUE INDEX tbl_include_box_idx_unique ON tbl_include_box using btree (c1, c2) INCLUDE (c3, c4);
+ALTER TABLE tbl_include_box add PRIMARY KEY USING INDEX tbl_include_box_idx_unique;
  SELECT pg_get_indexdef(i.indexrelid)
  FROM pg_index i JOIN pg_class c ON i.indexrelid = c.oid
-WHERE i.indrelid = 'tbl'::regclass ORDER BY c.relname;
-                                    pg_get_indexdef                                     
-----------------------------------------------------------------------------------------
- CREATE UNIQUE INDEX tbl_idx_unique ON public.tbl USING btree (c1, c2) INCLUDE (c3, c4)
+WHERE i.indrelid = 'tbl_include_box'::regclass ORDER BY c.relname;
+                                                pg_get_indexdef                                                 
+----------------------------------------------------------------------------------------------------------------
+ CREATE UNIQUE INDEX tbl_include_box_idx_unique ON public.tbl_include_box USING btree (c1, c2) INCLUDE (c3, c4)
  (1 row)
  
-DROP TABLE tbl;
  -- PK constraint. Must fail.
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT 1, 2, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-ALTER TABLE tbl add PRIMARY KEY (c1, c2) INCLUDE (c3, c4);
-ERROR:  could not create unique index "tbl_pkey"
+CREATE TABLE tbl_include_box_pk (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_box_pk SELECT 1, 2, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+ALTER TABLE tbl_include_box_pk add PRIMARY KEY (c1, c2) INCLUDE (c3, c4);
+ERROR:  could not create unique index "tbl_include_box_pk_pkey"
  DETAIL:  Key (c1, c2)=(1, 2) is duplicated.
-DROP TABLE tbl;
  /*
   * 2. Test CREATE TABLE with constraint
   */
diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out

index ac0fb539e98ff0f16c5a0d11bd66761e69230cd4..8afb1f2f7e3a77d521f9a3fd36c4166b50779235 100644 (file)
--- a/src/test/regress/expected/sanity_check.out
+++ b/src/test/regress/expected/sanity_check.out
@@ -185,6 +185,12 @@ sql_sizing|f
  sql_sizing_profiles|f
  stud_emp|f
  student|f
+tbl_include_box|t
+tbl_include_box_pk|f
+tbl_include_pk|t
+tbl_include_reg|t
+tbl_include_unique1|t
+tbl_include_unique2|f
  tenk1|t
  tenk2|t
  test_range_excl|t
diff --git a/src/test/regress/sql/index_including.sql b/src/test/regress/sql/index_including.sql

index caedc9866d3822fb5d9402b65582473eb7c5a7c9..f83b2c64ac62d0ff52c332a9f2c8141e714ad5fa 100644 (file)
--- a/src/test/regress/sql/index_including.sql
+++ b/src/test/regress/sql/index_including.sql
@@ -1,59 +1,56 @@
  /*
   * 1.test CREATE INDEX
+ *
+ * Deliberately avoid dropping objects in this section, to get some pg_dump
+ * coverage.
   */
  
  -- Regular index with included columns
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT x, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-CREATE INDEX tbl_idx ON tbl using btree (c1, c2) INCLUDE (c3,c4);
+CREATE TABLE tbl_include_reg (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_reg SELECT x, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+CREATE INDEX tbl_include_reg_idx ON tbl_include_reg using btree (c1, c2) INCLUDE (c3,c4);
  -- must fail because of intersection of key and included columns
-CREATE INDEX tbl_idx ON tbl using btree (c1, c2) INCLUDE (c1,c3);
+CREATE INDEX tbl_include_reg_idx ON tbl_include_reg using btree (c1, c2) INCLUDE (c1,c3);
  SELECT pg_get_indexdef(i.indexrelid)
  FROM pg_index i JOIN pg_class c ON i.indexrelid = c.oid
-WHERE i.indrelid = 'tbl'::regclass ORDER BY c.relname;
-DROP TABLE tbl;
+WHERE i.indrelid = 'tbl_include_reg'::regclass ORDER BY c.relname;
  
  -- Unique index and unique constraint
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT x, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-CREATE UNIQUE INDEX tbl_idx_unique ON tbl using btree (c1, c2) INCLUDE (c3, c4);
-ALTER TABLE tbl add UNIQUE USING INDEX tbl_idx_unique;
-ALTER TABLE tbl add UNIQUE (c1, c2) INCLUDE (c3, c4);
+CREATE TABLE tbl_include_unique1 (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_unique1 SELECT x, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+CREATE UNIQUE INDEX tbl_include_unique1_idx_unique ON tbl_include_unique1 using btree (c1, c2) INCLUDE (c3, c4);
+ALTER TABLE tbl_include_unique1 add UNIQUE USING INDEX tbl_include_unique1_idx_unique;
+ALTER TABLE tbl_include_unique1 add UNIQUE (c1, c2) INCLUDE (c3, c4);
  SELECT pg_get_indexdef(i.indexrelid)
  FROM pg_index i JOIN pg_class c ON i.indexrelid = c.oid
-WHERE i.indrelid = 'tbl'::regclass ORDER BY c.relname;
-DROP TABLE tbl;
+WHERE i.indrelid = 'tbl_include_unique1'::regclass ORDER BY c.relname;
  
  -- Unique index and unique constraint. Both must fail.
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT 1, 2, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-CREATE UNIQUE INDEX tbl_idx_unique ON tbl using btree (c1, c2) INCLUDE (c3, c4);
-ALTER TABLE tbl add UNIQUE (c1, c2) INCLUDE (c3, c4);
-DROP TABLE tbl;
+CREATE TABLE tbl_include_unique2 (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_unique2 SELECT 1, 2, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+CREATE UNIQUE INDEX tbl_include_unique2_idx_unique ON tbl_include_unique2 using btree (c1, c2) INCLUDE (c3, c4);
+ALTER TABLE tbl_include_unique2 add UNIQUE (c1, c2) INCLUDE (c3, c4);
  
  -- PK constraint
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT 1, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-ALTER TABLE tbl add PRIMARY KEY (c1, c2) INCLUDE (c3, c4);
+CREATE TABLE tbl_include_pk (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_pk SELECT 1, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+ALTER TABLE tbl_include_pk add PRIMARY KEY (c1, c2) INCLUDE (c3, c4);
  SELECT pg_get_indexdef(i.indexrelid)
  FROM pg_index i JOIN pg_class c ON i.indexrelid = c.oid
-WHERE i.indrelid = 'tbl'::regclass ORDER BY c.relname;
-DROP TABLE tbl;
+WHERE i.indrelid = 'tbl_include_pk'::regclass ORDER BY c.relname;
  
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT 1, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-CREATE UNIQUE INDEX tbl_idx_unique ON tbl using btree (c1, c2) INCLUDE (c3, c4);
-ALTER TABLE tbl add PRIMARY KEY USING INDEX tbl_idx_unique;
+CREATE TABLE tbl_include_box (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_box SELECT 1, 2*x, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+CREATE UNIQUE INDEX tbl_include_box_idx_unique ON tbl_include_box using btree (c1, c2) INCLUDE (c3, c4);
+ALTER TABLE tbl_include_box add PRIMARY KEY USING INDEX tbl_include_box_idx_unique;
  SELECT pg_get_indexdef(i.indexrelid)
  FROM pg_index i JOIN pg_class c ON i.indexrelid = c.oid
-WHERE i.indrelid = 'tbl'::regclass ORDER BY c.relname;
-DROP TABLE tbl;
+WHERE i.indrelid = 'tbl_include_box'::regclass ORDER BY c.relname;
  
  -- PK constraint. Must fail.
-CREATE TABLE tbl (c1 int, c2 int, c3 int, c4 box);
-INSERT INTO tbl SELECT 1, 2, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
-ALTER TABLE tbl add PRIMARY KEY (c1, c2) INCLUDE (c3, c4);
-DROP TABLE tbl;
+CREATE TABLE tbl_include_box_pk (c1 int, c2 int, c3 int, c4 box);
+INSERT INTO tbl_include_box_pk SELECT 1, 2, 3*x, box('4,4,4,4') FROM generate_series(1,10) AS x;
+ALTER TABLE tbl_include_box_pk add PRIMARY KEY (c1, c2) INCLUDE (c3, c4);
  
  
  /*
author	Teodor Sigaev <teodor@sigaev.ru>
	Thu, 19 Apr 2018 05:45:58 +0000 (08:45 +0300)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Thu, 19 Apr 2018 05:45:58 +0000 (08:45 +0300)
contrib/amcheck/verify_nbtree.c		patch \| blob \| blame \| history
src/backend/access/common/indextuple.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtinsert.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtpage.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtsearch.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtsort.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtutils.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtxlog.c		patch \| blob \| blame \| history
src/include/access/itup.h		patch \| blob \| blame \| history
src/include/access/nbtree.h		patch \| blob \| blame \| history
src/test/regress/expected/index_including.out		patch \| blob \| blame \| history
src/test/regress/expected/sanity_check.out		patch \| blob \| blame \| history
src/test/regress/sql/index_including.sql		patch \| blob \| blame \| history