Use full 64-bit XIDs in deleted nbtree pages.

author Peter Geoghegan <pg@bowt.ie>

Thu, 25 Feb 2021 02:41:34 +0000 (18:41 -0800)

committer Peter Geoghegan <pg@bowt.ie>

Thu, 25 Feb 2021 02:41:34 +0000 (18:41 -0800)
author Peter Geoghegan <pg@bowt.ie>
Thu, 25 Feb 2021 02:41:34 +0000 (18:41 -0800)
committer Peter Geoghegan <pg@bowt.ie>
Thu, 25 Feb 2021 02:41:34 +0000 (18:41 -0800)
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c

index a5a76278391ae8ad0bf955d16846aedacb3ea6a4..c4ca6339182729267c75712588894f93502a69d2 100644 (file)
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -769,7 +769,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
                                               P_FIRSTDATAKEY(opaque));
                 itup = (IndexTuple) PageGetItem(state->target, itemid);
                 nextleveldown.leftmost = BTreeTupleGetDownLink(itup);
-               nextleveldown.level = opaque->btpo.level - 1;
+               nextleveldown.level = opaque->btpo_level - 1;
             }
             else
             {
@@ -794,14 +794,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
         if (opaque->btpo_prev != leftcurrent)
             bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
  
-       /* Check level, which must be valid for non-ignorable page */
-       if (level.level != opaque->btpo.level)
+       /* Check level */
+       if (level.level != opaque->btpo_level)
             ereport(ERROR,
                     (errcode(ERRCODE_INDEX_CORRUPTED),
                      errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
                             RelationGetRelationName(state->rel)),
                      errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
-                                       current, level.level, opaque->btpo.level)));
+                                       current, level.level, opaque->btpo_level)));
  
         /* Verify invariants for page */
         bt_target_page_check(state);
@@ -1164,7 +1164,7 @@ bt_target_page_check(BtreeCheckState *state)
                 bt_child_highkey_check(state,
                                        offset,
                                        NULL,
-                                      topaque->btpo.level);
+                                      topaque->btpo_level);
             }
             continue;
         }
@@ -1520,7 +1520,7 @@ bt_target_page_check(BtreeCheckState *state)
     if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
     {
         bt_child_highkey_check(state, InvalidOffsetNumber,
-                              NULL, topaque->btpo.level);
+                              NULL, topaque->btpo_level);
     }
  }
  
@@ -1597,7 +1597,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
         ereport(DEBUG1,
                 (errcode(ERRCODE_NO_DATA),
                  errmsg_internal("level %u leftmost page of index \"%s\" was found deleted or half dead",
-                       opaque->btpo.level, RelationGetRelationName(state->rel)),
+                       opaque->btpo_level, RelationGetRelationName(state->rel)),
                  errdetail_internal("Deleted page found when building scankey from right sibling.")));
  
         /* Be slightly more pro-active in freeing this memory, just in case */
@@ -1900,14 +1900,15 @@ bt_child_highkey_check(BtreeCheckState *state,
                                         state->targetblock, blkno,
                                         LSN_FORMAT_ARGS(state->targetlsn))));
  
-       /* Check level for non-ignorable page */
-       if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1)
+       /* Do level sanity check */
+       if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) &&
+           opaque->btpo_level != target_level - 1)
             ereport(ERROR,
                     (errcode(ERRCODE_INDEX_CORRUPTED),
                      errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
                             RelationGetRelationName(state->rel)),
                      errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
-                                       blkno, target_level - 1, opaque->btpo.level)));
+                                       blkno, target_level - 1, opaque->btpo_level)));
  
         /* Try to detect circular links */
         if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
@@ -2132,7 +2133,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
      * check for downlink connectivity.
      */
     bt_child_highkey_check(state, downlinkoffnum,
-                          child, topaque->btpo.level);
+                          child, topaque->btpo_level);
  
     /*
      * Since there cannot be a concurrent VACUUM operation in readonly mode,
@@ -2275,7 +2276,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
                  errmsg_internal("harmless interrupted page split detected in index %s",
                         RelationGetRelationName(state->rel)),
                  errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
-                                   blkno, opaque->btpo.level,
+                                   blkno, opaque->btpo_level,
                                     opaque->btpo_prev,
                                     LSN_FORMAT_ARGS(pagelsn))));
         return;
@@ -2304,7 +2305,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
     elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
          RelationGetRelationName(state->rel));
  
-   level = opaque->btpo.level;
+   level = opaque->btpo_level;
     itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
     itup = (IndexTuple) PageGetItem(page, itemid);
     childblk = BTreeTupleGetDownLink(itup);
@@ -2319,16 +2320,16 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
             break;
  
         /* Do an extra sanity check in passing on internal pages */
-       if (copaque->btpo.level != level - 1)
+       if (copaque->btpo_level != level - 1)
             ereport(ERROR,
                     (errcode(ERRCODE_INDEX_CORRUPTED),
                      errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
                                      RelationGetRelationName(state->rel)),
                      errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
                                         blkno, childblk,
-                                       level - 1, copaque->btpo.level)));
+                                       level - 1, copaque->btpo_level)));
  
-       level = copaque->btpo.level;
+       level = copaque->btpo_level;
         itemid = PageGetItemIdCareful(state, childblk, child,
                                       P_FIRSTDATAKEY(copaque));
         itup = (IndexTuple) PageGetItem(child, itemid);
@@ -2389,7 +2390,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
              errmsg("internal index block lacks downlink in index \"%s\"",
                     RelationGetRelationName(state->rel)),
              errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
-                               blkno, opaque->btpo.level,
+                               blkno, opaque->btpo_level,
                                 LSN_FORMAT_ARGS(pagelsn))));
  }
  
@@ -2983,21 +2984,28 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
     }
  
     /*
-    * Deleted pages have no sane "level" field, so can only check non-deleted
-    * page level
+    * Deleted pages that still use the old 32-bit XID representation have no
+    * sane "level" field because they type pun the field, but all other pages
+    * (including pages deleted on Postgres 14+) have a valid value.
      */
-   if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0)
-       ereport(ERROR,
-               (errcode(ERRCODE_INDEX_CORRUPTED),
-                errmsg("invalid leaf page level %u for block %u in index \"%s\"",
-                       opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
+   if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque))
+   {
+       /* Okay, no reason not to trust btpo_level field from page */
  
-   if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
-       opaque->btpo.level == 0)
-       ereport(ERROR,
-               (errcode(ERRCODE_INDEX_CORRUPTED),
-                errmsg("invalid internal page level 0 for block %u in index \"%s\"",
-                       blocknum, RelationGetRelationName(state->rel))));
+       if (P_ISLEAF(opaque) && opaque->btpo_level != 0)
+           ereport(ERROR,
+                   (errcode(ERRCODE_INDEX_CORRUPTED),
+                    errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"",
+                                    opaque->btpo_level, blocknum,
+                                    RelationGetRelationName(state->rel))));
+
+       if (!P_ISLEAF(opaque) && opaque->btpo_level == 0)
+           ereport(ERROR,
+                   (errcode(ERRCODE_INDEX_CORRUPTED),
+                    errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"",
+                                    blocknum,
+                                    RelationGetRelationName(state->rel))));
+   }
  
     /*
      * Sanity checks for number of items on page.
@@ -3044,8 +3052,6 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
      * state.  This state is nonetheless treated as corruption by VACUUM on
      * from version 9.4 on, so do the same here.  See _bt_pagedel() for full
      * details.
-    *
-    * Internal pages should never have garbage items, either.
      */
     if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
         ereport(ERROR,
@@ -3054,11 +3060,27 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
                         blocknum, RelationGetRelationName(state->rel)),
                  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
  
+   /*
+    * Check that internal pages have no garbage items, and that no page has
+    * an invalid combination of deletion-related page-level flags
+    */
     if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
         ereport(ERROR,
                 (errcode(ERRCODE_INDEX_CORRUPTED),
-                errmsg("internal page block %u in index \"%s\" has garbage items",
-                       blocknum, RelationGetRelationName(state->rel))));
+                errmsg_internal("internal page block %u in index \"%s\" has garbage items",
+                                blocknum, RelationGetRelationName(state->rel))));
+
+   if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
+       ereport(ERROR,
+               (errcode(ERRCODE_INDEX_CORRUPTED),
+                errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"",
+                                blocknum, RelationGetRelationName(state->rel))));
+
+   if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
+       ereport(ERROR,
+               (errcode(ERRCODE_INDEX_CORRUPTED),
+                errmsg_internal("deleted page block %u in index \"%s\" is half-dead",
+                                blocknum, RelationGetRelationName(state->rel))));
  
     return page;
  }
diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c

index 8bb180bbbe0efa33f49233e6cb90442fbe6eefe1..b7725b572f0d103c1746abef961caa8e3cd7fd15 100644 (file)
--- a/contrib/pageinspect/btreefuncs.c
+++ b/contrib/pageinspect/btreefuncs.c
@@ -75,11 +75,7 @@ typedef struct BTPageStat
     /* opaque data */
     BlockNumber btpo_prev;
     BlockNumber btpo_next;
-   union
-   {
-       uint32      level;
-       TransactionId xact;
-   }           btpo;
+   uint32      btpo_level;
     uint16      btpo_flags;
     BTCycleId   btpo_cycleid;
  } BTPageStat;
@@ -112,9 +108,33 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
     /* page type (flags) */
     if (P_ISDELETED(opaque))
     {
-       stat->type = 'd';
-       stat->btpo.xact = opaque->btpo.xact;
-       return;
+       /* Deleted pages: 'D' for internal pages with a full XID, else 'd' */
+       if (P_ISLEAF(opaque) || !P_HAS_FULLXID(opaque))
+           stat->type = 'd';
+       else
+           stat->type = 'D';
+
+       /*
+        * Report safexid in a deleted page.
+        *
+        * Handle pg_upgrade'd deleted pages that used the previous safexid
+        * representation in btpo_level field (this used to be a union type
+        * called "btpo").
+        */
+       if (P_HAS_FULLXID(opaque))
+       {
+           FullTransactionId safexid = BTPageGetDeleteXid(page);
+
+           elog(NOTICE, "deleted page from block %u has safexid %u:%u",
+                blkno, EpochFromFullTransactionId(safexid),
+                XidFromFullTransactionId(safexid));
+       }
+       else
+           elog(NOTICE, "deleted page from block %u has safexid %u",
+                blkno, opaque->btpo_level);
+
+       /* Don't interpret BTDeletedPageData as index tuples */
+       maxoff = InvalidOffsetNumber;
     }
     else if (P_IGNORE(opaque))
         stat->type = 'e';
@@ -128,7 +148,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
     /* btpage opaque data */
     stat->btpo_prev = opaque->btpo_prev;
     stat->btpo_next = opaque->btpo_next;
-   stat->btpo.level = opaque->btpo.level;
+   stat->btpo_level = opaque->btpo_level;
     stat->btpo_flags = opaque->btpo_flags;
     stat->btpo_cycleid = opaque->btpo_cycleid;
  
@@ -237,7 +257,7 @@ bt_page_stats_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
     values[j++] = psprintf("%u", stat.free_size);
     values[j++] = psprintf("%u", stat.btpo_prev);
     values[j++] = psprintf("%u", stat.btpo_next);
-   values[j++] = psprintf("%u", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
+   values[j++] = psprintf("%u", stat.btpo_level);
     values[j++] = psprintf("%d", stat.btpo_flags);
  
     tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
@@ -503,10 +523,14 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
  
         opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
  
-       if (P_ISDELETED(opaque))
-           elog(NOTICE, "page is deleted");
-
-       fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+       if (!P_ISDELETED(opaque))
+           fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+       else
+       {
+           /* Don't interpret BTDeletedPageData as index tuples */
+           elog(NOTICE, "page from block " INT64_FORMAT " is deleted", blkno);
+           fctx->max_calls = 0;
+       }
         uargs->leafpage = P_ISLEAF(opaque);
         uargs->rightmost = P_RIGHTMOST(opaque);
  
@@ -603,7 +627,14 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
         if (P_ISDELETED(opaque))
             elog(NOTICE, "page is deleted");
  
-       fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+       if (!P_ISDELETED(opaque))
+           fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+       else
+       {
+           /* Don't interpret BTDeletedPageData as index tuples */
+           elog(NOTICE, "page from block is deleted");
+           fctx->max_calls = 0;
+       }
         uargs->leafpage = P_ISLEAF(opaque);
         uargs->rightmost = P_RIGHTMOST(opaque);
  
@@ -692,10 +723,7 @@ bt_metap(PG_FUNCTION_ARGS)
  
     /*
      * We need a kluge here to detect API versions prior to 1.8.  Earlier
-    * versions incorrectly used int4 for certain columns.  This caused
-    * various problems.  For example, an int4 version of the "oldest_xact"
-    * column would not work with TransactionId values that happened to exceed
-    * PG_INT32_MAX.
+    * versions incorrectly used int4 for certain columns.
      *
      * There is no way to reliably avoid the problems created by the old
      * function definition at this point, so insist that the user update the
@@ -723,7 +751,8 @@ bt_metap(PG_FUNCTION_ARGS)
      */
     if (metad->btm_version >= BTREE_NOVAC_VERSION)
     {
-       values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
+       values[j++] = psprintf(INT64_FORMAT,
+                              (int64) metad->btm_last_cleanup_num_delpages);
         values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
         values[j++] = metad->btm_allequalimage ? "t" : "f";
     }
diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out

index a7632be36a116bdbbe5bb78b5f7617d99d0e1bcf..c60bc88560ccab8837491dd4d9cba3065dc5a8da 100644 (file)
--- a/contrib/pageinspect/expected/btree.out
+++ b/contrib/pageinspect/expected/btree.out
@@ -3,16 +3,16 @@ INSERT INTO test1 VALUES (72057594037927937, 'text');
  CREATE INDEX test1_a_idx ON test1 USING btree (a);
  \x
  SELECT * FROM bt_metap('test1_a_idx');
--[ RECORD 1 ]-----------+-------
-magic                   | 340322
-version                 | 4
-root                    | 1
-level                   | 0
-fastroot                | 1
-fastlevel               | 0
-oldest_xact             | 0
-last_cleanup_num_tuples | -1
-allequalimage           | t
+-[ RECORD 1 ]-------------+-------
+magic                     | 340322
+version                   | 4
+root                      | 1
+level                     | 0
+fastroot                  | 1
+fastlevel                 | 0
+last_cleanup_num_delpages | 0
+last_cleanup_num_tuples   | -1
+allequalimage             | t
  
  SELECT * FROM bt_page_stats('test1_a_idx', -1);
  ERROR:  invalid block number
@@ -29,7 +29,7 @@ page_size     | 8192
  free_size     | 8128
  btpo_prev     | 0
  btpo_next     | 0
-btpo          | 0
+btpo_level    | 0
  btpo_flags    | 3
  
  SELECT * FROM bt_page_stats('test1_a_idx', 2);
diff --git a/contrib/pageinspect/pageinspect--1.8--1.9.sql b/contrib/pageinspect/pageinspect--1.8--1.9.sql

index 79a42a7b11e03caff64e67f80390a38b0559058f..be89a64ca14028561bbf8146f5e16baa016b1875 100644 (file)
--- a/contrib/pageinspect/pageinspect--1.8--1.9.sql
+++ b/contrib/pageinspect/pageinspect--1.8--1.9.sql
@@ -66,6 +66,23 @@ RETURNS smallint
  AS 'MODULE_PATHNAME', 'page_checksum_1_9'
  LANGUAGE C STRICT PARALLEL SAFE;
  
+--
+-- bt_metap()
+--
+DROP FUNCTION bt_metap(text);
+CREATE FUNCTION bt_metap(IN relname text,
+    OUT magic int4,
+    OUT version int4,
+    OUT root int8,
+    OUT level int8,
+    OUT fastroot int8,
+    OUT fastlevel int8,
+    OUT last_cleanup_num_delpages int8,
+    OUT last_cleanup_num_tuples float8,
+    OUT allequalimage boolean)
+AS 'MODULE_PATHNAME', 'bt_metap'
+LANGUAGE C STRICT PARALLEL SAFE;
+
  --
  -- bt_page_stats()
  --
@@ -80,7 +97,7 @@ CREATE FUNCTION bt_page_stats(IN relname text, IN blkno int8,
      OUT free_size int4,
      OUT btpo_prev int8,
      OUT btpo_next int8,
-    OUT btpo int4,
+    OUT btpo_level int8,
      OUT btpo_flags int4)
  AS 'MODULE_PATHNAME', 'bt_page_stats_1_9'
  LANGUAGE C STRICT PARALLEL SAFE;
diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c

index b1ce0d77d7378cb632ec6728ce93bb6408358b55..5368bb30f0c556593cbb14e56d3543763086b4f7 100644 (file)
--- a/contrib/pgstattuple/pgstatindex.c
+++ b/contrib/pgstattuple/pgstatindex.c
@@ -283,8 +283,12 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
         page = BufferGetPage(buffer);
         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  
-       /* Determine page type, and update totals */
-
+       /*
+        * Determine page type, and update totals.
+        *
+        * Note that we arbitrarily bucket deleted pages together without
+        * considering if they're leaf pages or internal pages.
+        */
         if (P_ISDELETED(opaque))
             indexStat.deleted_pages++;
         else if (P_IGNORE(opaque))
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index d7a73767984eb84831f9d2695a754e2a02be57c8..b5718fc1366055351ab094eeed239f74c1010ff7 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8529,11 +8529,10 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
  
         <para>
          If no tuples were deleted from the heap, B-tree indexes are still
-        scanned at the <command>VACUUM</command> cleanup stage when at least one
-        of the following conditions is met: the index statistics are stale, or
-        the index contains deleted pages that can be recycled during cleanup.
-        Index statistics are considered to be stale if the number of newly
-        inserted tuples exceeds the <varname>vacuum_cleanup_index_scale_factor</varname>
+        scanned at the <command>VACUUM</command> cleanup stage when the
+        index's statistics are stale.  Index statistics are considered
+        stale if the number of newly inserted tuples exceeds the
+        <varname>vacuum_cleanup_index_scale_factor</varname>
          fraction of the total number of heap tuples detected by the previous
          statistics collection. The total number of heap tuples is stored in
          the index meta-page. Note that the meta-page does not include this data
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml

index e29eb0783ab8994ee172615943b1ad6a8c0d5791..59620faec00f9404607713668d053579546bcd5f 100644 (file)
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -298,16 +298,16 @@ test=# SELECT t_ctid, raw_flags, combined_flags
        index's metapage.  For example:
  <screen>
  test=# SELECT * FROM bt_metap('pg_cast_oid_index');
--[ RECORD 1 ]-----------+-------
-magic                   | 340322
-version                 | 4
-root                    | 1
-level                   | 0
-fastroot                | 1
-fastlevel               | 0
-oldest_xact             | 582
-last_cleanup_num_tuples | 1000
-allequalimage           | f
+-[ RECORD 1 ]-------------+-------
+magic                     | 340322
+version                   | 4
+root                      | 1
+level                     | 0
+fastroot                  | 1
+fastlevel                 | 0
+last_cleanup_num_delpages | 0
+last_cleanup_num_tuples   | 230
+allequalimage             | f
  </screen>
       </para>
      </listitem>
@@ -337,7 +337,7 @@ page_size     | 8192
  free_size     | 3668
  btpo_prev     | 0
  btpo_next     | 0
-btpo          | 0
+btpo_level    | 0
  btpo_flags    | 3
  </screen>
       </para>
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c

index f2eda79bc1ab95df5bd8fcd0ad7f2b2cd4bd3fdd..1c80eae044a94a55d2a79b57c9a2c7f91b46dda2 100644 (file)
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -394,28 +394,8 @@ gistRedoPageReuse(XLogReaderState *record)
      * same exclusion effect on primary and standby.
      */
     if (InHotStandby)
-   {
-       FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid;
-       FullTransactionId nextXid = ReadNextFullTransactionId();
-       uint64      diff;
-
-       /*
-        * ResolveRecoveryConflictWithSnapshot operates on 32-bit
-        * TransactionIds, so truncate the logged FullTransactionId. If the
-        * logged value is very old, so that XID wrap-around already happened
-        * on it, there can't be any snapshots that still see it.
-        */
-       diff = U64FromFullTransactionId(nextXid) -
-           U64FromFullTransactionId(latestRemovedFullXid);
-       if (diff < MaxTransactionId / 2)
-       {
-           TransactionId latestRemovedXid;
-
-           latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
-           ResolveRecoveryConflictWithSnapshot(latestRemovedXid,
-                                               xlrec->node);
-       }
-   }
+       ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
+                                                  xlrec->node);
  }
  
  void
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c

index e3336039125c96c1d1beb9c4111c827d948ea1b0..1edb9f95797d713694a6fe88bc6f5541f70381f3 100644 (file)
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -1241,7 +1241,7 @@ _bt_insertonpg(Relation rel,
             metapg = BufferGetPage(metabuf);
             metad = BTPageGetMeta(metapg);
  
-           if (metad->btm_fastlevel >= opaque->btpo.level)
+           if (metad->btm_fastlevel >= opaque->btpo_level)
             {
                 /* no update wanted */
                 _bt_relbuf(rel, metabuf);
@@ -1268,7 +1268,7 @@ _bt_insertonpg(Relation rel,
             if (metad->btm_version < BTREE_NOVAC_VERSION)
                 _bt_upgrademetapage(metapg);
             metad->btm_fastroot = BufferGetBlockNumber(buf);
-           metad->btm_fastlevel = opaque->btpo.level;
+           metad->btm_fastlevel = opaque->btpo_level;
             MarkBufferDirty(metabuf);
         }
  
@@ -1331,7 +1331,7 @@ _bt_insertonpg(Relation rel,
                     xlmeta.level = metad->btm_level;
                     xlmeta.fastroot = metad->btm_fastroot;
                     xlmeta.fastlevel = metad->btm_fastlevel;
-                   xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+                   xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
                     xlmeta.last_cleanup_num_heap_tuples =
                         metad->btm_last_cleanup_num_heap_tuples;
                     xlmeta.allequalimage = metad->btm_allequalimage;
@@ -1537,7 +1537,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
     lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
     lopaque->btpo_prev = oopaque->btpo_prev;
     /* handle btpo_next after rightpage buffer acquired */
-   lopaque->btpo.level = oopaque->btpo.level;
+   lopaque->btpo_level = oopaque->btpo_level;
     /* handle btpo_cycleid after rightpage buffer acquired */
  
     /*
@@ -1722,7 +1722,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
     ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
     ropaque->btpo_prev = origpagenumber;
     ropaque->btpo_next = oopaque->btpo_next;
-   ropaque->btpo.level = oopaque->btpo.level;
+   ropaque->btpo_level = oopaque->btpo_level;
     ropaque->btpo_cycleid = lopaque->btpo_cycleid;
  
     /*
@@ -1950,7 +1950,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
         uint8       xlinfo;
         XLogRecPtr  recptr;
  
-       xlrec.level = ropaque->btpo.level;
+       xlrec.level = ropaque->btpo_level;
         /* See comments below on newitem, orignewitem, and posting lists */
         xlrec.firstrightoff = firstrightoff;
         xlrec.newitemoff = newitemoff;
@@ -2142,7 +2142,7 @@ _bt_insert_parent(Relation rel,
                      BlockNumberIsValid(RelationGetTargetBlock(rel))));
  
             /* Find the leftmost page at the next level up */
-           pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false, NULL);
+           pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
             /* Set up a phony stack entry pointing there */
             stack = &fakestack;
             stack->bts_blkno = BufferGetBlockNumber(pbuf);
@@ -2480,15 +2480,15 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
     rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
     rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
     rootopaque->btpo_flags = BTP_ROOT;
-   rootopaque->btpo.level =
-       ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
+   rootopaque->btpo_level =
+       ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1;
     rootopaque->btpo_cycleid = 0;
  
     /* update metapage data */
     metad->btm_root = rootblknum;
-   metad->btm_level = rootopaque->btpo.level;
+   metad->btm_level = rootopaque->btpo_level;
     metad->btm_fastroot = rootblknum;
-   metad->btm_fastlevel = rootopaque->btpo.level;
+   metad->btm_fastlevel = rootopaque->btpo_level;
  
     /*
      * Insert the left page pointer into the new root page.  The root page is
@@ -2548,7 +2548,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
         md.level = metad->btm_level;
         md.fastroot = rootblknum;
         md.fastlevel = metad->btm_level;
-       md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+       md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
         md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
         md.allequalimage = metad->btm_allequalimage;
  
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c

index 8c326a4774cbe55062d835084daad4cd43720825..a43805a7b09e179708d3dcc46b5580ebaa1f9585 100644 (file)
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -37,7 +37,7 @@
  
  static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
  static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
-                              TransactionId latestRemovedXid);
+                              FullTransactionId safexid);
  static void _bt_delitems_delete(Relation rel, Buffer buf,
                                 TransactionId latestRemovedXid,
                                 OffsetNumber *deletable, int ndeletable,
@@ -50,7 +50,6 @@ static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
  static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
                                      BlockNumber scanblkno,
                                      bool *rightsib_empty,
-                                    TransactionId *oldestBtpoXact,
                                      uint32 *ndeleted);
  static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
                                     BTStack stack,
@@ -78,7 +77,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
     metad->btm_level = level;
     metad->btm_fastroot = rootbknum;
     metad->btm_fastlevel = level;
-   metad->btm_oldest_btpo_xact = InvalidTransactionId;
+   metad->btm_last_cleanup_num_delpages = 0;
     metad->btm_last_cleanup_num_heap_tuples = -1.0;
     metad->btm_allequalimage = allequalimage;
  
@@ -118,7 +117,7 @@ _bt_upgrademetapage(Page page)
  
     /* Set version number and fill extra fields added into version 3 */
     metad->btm_version = BTREE_NOVAC_VERSION;
-   metad->btm_oldest_btpo_xact = InvalidTransactionId;
+   metad->btm_last_cleanup_num_delpages = 0;
     metad->btm_last_cleanup_num_heap_tuples = -1.0;
     /* Only a REINDEX can set this field */
     Assert(!metad->btm_allequalimage);
@@ -169,35 +168,61 @@ _bt_getmeta(Relation rel, Buffer metabuf)
  }
  
  /*
- * _bt_update_meta_cleanup_info() -- Update cleanup-related information in
- *                                   the metapage.
- *
- *     This routine checks if provided cleanup-related information is matching
- *     to those written in the metapage.  On mismatch, metapage is overwritten.
+ * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup().
+ *
+ *     This routine is called at the end of each VACUUM's btvacuumcleanup()
+ *     call.  Its purpose is to maintain the metapage fields that are used by
+ *     _bt_vacuum_needs_cleanup() to decide whether or not a btvacuumscan()
+ *     call should go ahead for an entire VACUUM operation.
+ *
+ *     See btvacuumcleanup() and _bt_vacuum_needs_cleanup() for details of
+ *     the two fields that we maintain here.
+ *
+ *     The information that we maintain for btvacuumcleanup() describes the
+ *     state of the index (as well as the table it indexes) just _after_ the
+ *     ongoing VACUUM operation.  The next _bt_vacuum_needs_cleanup() call
+ *     will consider the information we saved for it during the next VACUUM
+ *     operation (assuming that there will be no btbulkdelete() call during
+ *     the next VACUUM operation -- if there is then the question of skipping
+ *     btvacuumscan() doesn't even arise).
   */
  void
-_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
-                            float8 numHeapTuples)
+_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages,
+                    float8 num_heap_tuples)
  {
     Buffer      metabuf;
     Page        metapg;
     BTMetaPageData *metad;
-   bool        needsRewrite = false;
+   bool        rewrite = false;
     XLogRecPtr  recptr;
  
-   /* read the metapage and check if it needs rewrite */
+   /*
+    * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
+    * field started out as a TransactionId field called btm_oldest_btpo_xact.
+    * Both "versions" are just uint32 fields.  It was convenient to repurpose
+    * the field when we began to use 64-bit XIDs in deleted pages.
+    *
+    * It's possible that a pg_upgrade'd database will contain an XID value in
+    * what is now recognized as the metapage's btm_last_cleanup_num_delpages
+    * field.  _bt_vacuum_needs_cleanup() may even believe that this value
+    * indicates that there are lots of pages that it needs to recycle, when
+    * in reality there are only one or two.  The worst that can happen is
+    * that there will be a call to btvacuumscan a little earlier, which will
+    * set btm_last_cleanup_num_delpages to a sane value when we're called.
+    */
     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     metapg = BufferGetPage(metabuf);
     metad = BTPageGetMeta(metapg);
  
-   /* outdated version of metapage always needs rewrite */
+   /* Always dynamically upgrade index/metapage when BTREE_MIN_VERSION */
     if (metad->btm_version < BTREE_NOVAC_VERSION)
-       needsRewrite = true;
-   else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
-            metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
-       needsRewrite = true;
+       rewrite = true;
+   else if (metad->btm_last_cleanup_num_delpages != num_delpages)
+       rewrite = true;
+   else if (metad->btm_last_cleanup_num_heap_tuples != num_heap_tuples)
+       rewrite = true;
  
-   if (!needsRewrite)
+   if (!rewrite)
     {
         _bt_relbuf(rel, metabuf);
         return;
@@ -214,8 +239,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
         _bt_upgrademetapage(metapg);
  
     /* update cleanup-related information */
-   metad->btm_oldest_btpo_xact = oldestBtpoXact;
-   metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
+   metad->btm_last_cleanup_num_delpages = num_delpages;
+   metad->btm_last_cleanup_num_heap_tuples = num_heap_tuples;
     MarkBufferDirty(metabuf);
  
     /* write wal record if needed */
@@ -232,8 +257,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
         md.level = metad->btm_level;
         md.fastroot = metad->btm_fastroot;
         md.fastlevel = metad->btm_fastlevel;
-       md.oldest_btpo_xact = oldestBtpoXact;
-       md.last_cleanup_num_heap_tuples = numHeapTuples;
+       md.last_cleanup_num_delpages = num_delpages;
+       md.last_cleanup_num_heap_tuples = num_heap_tuples;
         md.allequalimage = metad->btm_allequalimage;
  
         XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
@@ -244,6 +269,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
     }
  
     END_CRIT_SECTION();
+
     _bt_relbuf(rel, metabuf);
  }
  
@@ -316,7 +342,7 @@ _bt_getroot(Relation rel, int access)
          * because that's not set in a "fast root".
          */
         if (!P_IGNORE(rootopaque) &&
-           rootopaque->btpo.level == rootlevel &&
+           rootopaque->btpo_level == rootlevel &&
             P_LEFTMOST(rootopaque) &&
             P_RIGHTMOST(rootopaque))
         {
@@ -377,7 +403,7 @@ _bt_getroot(Relation rel, int access)
         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
         rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
         rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
-       rootopaque->btpo.level = 0;
+       rootopaque->btpo_level = 0;
         rootopaque->btpo_cycleid = 0;
         /* Get raw page pointer for metapage */
         metapg = BufferGetPage(metabuf);
@@ -393,7 +419,7 @@ _bt_getroot(Relation rel, int access)
         metad->btm_level = 0;
         metad->btm_fastroot = rootblkno;
         metad->btm_fastlevel = 0;
-       metad->btm_oldest_btpo_xact = InvalidTransactionId;
+       metad->btm_last_cleanup_num_delpages = 0;
         metad->btm_last_cleanup_num_heap_tuples = -1.0;
  
         MarkBufferDirty(rootbuf);
@@ -416,7 +442,7 @@ _bt_getroot(Relation rel, int access)
             md.level = 0;
             md.fastroot = rootblkno;
             md.fastlevel = 0;
-           md.oldest_btpo_xact = InvalidTransactionId;
+           md.last_cleanup_num_delpages = 0;
             md.last_cleanup_num_heap_tuples = -1.0;
             md.allequalimage = metad->btm_allequalimage;
  
@@ -481,11 +507,10 @@ _bt_getroot(Relation rel, int access)
             rootblkno = rootopaque->btpo_next;
         }
  
-       /* Note: can't check btpo.level on deleted pages */
-       if (rootopaque->btpo.level != rootlevel)
+       if (rootopaque->btpo_level != rootlevel)
             elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                  rootblkno, RelationGetRelationName(rel),
-                rootopaque->btpo.level, rootlevel);
+                rootopaque->btpo_level, rootlevel);
     }
  
     /*
@@ -585,11 +610,10 @@ _bt_gettrueroot(Relation rel)
         rootblkno = rootopaque->btpo_next;
     }
  
-   /* Note: can't check btpo.level on deleted pages */
-   if (rootopaque->btpo.level != rootlevel)
+   if (rootopaque->btpo_level != rootlevel)
         elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
              rootblkno, RelationGetRelationName(rel),
-            rootopaque->btpo.level, rootlevel);
+            rootopaque->btpo_level, rootlevel);
  
     return rootbuf;
  }
@@ -762,7 +786,7 @@ _bt_checkpage(Relation rel, Buffer buf)
   * Log the reuse of a page from the FSM.
   */
  static void
-_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
+_bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
  {
     xl_btree_reuse_page xlrec_reuse;
  
@@ -775,7 +799,7 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
     /* XLOG stuff */
     xlrec_reuse.node = rel->rd_node;
     xlrec_reuse.block = blkno;
-   xlrec_reuse.latestRemovedXid = latestRemovedXid;
+   xlrec_reuse.latestRemovedFullXid = safexid;
  
     XLogBeginInsert();
     XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
@@ -856,26 +880,34 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
             if (_bt_conditionallockbuf(rel, buf))
             {
                 page = BufferGetPage(buf);
-               if (_bt_page_recyclable(page))
+
+               /*
+                * It's possible to find an all-zeroes page in an index.  For
+                * example, a backend might successfully extend the relation
+                * one page and then crash before it is able to make a WAL
+                * entry for adding the page.  If we find a zeroed page then
+                * reclaim it immediately.
+                */
+               if (PageIsNew(page))
+               {
+                   /* Okay to use page.  Initialize and return it. */
+                   _bt_pageinit(page, BufferGetPageSize(buf));
+                   return buf;
+               }
+
+               if (BTPageIsRecyclable(page))
                 {
                     /*
                      * If we are generating WAL for Hot Standby then create a
                      * WAL record that will allow us to conflict with queries
                      * running on standby, in case they have snapshots older
-                    * than btpo.xact.  This can only apply if the page does
-                    * have a valid btpo.xact value, ie not if it's new.  (We
-                    * must check that because an all-zero page has no special
-                    * space.)
+                    * than the safexid value.
                      */
-                   if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
-                       !PageIsNew(page))
-                   {
-                       BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-                       _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
-                   }
+                   if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
+                       _bt_log_reuse_page(rel, blkno,
+                                          BTPageGetDeleteXid(page));
  
-                   /* Okay to use page.  Re-initialize and return it */
+                   /* Okay to use page.  Re-initialize and return it. */
                     _bt_pageinit(page, BufferGetPageSize(buf));
                     return buf;
                 }
@@ -1073,40 +1105,6 @@ _bt_pageinit(Page page, Size size)
     PageInit(page, size, sizeof(BTPageOpaqueData));
  }
  
-/*
- * _bt_page_recyclable() -- Is an existing page recyclable?
- *
- * This exists to make sure _bt_getbuf and btvacuumscan have the same
- * policy about whether a page is safe to re-use.  But note that _bt_getbuf
- * knows enough to distinguish the PageIsNew condition from the other one.
- * At some point it might be appropriate to redesign this to have a three-way
- * result value.
- */
-bool
-_bt_page_recyclable(Page page)
-{
-   BTPageOpaque opaque;
-
-   /*
-    * It's possible to find an all-zeroes page in an index --- for example, a
-    * backend might successfully extend the relation one page and then crash
-    * before it is able to make a WAL entry for adding the page. If we find a
-    * zeroed page then reclaim it.
-    */
-   if (PageIsNew(page))
-       return true;
-
-   /*
-    * Otherwise, recycle if deleted and too old to have any processes
-    * interested in it.
-    */
-   opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-   if (P_ISDELETED(opaque) &&
-       GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact))
-       return true;
-   return false;
-}
-
  /*
   * Delete item(s) from a btree leaf page during VACUUM.
   *
@@ -1768,16 +1766,12 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
   * that the btvacuumscan scan has yet to reach; they'll get counted later
   * instead.
   *
- * Maintains *oldestBtpoXact for any pages that get deleted.  Caller is
- * responsible for maintaining *oldestBtpoXact in the case of pages that were
- * deleted by a previous VACUUM.
- *
   * NOTE: this leaks memory.  Rather than trying to clean up everything
   * carefully, it's better to run it in a temp context that can be reset
   * frequently.
   */
  uint32
-_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
+_bt_pagedel(Relation rel, Buffer leafbuf)
  {
     uint32      ndeleted = 0;
     BlockNumber rightsib;
@@ -1985,8 +1979,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
         {
             /* Check for interrupts in _bt_unlink_halfdead_page */
             if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
-                                         &rightsib_empty, oldestBtpoXact,
-                                         &ndeleted))
+                                         &rightsib_empty, &ndeleted))
             {
                 /*
                  * _bt_unlink_halfdead_page should never fail, since we
@@ -2002,8 +1995,6 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
         }
  
         Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
-       Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
-                                           *oldestBtpoXact));
  
         rightsib = opaque->btpo_next;
  
@@ -2264,12 +2255,6 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
   * containing leafbuf.  (We always set *rightsib_empty for caller, just to be
   * consistent.)
   *
- * We maintain *oldestBtpoXact for pages that are deleted by the current
- * VACUUM operation here.  This must be handled here because we conservatively
- * assume that there needs to be a new call to ReadNextTransactionId() each
- * time a page gets deleted.  See comments about the underlying assumption
- * below.
- *
   * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
   * On success exit, we'll be holding pin and write lock.  On failure exit,
   * we'll release both pin and lock before returning (we define it that way
@@ -2277,8 +2262,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
   */
  static bool
  _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
-                        bool *rightsib_empty, TransactionId *oldestBtpoXact,
-                        uint32 *ndeleted)
+                        bool *rightsib_empty, uint32 *ndeleted)
  {
     BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
     BlockNumber leafleftsib;
@@ -2294,12 +2278,12 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
     BTMetaPageData *metad = NULL;
     ItemId      itemid;
     Page        page;
-   PageHeader  header;
     BTPageOpaque opaque;
+   FullTransactionId safexid;
     bool        rightsib_is_rightmost;
-   int         targetlevel;
+   uint32      targetlevel;
     IndexTuple  leafhikey;
-   BlockNumber nextchild;
+   BlockNumber leaftopparent;
  
     page = BufferGetPage(leafbuf);
     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -2343,7 +2327,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
         page = BufferGetPage(buf);
         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
         leftsib = opaque->btpo_prev;
-       targetlevel = opaque->btpo.level;
+       targetlevel = opaque->btpo_level;
         Assert(targetlevel > 0);
  
         /*
@@ -2450,20 +2434,26 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
             !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
             elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
                  target, RelationGetRelationName(rel));
-       nextchild = InvalidBlockNumber;
+
+       /* Leaf page is also target page: don't set leaftopparent */
+       leaftopparent = InvalidBlockNumber;
     }
     else
     {
+       IndexTuple  finaldataitem;
+
         if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
             P_ISLEAF(opaque))
             elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
                  target, RelationGetRelationName(rel));
  
-       /* Remember the next non-leaf child down in the subtree */
+       /* Target is internal: set leaftopparent for next call here...  */
         itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
-       nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
-       if (nextchild == leafblkno)
-           nextchild = InvalidBlockNumber;
+       finaldataitem = (IndexTuple) PageGetItem(page, itemid);
+       leaftopparent = BTreeTupleGetDownLink(finaldataitem);
+       /* ...except when it would be a redundant pointer-to-self */
+       if (leaftopparent == leafblkno)
+           leaftopparent = InvalidBlockNumber;
     }
  
     /*
@@ -2553,13 +2543,13 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
      * no lock was held.
      */
     if (target != leafblkno)
-       BTreeTupleSetTopParent(leafhikey, nextchild);
+       BTreeTupleSetTopParent(leafhikey, leaftopparent);
  
     /*
      * Mark the page itself deleted.  It can be recycled when all current
      * transactions are gone.  Storing GetTopTransactionId() would work, but
      * we're in VACUUM and would not otherwise have an XID.  Having already
-    * updated links to the target, ReadNextTransactionId() suffices as an
+    * updated links to the target, ReadNextFullTransactionId() suffices as an
      * upper bound.  Any scan having retained a now-stale link is advertising
      * in its PGPROC an xmin less than or equal to the value we read here.  It
      * will continue to do so, holding back the xmin horizon, for the duration
@@ -2568,17 +2558,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
     page = BufferGetPage(buf);
     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
-   opaque->btpo_flags &= ~BTP_HALF_DEAD;
-   opaque->btpo_flags |= BTP_DELETED;
-   opaque->btpo.xact = ReadNextTransactionId();
  
     /*
-    * Remove the remaining tuples on the page.  This keeps things simple for
-    * WAL consistency checking.
+    * Store upper bound XID that's used to determine when deleted page is no
+    * longer needed as a tombstone
      */
-   header = (PageHeader) page;
-   header->pd_lower = SizeOfPageHeaderData;
-   header->pd_upper = header->pd_special;
+   safexid = ReadNextFullTransactionId();
+   BTPageSetDeleted(page, safexid);
+   opaque->btpo_cycleid = 0;
  
     /* And update the metapage, if needed */
     if (BufferIsValid(metabuf))
@@ -2616,15 +2603,16 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
         if (target != leafblkno)
             XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
  
-       /* information on the unlinked block */
+       /* information stored on the target/to-be-unlinked block */
         xlrec.leftsib = leftsib;
         xlrec.rightsib = rightsib;
-       xlrec.btpo_xact = opaque->btpo.xact;
+       xlrec.level = targetlevel;
+       xlrec.safexid = safexid;
  
         /* information needed to recreate the leaf block (if not the target) */
         xlrec.leafleftsib = leafleftsib;
         xlrec.leafrightsib = leafrightsib;
-       xlrec.topparent = nextchild;
+       xlrec.leaftopparent = leaftopparent;
  
         XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
  
@@ -2638,7 +2626,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
             xlmeta.level = metad->btm_level;
             xlmeta.fastroot = metad->btm_fastroot;
             xlmeta.fastlevel = metad->btm_fastlevel;
-           xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+           xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
             xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
             xlmeta.allequalimage = metad->btm_allequalimage;
  
@@ -2681,9 +2669,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
         _bt_relbuf(rel, lbuf);
     _bt_relbuf(rel, rbuf);
  
-   if (!TransactionIdIsValid(*oldestBtpoXact) ||
-       TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
-       *oldestBtpoXact = opaque->btpo.xact;
+   /* If the target is not leafbuf, we're done with it now -- release it */
+   if (target != leafblkno)
+       _bt_relbuf(rel, buf);
  
     /*
      * If btvacuumscan won't revisit this page in a future btvacuumpage call
@@ -2693,10 +2681,6 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
     if (target <= scanblkno)
         (*ndeleted)++;
  
-   /* If the target is not leafbuf, we're done with it now -- release it */
-   if (target != leafblkno)
-       _bt_relbuf(rel, buf);
-
     return true;
  }
  
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c

index 289bd3c15daa00d685b5704302e10558992e1929..3b2e0aa5cb794c9a1dbd2b8341cc8f04c9e5b3aa 100644 (file)
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -46,8 +46,6 @@ typedef struct
     IndexBulkDeleteCallback callback;
     void       *callback_state;
     BTCycleId   cycleid;
-   BlockNumber totFreePages;   /* true total # of free pages */
-   TransactionId oldestBtpoXact;
     MemoryContext pagedelcontext;
  } BTVacState;
  
@@ -790,7 +788,7 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan)
   * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
   *
   * Called by btvacuumcleanup when btbulkdelete was never called because no
- * tuples need to be deleted.
+ * tuples needed to be deleted by VACUUM.
   *
   * When we return false, VACUUM can even skip the cleanup-only call to
   * btvacuumscan (i.e. there will be no btvacuumscan call for this index at
@@ -802,66 +800,75 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
     Buffer      metabuf;
     Page        metapg;
     BTMetaPageData *metad;
-   bool        result = false;
+   BTOptions  *relopts;
+   float8      cleanup_scale_factor;
+   uint32      btm_version;
+   BlockNumber prev_num_delpages;
+   float8      prev_num_heap_tuples;
  
+   /*
+    * Copy details from metapage to local variables quickly.
+    *
+    * Note that we deliberately avoid using cached version of metapage here.
+    */
     metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
     metapg = BufferGetPage(metabuf);
     metad = BTPageGetMeta(metapg);
+   btm_version = metad->btm_version;
  
-   /*
-    * XXX: If IndexVacuumInfo contained the heap relation, we could be more
-    * aggressive about vacuuming non catalog relations by passing the table
-    * to GlobalVisCheckRemovableXid().
-    */
-
-   if (metad->btm_version < BTREE_NOVAC_VERSION)
+   if (btm_version < BTREE_NOVAC_VERSION)
     {
         /*
-        * Do cleanup if metapage needs upgrade, because we don't have
-        * cleanup-related meta-information yet.
+        * Metapage needs to be dynamically upgraded to store fields that are
+        * only present when btm_version >= BTREE_NOVAC_VERSION
          */
-       result = true;
-   }
-   else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
-            GlobalVisCheckRemovableXid(NULL, metad->btm_oldest_btpo_xact))
-   {
-       /*
-        * If any oldest btpo.xact from a previously deleted page in the index
-        * is visible to everyone, then at least one deleted page can be
-        * recycled -- don't skip cleanup.
-        */
-       result = true;
-   }
-   else
-   {
-       BTOptions  *relopts;
-       float8      cleanup_scale_factor;
-       float8      prev_num_heap_tuples;
-
-       /*
-        * If table receives enough insertions and no cleanup was performed,
-        * then index would appear have stale statistics.  If scale factor is
-        * set, we avoid that by performing cleanup if the number of inserted
-        * tuples exceeds vacuum_cleanup_index_scale_factor fraction of
-        * original tuples count.
-        */
-       relopts = (BTOptions *) info->index->rd_options;
-       cleanup_scale_factor = (relopts &&
-                               relopts->vacuum_cleanup_index_scale_factor >= 0)
-           ? relopts->vacuum_cleanup_index_scale_factor
-           : vacuum_cleanup_index_scale_factor;
-       prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
-
-       if (cleanup_scale_factor <= 0 ||
-           info->num_heap_tuples < 0 ||
-           prev_num_heap_tuples <= 0 ||
-           (info->num_heap_tuples - prev_num_heap_tuples) /
-           prev_num_heap_tuples >= cleanup_scale_factor)
-           result = true;
+       _bt_relbuf(info->index, metabuf);
+       return true;
     }
  
+   prev_num_delpages = metad->btm_last_cleanup_num_delpages;
+   prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
     _bt_relbuf(info->index, metabuf);
-   return result;
+
+   /*
+    * If the underlying table has received a sufficiently high number of
+    * insertions since the last VACUUM operation that called btvacuumscan(),
+    * then have the current VACUUM operation call btvacuumscan() now.  This
+    * happens when the statistics are deemed stale.
+    *
+    * XXX: We should have a more principled way of determining what
+    * "staleness" means. The vacuum_cleanup_index_scale_factor GUC (and the
+    * index-level storage param) seem hard to tune in a principled way.
+    */
+   relopts = (BTOptions *) info->index->rd_options;
+   cleanup_scale_factor = (relopts &&
+                           relopts->vacuum_cleanup_index_scale_factor >= 0)
+       ? relopts->vacuum_cleanup_index_scale_factor
+       : vacuum_cleanup_index_scale_factor;
+
+   if (cleanup_scale_factor <= 0 ||
+       info->num_heap_tuples < 0 ||
+       prev_num_heap_tuples <= 0 ||
+       (info->num_heap_tuples - prev_num_heap_tuples) /
+       prev_num_heap_tuples >= cleanup_scale_factor)
+       return true;
+
+   /*
+    * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
+    * total size of the index.  We can reasonably expect (though are not
+    * guaranteed) to be able to recycle this many pages if we decide to do a
+    * btvacuumscan call during the ongoing btvacuumcleanup.
+    *
+    * Our approach won't reliably avoid "wasted" cleanup-only btvacuumscan
+    * calls.  That is, we can end up scanning the entire index without ever
+    * placing even 1 of the prev_num_delpages pages in the free space map, at
+    * least in certain narrow cases (see nbtree/README section on recycling
+    * deleted pages for details).  This rarely matters in practice.
+    */
+   if (prev_num_delpages > RelationGetNumberOfBlocks(info->index) / 20)
+       return true;
+
+   return false;
  }
  
  /*
@@ -904,30 +911,62 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
  IndexBulkDeleteResult *
  btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
  {
+   BlockNumber num_delpages;
+
     /* No-op in ANALYZE ONLY mode */
     if (info->analyze_only)
         return stats;
  
     /*
-    * If btbulkdelete was called, we need not do anything, just return the
-    * stats from the latest btbulkdelete call.  If it wasn't called, we might
-    * still need to do a pass over the index, to recycle any newly-recyclable
-    * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
-    * determines if either are needed.
+    * If btbulkdelete was called, we need not do anything (we just maintain
+    * the information used within _bt_vacuum_needs_cleanup() by calling
+    * _bt_set_cleanup_info() below).
      *
-    * Since we aren't going to actually delete any leaf items, there's no
-    * need to go through all the vacuum-cycle-ID pushups.
+    * If btbulkdelete was _not_ called, then we have a choice to make: we
+    * must decide whether or not a btvacuumscan() call is needed now (i.e.
+    * whether the ongoing VACUUM operation can entirely avoid a physical scan
+    * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
+    * now.
      */
     if (stats == NULL)
     {
-       /* Check if we need a cleanup */
+       /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
         if (!_bt_vacuum_needs_cleanup(info))
             return NULL;
  
+       /*
+        * Since we aren't going to actually delete any leaf items, there's no
+        * need to go through all the vacuum-cycle-ID pushups here
+        */
         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
         btvacuumscan(info, stats, NULL, NULL, 0);
     }
  
+   /*
+    * By here, we know for sure that this VACUUM operation won't be skipping
+    * its btvacuumscan() call.  Maintain the count of the current number of
+    * heap tuples in the metapage.  Also maintain the num_delpages value.
+    * This information will be used by _bt_vacuum_needs_cleanup() during
+    * future VACUUM operations that don't need to call btbulkdelete().
+    *
+    * num_delpages is the number of deleted pages now in the index that were
+    * not safe to place in the FSM to be recycled just yet.  We expect that
+    * it will almost certainly be possible to place all of these pages in the
+    * FSM during the next VACUUM operation.  That factor alone might cause
+    * _bt_vacuum_needs_cleanup() to force the next VACUUM to proceed with a
+    * btvacuumscan() call.
+    *
+    * Note: We must delay the _bt_set_cleanup_info() call until this late
+    * stage of VACUUM (the btvacuumcleanup() phase), to keep num_heap_tuples
+    * accurate.  The btbulkdelete()-time num_heap_tuples value is generally
+    * just pg_class.reltuples for the heap relation _before_ VACUUM began.
+    * In general cleanup info should describe the state of the index/table
+    * _after_ VACUUM finishes.
+    */
+   Assert(stats->pages_deleted >= stats->pages_free);
+   num_delpages = stats->pages_deleted - stats->pages_free;
+   _bt_set_cleanup_info(info->index, num_delpages, info->num_heap_tuples);
+
     /*
      * It's quite possible for us to be fooled by concurrent page splits into
      * double-counting some index tuples, so disbelieve any total that exceeds
@@ -957,8 +996,6 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
   * deleted, and looking for old deleted pages that can be recycled.  Both
   * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
   * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
- * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
- * is maintained.
   *
   * The caller is responsible for initially allocating/zeroing a stats struct
   * and for obtaining a vacuum cycle ID if necessary.
@@ -975,12 +1012,25 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     bool        needLock;
  
     /*
-    * Reset counts that will be incremented during the scan; needed in case
-    * of multiple scans during a single VACUUM command
+    * Reset fields that track information about the entire index now.  This
+    * avoids double-counting in the case where a single VACUUM command
+    * requires multiple scans of the index.
+    *
+    * Avoid resetting the tuples_removed field here, since it tracks
+    * information about the VACUUM command, and so must last across each call
+    * to btvacuumscan().
+    *
+    * (Note that pages_free is treated as state about the whole index, not
+    * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
+    * calls are idempotent, and get repeated for the same deleted pages in
+    * some scenarios.  The point for us is to track the number of recyclable
+    * pages in the index at the end of the VACUUM command.)
      */
+   stats->num_pages = 0;
     stats->estimated_count = false;
     stats->num_index_tuples = 0;
     stats->pages_deleted = 0;
+   stats->pages_free = 0;
  
     /* Set up info to pass down to btvacuumpage */
     vstate.info = info;
@@ -988,8 +1038,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     vstate.callback = callback;
     vstate.callback_state = callback_state;
     vstate.cycleid = cycleid;
-   vstate.totFreePages = 0;
-   vstate.oldestBtpoXact = InvalidTransactionId;
  
     /* Create a temporary memory context to run _bt_pagedel in */
     vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
@@ -1048,6 +1096,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
         }
     }
  
+   /* Set statistics num_pages field to final size of index */
+   stats->num_pages = num_pages;
+
     MemoryContextDelete(vstate.pagedelcontext);
  
     /*
@@ -1062,27 +1113,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
      * Note that if no recyclable pages exist, we don't bother vacuuming the
      * FSM at all.
      */
-   if (vstate.totFreePages > 0)
+   if (stats->pages_free > 0)
         IndexFreeSpaceMapVacuum(rel);
-
-   /*
-    * Maintain the oldest btpo.xact and a count of the current number of heap
-    * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup).
-    *
-    * The page with the oldest btpo.xact is typically a page deleted by this
-    * VACUUM operation, since pages deleted by a previous VACUUM operation
-    * tend to be placed in the FSM (by the current VACUUM operation) -- such
-    * pages are not candidates to be the oldest btpo.xact.  (Note that pages
-    * placed in the FSM are reported as deleted pages in the bulk delete
-    * statistics, despite not counting as deleted pages for the purposes of
-    * determining the oldest btpo.xact.)
-    */
-   _bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
-                                info->num_heap_tuples);
-
-   /* update statistics */
-   stats->num_pages = num_pages;
-   stats->pages_free = vstate.totFreePages;
  }
  
  /*
@@ -1188,13 +1220,12 @@ backtrack:
         }
     }
  
-   /* Page is valid, see what to do with it */
-   if (_bt_page_recyclable(page))
+   if (!opaque || BTPageIsRecyclable(page))
     {
         /* Okay to recycle this page (which could be leaf or internal) */
         RecordFreeIndexPage(rel, blkno);
-       vstate->totFreePages++;
         stats->pages_deleted++;
+       stats->pages_free++;
     }
     else if (P_ISDELETED(opaque))
     {
@@ -1203,17 +1234,12 @@ backtrack:
          * recycle yet.
          */
         stats->pages_deleted++;
-
-       /* Maintain the oldest btpo.xact */
-       if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
-           TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
-           vstate->oldestBtpoXact = opaque->btpo.xact;
     }
     else if (P_ISHALFDEAD(opaque))
     {
         /*
          * Half-dead leaf page.  Try to delete now.  Might update
-        * oldestBtpoXact and pages_deleted below.
+        * pages_deleted below.
          */
         attempt_pagedel = true;
     }
@@ -1430,7 +1456,7 @@ backtrack:
          * count.  There will be no double-counting.
          */
         Assert(blkno == scanblkno);
-       stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
+       stats->pages_deleted += _bt_pagedel(rel, buf);
  
         MemoryContextSwitchTo(oldcontext);
         /* pagedel released buffer, so we shouldn't */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c

index 2e3bda8171d7775dd2f5c7cdb0660fd4386bce55..d1177d8772cec4521f03440d31e095151650bd37 100644 (file)
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -169,7 +169,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
          * we're on the level 1 and asked to lock leaf page in write mode,
          * then lock next page in write mode, because it must be a leaf.
          */
-       if (opaque->btpo.level == 1 && access == BT_WRITE)
+       if (opaque->btpo_level == 1 && access == BT_WRITE)
             page_access = BT_WRITE;
  
         /* drop the read lock on the page, then acquire one on its child */
@@ -2341,9 +2341,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
         }
  
         /* Done? */
-       if (opaque->btpo.level == level)
+       if (opaque->btpo_level == level)
             break;
-       if (opaque->btpo.level < level)
+       if (opaque->btpo_level < level)
             ereport(ERROR,
                     (errcode(ERRCODE_INDEX_CORRUPTED),
                      errmsg_internal("btree level %u not found in index \"%s\"",
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c

index 5683daa34d3b7ac139623c5692f5238831162b8c..2c4d7f6e25a7c91d534a5bb74f013c21ee55ae99 100644 (file)
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -620,7 +620,7 @@ _bt_blnewpage(uint32 level)
     /* Initialize BT opaque state */
     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     opaque->btpo_prev = opaque->btpo_next = P_NONE;
-   opaque->btpo.level = level;
+   opaque->btpo_level = level;
     opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
     opaque->btpo_cycleid = 0;
  
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c

index c1d578cc01609fd384e548c83737ba992b9ff2d1..8b7c143db485674d6c8a9e6cde5a0cfd33edd2c4 100644 (file)
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -112,7 +112,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id)
     md->btm_fastlevel = xlrec->fastlevel;
     /* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
     Assert(md->btm_version >= BTREE_NOVAC_VERSION);
-   md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
+   md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages;
     md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;
     md->btm_allequalimage = xlrec->allequalimage;
  
@@ -297,7 +297,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record)
  
     ropaque->btpo_prev = origpagenumber;
     ropaque->btpo_next = spagenumber;
-   ropaque->btpo.level = xlrec->level;
+   ropaque->btpo_level = xlrec->level;
     ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
     ropaque->btpo_cycleid = 0;
  
@@ -773,7 +773,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
  
     pageop->btpo_prev = xlrec->leftblk;
     pageop->btpo_next = xlrec->rightblk;
-   pageop->btpo.level = 0;
+   pageop->btpo_level = 0;
     pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
     pageop->btpo_cycleid = 0;
  
@@ -802,6 +802,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
     xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
     BlockNumber leftsib;
     BlockNumber rightsib;
+   uint32      level;
+   bool        isleaf;
+   FullTransactionId safexid;
     Buffer      leftbuf;
     Buffer      target;
     Buffer      rightbuf;
@@ -810,6 +813,12 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
  
     leftsib = xlrec->leftsib;
     rightsib = xlrec->rightsib;
+   level = xlrec->level;
+   isleaf = (level == 0);
+   safexid = xlrec->safexid;
+
+   /* No leaftopparent for level 0 (leaf page) or level 1 target */
+   Assert(xlrec->leaftopparent == InvalidBlockNumber || level > 1);
  
     /*
      * In normal operation, we would lock all the pages this WAL record
@@ -844,9 +853,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
  
     pageop->btpo_prev = leftsib;
     pageop->btpo_next = rightsib;
-   pageop->btpo.xact = xlrec->btpo_xact;
-   pageop->btpo_flags = BTP_DELETED;
-   if (!BlockNumberIsValid(xlrec->topparent))
+   pageop->btpo_level = level;
+   BTPageSetDeleted(page, safexid);
+   if (isleaf)
         pageop->btpo_flags |= BTP_LEAF;
     pageop->btpo_cycleid = 0;
  
@@ -892,6 +901,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
         Buffer              leafbuf;
         IndexTupleData      trunctuple;
  
+       Assert(!isleaf);
+
         leafbuf = XLogInitBufferForRedo(record, 3);
         page = (Page) BufferGetPage(leafbuf);
  
@@ -901,13 +912,13 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
         pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
         pageop->btpo_prev = xlrec->leafleftsib;
         pageop->btpo_next = xlrec->leafrightsib;
-       pageop->btpo.level = 0;
+       pageop->btpo_level = 0;
         pageop->btpo_cycleid = 0;
  
         /* Add a dummy hikey item */
         MemSet(&trunctuple, 0, sizeof(IndexTupleData));
         trunctuple.t_info = sizeof(IndexTupleData);
-       BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);
+       BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent);
  
         if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
                         false, false) == InvalidOffsetNumber)
@@ -942,7 +953,7 @@ btree_xlog_newroot(XLogReaderState *record)
  
     pageop->btpo_flags = BTP_ROOT;
     pageop->btpo_prev = pageop->btpo_next = P_NONE;
-   pageop->btpo.level = xlrec->level;
+   pageop->btpo_level = xlrec->level;
     if (xlrec->level == 0)
         pageop->btpo_flags |= BTP_LEAF;
     pageop->btpo_cycleid = 0;
@@ -963,26 +974,40 @@ btree_xlog_newroot(XLogReaderState *record)
     _bt_restore_meta(record, 2);
  }
  
+/*
+ * In general VACUUM must defer recycling as a way of avoiding certain race
+ * conditions.  Deleted pages contain a safexid value that is used by VACUUM
+ * to determine whether or not it's safe to place a page that was deleted by
+ * VACUUM earlier into the FSM now.  See nbtree/README.
+ *
+ * As far as any backend operating during original execution is concerned, the
+ * FSM is a cache of recycle-safe pages; the mere presence of the page in the
+ * FSM indicates that the page must already be safe to recycle (actually,
+ * _bt_getbuf() verifies it's safe using BTPageIsRecyclable(), but that's just
+ * because it would be unwise to completely trust the FSM, given its current
+ * limitations).
+ *
+ * This isn't sufficient to prevent similar concurrent recycling race
+ * conditions during Hot Standby, though.  For that we need to log an
+ * xl_btree_reuse_page record at the point that a page is actually recycled
+ * and reused for an entirely unrelated page inside _bt_split().  These
+ * records include the same safexid value from the original deleted page,
+ * stored in the record's latestRemovedFullXid field.
+ *
+ * The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used
+ * to determine if it's safe to recycle a page.  This mirrors our own test:
+ * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs().
+ * Consequently, one XID value achieves the same exclusion effect on primary
+ * and standby.
+ */
  static void
  btree_xlog_reuse_page(XLogReaderState *record)
  {
     xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
  
-   /*
-    * Btree reuse_page records exist to provide a conflict point when we
-    * reuse pages in the index via the FSM.  That's all they do though.
-    *
-    * latestRemovedXid was the page's btpo.xact.  The
-    * GlobalVisCheckRemovableXid test in _bt_page_recyclable() conceptually
-    * mirrors the pgxact->xmin > limitXmin test in
-    * GetConflictingVirtualXIDs().  Consequently, one XID value achieves the
-    * same exclusion effect on primary and standby.
-    */
     if (InHotStandby)
-   {
-       ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
-                                           xlrec->node);
-   }
+       ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
+                                                  xlrec->node);
  }
  
  void
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c

index 6e0d6a2b729e84e041af6c04585f5cdbfa675914..f7cc4dd3e6ded743620a00ed8fabc55688a81b5b 100644 (file)
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -80,12 +80,13 @@ btree_desc(StringInfo buf, XLogReaderState *record)
             {
                 xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec;
  
-               appendStringInfo(buf, "left %u; right %u; btpo_xact %u; ",
-                                xlrec->leftsib, xlrec->rightsib,
-                                xlrec->btpo_xact);
-               appendStringInfo(buf, "leafleft %u; leafright %u; topparent %u",
+               appendStringInfo(buf, "left %u; right %u; level %u; safexid %u:%u; ",
+                                xlrec->leftsib, xlrec->rightsib, xlrec->level,
+                                EpochFromFullTransactionId(xlrec->safexid),
+                                XidFromFullTransactionId(xlrec->safexid));
+               appendStringInfo(buf, "leafleft %u; leafright %u; leaftopparent %u",
                                  xlrec->leafleftsib, xlrec->leafrightsib,
-                                xlrec->topparent);
+                                xlrec->leaftopparent);
                 break;
             }
         case XLOG_BTREE_NEWROOT:
@@ -99,9 +100,11 @@ btree_desc(StringInfo buf, XLogReaderState *record)
             {
                 xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec;
  
-               appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u",
+               appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u",
                                  xlrec->node.spcNode, xlrec->node.dbNode,
-                                xlrec->node.relNode, xlrec->latestRemovedXid);
+                                xlrec->node.relNode,
+                                EpochFromFullTransactionId(xlrec->latestRemovedFullXid),
+                                XidFromFullTransactionId(xlrec->latestRemovedFullXid));
                 break;
             }
         case XLOG_BTREE_META_CLEANUP:
@@ -110,8 +113,8 @@ btree_desc(StringInfo buf, XLogReaderState *record)
  
                 xlrec = (xl_btree_metadata *) XLogRecGetBlockData(record, 0,
                                                                   NULL);
-               appendStringInfo(buf, "oldest_btpo_xact %u; last_cleanup_num_heap_tuples: %f",
-                                xlrec->oldest_btpo_xact,
+               appendStringInfo(buf, "last_cleanup_num_delpages %u; last_cleanup_num_heap_tuples: %f",
+                                xlrec->last_cleanup_num_delpages,
                                  xlrec->last_cleanup_num_heap_tuples);
                 break;
             }
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c

index a3ee652030c64abb713edd719b5ada9c0a1c7cd9..17de5a6d0ed7a9cda7de0c6039b5cd6186f56826 100644 (file)
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -452,6 +452,34 @@ ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode
                                            true);
  }
  
+/*
+ * Variant of ResolveRecoveryConflictWithSnapshot that works with
+ * FullTransactionId values
+ */
+void
+ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+                                          RelFileNode node)
+{
+   /*
+    * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
+    * so truncate the logged FullTransactionId.  If the logged value is very
+    * old, so that XID wrap-around already happened on it, there can't be any
+    * snapshots that still see it.
+    */
+   FullTransactionId nextXid = ReadNextFullTransactionId();
+   uint64            diff;
+
+   diff = U64FromFullTransactionId(nextXid) -
+       U64FromFullTransactionId(latestRemovedFullXid);
+   if (diff < MaxTransactionId / 2)
+   {
+       TransactionId latestRemovedXid;
+
+       latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
+       ResolveRecoveryConflictWithSnapshot(latestRemovedXid, node);
+   }
+}
+
  void
  ResolveRecoveryConflictWithTablespace(Oid tsid)
  {
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h

index cad4f2bdeb9ab6667c6e61971d369b1bfd5f3ec6..9ac90d7439836aeae45eab93b7c0657e40b926cf 100644 (file)
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -37,8 +37,9 @@ typedef uint16 BTCycleId;
   *
   * In addition, we store the page's btree level (counting upwards from
   * zero at a leaf page) as well as some flag bits indicating the page type
- * and status.  If the page is deleted, we replace the level with the
- * next-transaction-ID value indicating when it is safe to reclaim the page.
+ * and status.  If the page is deleted, a BTDeletedPageData struct is stored
+ * in the page's tuple area, while a standard BTPageOpaqueData struct is
+ * stored in the page special area.
   *
   * We also store a "vacuum cycle ID".  When a page is split while VACUUM is
   * processing the index, a nonzero value associated with the VACUUM run is
@@ -52,17 +53,17 @@ typedef uint16 BTCycleId;
   *
   * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
   * instead.
+ *
+ * NOTE: the btpo_level field used to be a union type in order to allow
+ * deleted pages to store a 32-bit safexid in the same field.  We now store
+ * 64-bit/full safexid values using BTDeletedPageData instead.
   */
  
  typedef struct BTPageOpaqueData
  {
     BlockNumber btpo_prev;      /* left sibling, or P_NONE if leftmost */
     BlockNumber btpo_next;      /* right sibling, or P_NONE if rightmost */
-   union
-   {
-       uint32      level;      /* tree level --- zero for leaf pages */
-       TransactionId xact;     /* next transaction ID, if deleted */
-   }           btpo;
+   uint32      btpo_level;     /* tree level --- zero for leaf pages */
     uint16      btpo_flags;     /* flag bits, see below */
     BTCycleId   btpo_cycleid;   /* vacuum cycle ID of latest split */
  } BTPageOpaqueData;
@@ -78,6 +79,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
  #define BTP_SPLIT_END  (1 << 5)    /* rightmost page of split group */
  #define BTP_HAS_GARBAGE (1 << 6)   /* page has LP_DEAD tuples (deprecated) */
  #define BTP_INCOMPLETE_SPLIT (1 << 7)  /* right sibling's downlink is missing */
+#define BTP_HAS_FULLXID    (1 << 8)    /* contains BTDeletedPageData */
  
  /*
   * The max allowed value of a cycle ID is a bit less than 64K.  This is
@@ -105,10 +107,12 @@ typedef struct BTMetaPageData
     BlockNumber btm_fastroot;   /* current "fast" root location */
     uint32      btm_fastlevel;  /* tree level of the "fast" root page */
     /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
-   TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted
-                                        * pages */
-   float8      btm_last_cleanup_num_heap_tuples;   /* number of heap tuples
-                                                    * during last cleanup */
+
+   /* number of deleted, non-recyclable pages during last cleanup */
+   uint32      btm_last_cleanup_num_delpages;
+   /* number of heap tuples during last cleanup */
+   float8      btm_last_cleanup_num_heap_tuples;
+
     bool        btm_allequalimage;  /* are all columns "equalimage"? */
  } BTMetaPageData;
  
@@ -220,6 +224,93 @@ typedef struct BTMetaPageData
  #define P_IGNORE(opaque)       (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
  #define P_HAS_GARBAGE(opaque)  (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
  #define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
+#define P_HAS_FULLXID(opaque)  (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+
+/*
+ * BTDeletedPageData is the page contents of a deleted page
+ */
+typedef struct BTDeletedPageData
+{
+   FullTransactionId safexid;  /* See BTPageIsRecyclable() */
+} BTDeletedPageData;
+
+static inline void
+BTPageSetDeleted(Page page, FullTransactionId safexid)
+{
+   BTPageOpaque opaque;
+   PageHeader  header;
+   BTDeletedPageData *contents;
+
+   opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+   header = ((PageHeader) page);
+
+   opaque->btpo_flags &= ~BTP_HALF_DEAD;
+   opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
+   header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
+       sizeof(BTDeletedPageData);
+   header->pd_upper = header->pd_special;
+
+   /* Set safexid in deleted page */
+   contents = ((BTDeletedPageData *) PageGetContents(page));
+   contents->safexid = safexid;
+}
+
+static inline FullTransactionId
+BTPageGetDeleteXid(Page page)
+{
+   BTPageOpaque opaque;
+   BTDeletedPageData *contents;
+
+   /* We only expect to be called with a deleted page */
+   Assert(!PageIsNew(page));
+   opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+   Assert(P_ISDELETED(opaque));
+
+   /* pg_upgrade'd deleted page -- must be safe to recycle now */
+   if (!P_HAS_FULLXID(opaque))
+       return FirstNormalFullTransactionId;
+
+   /* Get safexid from deleted page */
+   contents = ((BTDeletedPageData *) PageGetContents(page));
+   return contents->safexid;
+}
+
+/*
+ * Is an existing page recyclable?
+ *
+ * This exists to centralize the policy on which deleted pages are now safe to
+ * re-use.
+ *
+ * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
+ * them here (caller is responsible for that case themselves).  Caller might
+ * well need special handling for new pages anyway.
+ */
+static inline bool
+BTPageIsRecyclable(Page page)
+{
+   BTPageOpaque opaque;
+
+   Assert(!PageIsNew(page));
+
+   /* Recycling okay iff page is deleted and safexid is old enough */
+   opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+   if (P_ISDELETED(opaque))
+   {
+       /*
+        * The page was deleted, but when? If it was just deleted, a scan
+        * might have seen the downlink to it, and will read the page later.
+        * As long as that can happen, we must keep the deleted page around as
+        * a tombstone.
+        *
+        * For that check if the deletion XID could still be visible to
+        * anyone. If not, then no scan that's still in progress could have
+        * seen its downlink, and we can recycle it.
+        */
+       return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
+   }
+
+   return false;
+}
  
  /*
   * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
@@ -962,7 +1053,7 @@ typedef struct BTOptions
  {
     int32       varlena_header_;    /* varlena header (do not touch directly!) */
     int         fillfactor;     /* page fill factor in percent (0..100) */
-   /* fraction of newly inserted tuples prior to trigger index cleanup */
+   /* fraction of newly inserted tuples needed to trigger index cleanup */
     float8      vacuum_cleanup_index_scale_factor;
     bool        deduplicate_items;  /* Try to deduplicate items? */
  } BTOptions;
@@ -1066,8 +1157,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
   */
  extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
                              bool allequalimage);
-extern void _bt_update_meta_cleanup_info(Relation rel,
-                                        TransactionId oldestBtpoXact, float8 numHeapTuples);
+extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages,
+                                float8 num_heap_tuples);
  extern void _bt_upgrademetapage(Page page);
  extern Buffer _bt_getroot(Relation rel, int access);
  extern Buffer _bt_gettrueroot(Relation rel);
@@ -1084,15 +1175,13 @@ extern void _bt_unlockbuf(Relation rel, Buffer buf);
  extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
  extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
  extern void _bt_pageinit(Page page, Size size);
-extern bool _bt_page_recyclable(Page page);
  extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
                                 OffsetNumber *deletable, int ndeletable,
                                 BTVacuumPosting *updatable, int nupdatable);
  extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
                                       Relation heapRel,
                                       TM_IndexDeleteOp *delstate);
-extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf,
-                         TransactionId *oldestBtpoXact);
+extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf);
  
  /*
   * prototypes for functions in nbtsearch.c
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h

index 7ae5c98c2b8234a629fa8533a9c2476c4b1ec1d6..3df34fcda2d5394dcadb9ad6a75c9af7b4e97a50 100644 (file)
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -13,6 +13,7 @@
  #ifndef NBTXLOG_H
  #define NBTXLOG_H
  
+#include "access/transam.h"
  #include "access/xlogreader.h"
  #include "lib/stringinfo.h"
  #include "storage/off.h"
@@ -52,7 +53,7 @@ typedef struct xl_btree_metadata
     uint32      level;
     BlockNumber fastroot;
     uint32      fastlevel;
-   TransactionId oldest_btpo_xact;
+   uint32      last_cleanup_num_delpages;
     float8      last_cleanup_num_heap_tuples;
     bool        allequalimage;
  } xl_btree_metadata;
@@ -187,7 +188,7 @@ typedef struct xl_btree_reuse_page
  {
     RelFileNode node;
     BlockNumber block;
-   TransactionId latestRemovedXid;
+   FullTransactionId latestRemovedFullXid;
  } xl_btree_reuse_page;
  
  #define SizeOfBtreeReusePage   (sizeof(xl_btree_reuse_page))
@@ -282,9 +283,12 @@ typedef struct xl_btree_mark_page_halfdead
  #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
  
  /*
- * This is what we need to know about deletion of a btree page.  Note we do
- * not store any content for the deleted page --- it is just rewritten as empty
- * during recovery, apart from resetting the btpo.xact.
+ * This is what we need to know about deletion of a btree page.  Note that we
+ * only leave behind a small amount of bookkeeping information in deleted
+ * pages (deleted pages must be kept around as tombstones for a while).  It is
+ * convenient for the REDO routine to regenerate its target page from scratch.
+ * This is why the WAL record describes certain details that are actually
+ * directly available from the target page.
   *
   * Backup Blk 0: target block being deleted
   * Backup Blk 1: target block's left sibling, if any
@@ -296,20 +300,24 @@ typedef struct xl_btree_unlink_page
  {
     BlockNumber leftsib;        /* target block's left sibling, if any */
     BlockNumber rightsib;       /* target block's right sibling */
+   uint32      level;          /* target block's level */
+   FullTransactionId safexid;  /* target block's BTPageSetDeleted() XID */
  
     /*
-    * Information needed to recreate the leaf page, when target is an
-    * internal page.
+    * Information needed to recreate a half-dead leaf page with correct
+    * topparent link.  The fields are only used when deletion operation's
+    * target page is an internal page.  REDO routine creates half-dead page
+    * from scratch to keep things simple (this is the same convenient
+    * approach used for the target page itself).
      */
     BlockNumber leafleftsib;
     BlockNumber leafrightsib;
-   BlockNumber topparent;      /* next child down in the subtree */
+   BlockNumber leaftopparent;  /* next child down in the subtree */
  
-   TransactionId btpo_xact;    /* value of btpo.xact for use in recovery */
     /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
  } xl_btree_unlink_page;
  
-#define SizeOfBtreeUnlinkPage  (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
+#define SizeOfBtreeUnlinkPage  (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
  
  /*
   * New root log record.  There are zero tuples if this is to establish an
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h

index 224cae0246fe1e2720fbe302b86492aacf609f98..8d09eaec93d72274d0dd418b6ceb5bfee15245d6 100644 (file)
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
  /*
   * Each page of XLOG file has a header like this:
   */
-#define XLOG_PAGE_MAGIC 0xD109 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD10A /* can be used as WAL version indicator */
  
  typedef struct XLogPageHeaderData
  {
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h

index 94d33851d09e77b5c490b08d498bc2fa92ad0258..38fd85a4316c1db8d4a4c6ebcdcc643495206a14 100644 (file)
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -31,6 +31,8 @@ extern void ShutdownRecoveryTransactionEnvironment(void);
  
  extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid,
                                                 RelFileNode node);
+extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+                                                      RelFileNode node);
  extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
  extern void ResolveRecoveryConflictWithDatabase(Oid dbid);
author	Peter Geoghegan <pg@bowt.ie>
	Thu, 25 Feb 2021 02:41:34 +0000 (18:41 -0800)
committer	Peter Geoghegan <pg@bowt.ie>
	Thu, 25 Feb 2021 02:41:34 +0000 (18:41 -0800)
contrib/amcheck/verify_nbtree.c		patch \| blob \| blame \| history
contrib/pageinspect/btreefuncs.c		patch \| blob \| blame \| history
contrib/pageinspect/expected/btree.out		patch \| blob \| blame \| history
contrib/pageinspect/pageinspect--1.8--1.9.sql		patch \| blob \| blame \| history
contrib/pgstattuple/pgstatindex.c		patch \| blob \| blame \| history
doc/src/sgml/config.sgml		patch \| blob \| blame \| history
doc/src/sgml/pageinspect.sgml		patch \| blob \| blame \| history
src/backend/access/gist/gistxlog.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtinsert.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtpage.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtree.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtsearch.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtsort.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtxlog.c		patch \| blob \| blame \| history
src/backend/access/rmgrdesc/nbtdesc.c		patch \| blob \| blame \| history
src/backend/storage/ipc/standby.c		patch \| blob \| blame \| history
src/include/access/nbtree.h		patch \| blob \| blame \| history
src/include/access/nbtxlog.h		patch \| blob \| blame \| history
src/include/access/xlog_internal.h		patch \| blob \| blame \| history
src/include/storage/standby.h		patch \| blob \| blame \| history