bool tupleIsAlive, void *checkstate);
static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
IndexTuple itup);
+static inline IndexTuple bt_posting_plain_tuple(IndexTuple itup, int n);
static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup);
static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
OffsetNumber offset);
Page page, OffsetNumber offset);
static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state,
IndexTuple itup, bool nonpivot);
+static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup);
/*
* bt_index_check(index regclass, heapallindexed boolean)
if (btree_index_mainfork_expected(indrel))
{
- bool heapkeyspace;
+ bool heapkeyspace,
+ allequalimage;
RelationOpenSmgr(indrel);
if (!smgrexists(indrel->rd_smgr, MAIN_FORKNUM))
RelationGetRelationName(indrel))));
/* Check index, possibly against table it is an index on */
- heapkeyspace = _bt_heapkeyspace(indrel);
+ _bt_metaversion(indrel, &heapkeyspace, &allequalimage);
bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
heapallindexed, rootdescend);
}
/*
* Size Bloom filter based on estimated number of tuples in index,
* while conservatively assuming that each block must contain at least
- * MaxIndexTuplesPerPage / 5 non-pivot tuples. (Non-leaf pages cannot
- * contain non-pivot tuples. That's okay because they generally make
- * up no more than about 1% of all pages in the index.)
+ * MaxTIDsPerBTreePage / 3 "plain" tuples -- see
+ * bt_posting_plain_tuple() for definition, and details of how posting
+ * list tuples are handled.
*/
total_pages = RelationGetNumberOfBlocks(rel);
- total_elems = Max(total_pages * (MaxIndexTuplesPerPage / 5),
+ total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3),
(int64) state->rel->rd_rel->reltuples);
/* Random seed relies on backend srandom() call to avoid repetition */
seed = random();
size_t tupsize;
BTScanInsert skey;
bool lowersizelimit;
+ ItemPointer scantid;
CHECK_FOR_INTERRUPTS();
if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
offset))
{
+ ItemPointer tid;
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
+ tid = BTreeTupleGetPointsToTID(itup);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
/*
* Readonly callers may optionally verify that non-pivot tuples can
- * each be found by an independent search that starts from the root
+ * each be found by an independent search that starts from the root.
+ * Note that we deliberately don't do individual searches for each
+ * TID, since the posting list itself is validated by other checks.
*/
if (state->rootdescend && P_ISLEAF(topaque) &&
!bt_rootdescend(state, itup))
{
+ ItemPointer tid = BTreeTupleGetPointsToTID(itup);
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
- htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumber(&(itup->t_tid)),
- ItemPointerGetOffsetNumber(&(itup->t_tid)));
+ htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
(uint32) state->targetlsn)));
}
+ /*
+ * If tuple is a posting list tuple, make sure posting list TIDs are
+ * in order
+ */
+ if (BTreeTupleIsPosting(itup))
+ {
+ ItemPointerData last;
+ ItemPointer current;
+
+ ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last);
+
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ current = BTreeTupleGetPostingN(itup, i);
+
+ if (ItemPointerCompare(current, &last) <= 0)
+ {
+ char *itid = psprintf("(%u,%u)", state->targetblock, offset);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("posting list contains misplaced TID in index \"%s\"",
+ RelationGetRelationName(state->rel)),
+ errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.",
+ itid, i,
+ (uint32) (state->targetlsn >> 32),
+ (uint32) state->targetlsn)));
+ }
+
+ ItemPointerCopy(current, &last);
+ }
+ }
+
/* Build insertion scankey for current page offset */
skey = bt_mkscankey_pivotsearch(state->rel, itup);
if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) :
BTMaxItemSizeNoHeapTid(state->target)))
{
+ ItemPointer tid = BTreeTupleGetPointsToTID(itup);
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
{
IndexTuple norm;
- norm = bt_normalize_tuple(state, itup);
- bloom_add_element(state->filter, (unsigned char *) norm,
- IndexTupleSize(norm));
- /* Be tidy */
- if (norm != itup)
- pfree(norm);
+ if (BTreeTupleIsPosting(itup))
+ {
+ /* Fingerprint all elements as distinct "plain" tuples */
+ for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ IndexTuple logtuple;
+
+ logtuple = bt_posting_plain_tuple(itup, i);
+ norm = bt_normalize_tuple(state, logtuple);
+ bloom_add_element(state->filter, (unsigned char *) norm,
+ IndexTupleSize(norm));
+ /* Be tidy */
+ if (norm != logtuple)
+ pfree(norm);
+ pfree(logtuple);
+ }
+ }
+ else
+ {
+ norm = bt_normalize_tuple(state, itup);
+ bloom_add_element(state->filter, (unsigned char *) norm,
+ IndexTupleSize(norm));
+ /* Be tidy */
+ if (norm != itup)
+ pfree(norm);
+ }
}
/*
*
* If there is a high key (if this is not the rightmost page on its
* entire level), check that high key actually is upper bound on all
- * page items.
+ * page items. If this is a posting list tuple, we'll need to set
+ * scantid to the highest TID in the posting list.
*
* We prefer to check all items against high key rather than checking
* just the last and trusting that the operator class obeys the
* tuple. (See also: "Notes About Data Representation" in the nbtree
* README.)
*/
+ scantid = skey->scantid;
+ if (state->heapkeyspace && BTreeTupleIsPosting(itup))
+ skey->scantid = BTreeTupleGetMaxHeapTID(itup);
+
if (!P_RIGHTMOST(topaque) &&
!(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) :
invariant_l_offset(state, skey, P_HIKEY)))
{
+ ItemPointer tid = BTreeTupleGetPointsToTID(itup);
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
+ /* Reset, in case scantid was set to (itup) posting tuple's max TID */
+ skey->scantid = scantid;
/*
* * Item order check *
if (OffsetNumberNext(offset) <= max &&
!invariant_l_offset(state, skey, OffsetNumberNext(offset)))
{
+ ItemPointer tid;
char *itid,
*htid,
*nitid,
*nhtid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
+ tid = BTreeTupleGetPointsToTID(itup);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
nitid = psprintf("(%u,%u)", state->targetblock,
OffsetNumberNext(offset));
state->target,
OffsetNumberNext(offset));
itup = (IndexTuple) PageGetItem(state->target, itemid);
+ tid = BTreeTupleGetPointsToTID(itup);
nhtid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
* verification. In particular, it won't try to normalize opclass-equal
* datums with potentially distinct representations (e.g., btree/numeric_ops
* index datums will not get their display scale normalized-away here).
- * Normalization may need to be expanded to handle more cases in the future,
- * though. For example, it's possible that non-pivot tuples could in the
- * future have alternative logically equivalent representations due to using
- * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication.
+ * Caller is responsible for splitting posting list tuples into equivalent
+ * "plain" tuples before calling here (see bt_posting_plain_tuple()), since
+ * the dummy CREATE INDEX callback code only ever generates plain tuples
+ * with the same normalized representation.
*/
static IndexTuple
bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
IndexTuple reformed;
int i;
+ /* Caller should only pass "logical" non-pivot tuples here */
+ Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup));
+
/* Easy case: It's immediately clear that tuple has no varlena datums */
if (!IndexTupleHasVarwidths(itup))
return itup;
return reformed;
}
+/*
+ * Produce palloc()'d "plain" tuple for nth posting list entry/TID.
+ *
+ * In general, deduplication is not supposed to change the logical contents of
+ * an index. Multiple index tuples are merged together into one equivalent
+ * posting list index tuple when convenient.
+ *
+ * heapallindexed verification must normalize-away this variation in
+ * representation by converting posting list tuples into two or more "plain"
+ * tuples. Each tuple must be fingerprinted separately -- there must be one
+ * tuple for each corresponding Bloom filter probe during the heap scan.
+ *
+ * Note: Caller still needs to call bt_normalize_tuple() with returned tuple.
+ */
+static inline IndexTuple
+bt_posting_plain_tuple(IndexTuple itup, int n)
+{
+ Assert(BTreeTupleIsPosting(itup));
+
+ /* Returns non-posting-list tuple */
+ return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1);
+}
+
/*
* Search for itup in index, starting from fast root page. itup must be a
* non-pivot tuple. This is only supported with heapkeyspace indexes, since
insertstate.itup = itup;
insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
insertstate.itup_key = key;
+ insertstate.postingoff = 0;
insertstate.bounds_valid = false;
insertstate.buf = lbuf;
offnum = _bt_binsrch_insert(state->rel, &insertstate);
/* Compare first >= matching item on leaf page, if any */
page = BufferGetPage(lbuf);
+ /* Should match on first heap TID when tuple has a posting list */
if (offnum <= PageGetMaxOffsetNumber(page) &&
+ insertstate.postingoff <= 0 &&
_bt_compare(state->rel, key, page, offnum) == 0)
exists = true;
_bt_relbuf(state->rel, lbuf);
}
/*
- * BTreeTupleGetHeapTID() wrapper that lets caller enforce that a heap TID must
- * be present in cases where that is mandatory.
- *
- * This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK
- * bit is effectively a proxy for whether or not the tuple is a pivot tuple.
- * It may become more useful in the future, when non-pivot tuples support their
- * own alternative INDEX_ALT_TID_MASK representation.
+ * BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in
+ * cases where that is mandatory (i.e. for non-pivot tuples)
*/
static inline ItemPointer
BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup,
bool nonpivot)
{
- ItemPointer result = BTreeTupleGetHeapTID(itup);
- BlockNumber targetblock = state->targetblock;
+ ItemPointer htid;
+
+ /*
+ * Caller determines whether this is supposed to be a pivot or non-pivot
+ * tuple using page type and item offset number. Verify that tuple
+ * metadata agrees with this.
+ */
+ Assert(state->heapkeyspace);
+ if (BTreeTupleIsPivot(itup) && nonpivot)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple",
+ state->targetblock,
+ RelationGetRelationName(state->rel))));
- if (result == NULL && nonpivot)
+ if (!BTreeTupleIsPivot(itup) && !nonpivot)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple",
+ state->targetblock,
+ RelationGetRelationName(state->rel))));
+
+ htid = BTreeTupleGetHeapTID(itup);
+ if (!ItemPointerIsValid(htid) && nonpivot)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
- targetblock, RelationGetRelationName(state->rel))));
+ state->targetblock,
+ RelationGetRelationName(state->rel))));
+
+ return htid;
+}
+
+/*
+ * Return the "pointed to" TID for itup, which is used to generate a
+ * descriptive error message. itup must be a "data item" tuple (it wouldn't
+ * make much sense to call here with a high key tuple, since there won't be a
+ * valid downlink/block number to display).
+ *
+ * Returns either a heap TID (which will be the first heap TID in posting list
+ * if itup is posting list tuple), or a TID that contains downlink block
+ * number, plus some encoded metadata (e.g., the number of attributes present
+ * in itup).
+ */
+static inline ItemPointer
+BTreeTupleGetPointsToTID(IndexTuple itup)
+{
+ /*
+ * Rely on the assumption that !heapkeyspace internal page data items will
+ * correctly return TID with downlink here -- BTreeTupleGetHeapTID() won't
+ * recognize it as a pivot tuple, but everything still works out because
+ * the t_tid field is still returned
+ */
+ if (!BTreeTupleIsPivot(itup))
+ return BTreeTupleGetHeapTID(itup);
- return result;
+ /* Pivot tuple returns TID with downlink block (heapkeyspace variant) */
+ return &itup->t_tid;
}
<sect1 id="btree-implementation">
<title>Implementation</title>
+ <para>
+ This section covers B-Tree index implementation details that may be
+ of use to advanced users. See
+ <filename>src/backend/access/nbtree/README</filename> in the source
+ distribution for a much more detailed, internals-focused description
+ of the B-Tree implementation.
+ </para>
+ <sect2 id="btree-structure">
+ <title>B-Tree Structure</title>
+ <para>
+ <productname>PostgreSQL</productname> B-Tree indexes are
+ multi-level tree structures, where each level of the tree can be
+ used as a doubly-linked list of pages. A single metapage is stored
+ in a fixed position at the start of the first segment file of the
+ index. All other pages are either leaf pages or internal pages.
+ Leaf pages are the pages on the lowest level of the tree. All
+ other levels consist of internal pages. Each leaf page contains
+ tuples that point to table rows. Each internal page contains
+ tuples that point to the next level down in the tree. Typically,
+ over 99% of all pages are leaf pages. Both internal pages and leaf
+ pages use the standard page format described in <xref
+ linkend="storage-page-layout"/>.
+ </para>
+ <para>
+ New leaf pages are added to a B-Tree index when an existing leaf
+ page cannot fit an incoming tuple. A <firstterm>page
+ split</firstterm> operation makes room for items that originally
+ belonged on the overflowing page by moving a portion of the items
+ to a new page. Page splits must also insert a new
+ <firstterm>downlink</firstterm> to the new page in the parent page,
+ which may cause the parent to split in turn. Page splits
+ <quote>cascade upwards</quote> in a recursive fashion. When the
+ root page finally cannot fit a new downlink, a <firstterm>root page
+ split</firstterm> operation takes place. This adds a new level to
+ the tree structure by creating a new root page that is one level
+ above the original root page.
+ </para>
+ </sect2>
+
+ <sect2 id="btree-deduplication">
+ <title>Deduplication</title>
+ <para>
+ A duplicate is a leaf page tuple (a tuple that points to a table
+ row) where <emphasis>all</emphasis> indexed key columns have values
+ that match corresponding column values from at least one other leaf
+ page tuple that's close by in the same index. Duplicate tuples are
+ quite common in practice. B-Tree indexes can use a special,
+ space-efficient representation for duplicates when an optional
+ technique is enabled: <firstterm>deduplication</firstterm>.
+ </para>
+ <para>
+ Deduplication works by periodically merging groups of duplicate
+ tuples together, forming a single posting list tuple for each
+ group. The column key value(s) only appear once in this
+ representation. This is followed by a sorted array of
+ <acronym>TID</acronym>s that point to rows in the table. This
+ significantly reduces the storage size of indexes where each value
+ (or each distinct combination of column values) appears several
+ times on average. Deduplication can also significantly reduce query
+ latency, increase overall query throughput, and lower the overhead of
+ routine index vacuuming.
+ </para>
+ <note>
+ <para>
+ While NULL is generally not considered to be equal to any other
+ value, including NULL, NULL is nevertheless treated as just
+ another value from the domain of indexed values by the B-Tree
+ implementation (except when enforcing uniqueness in a unique
+ index). B-Tree deduplication is therefore just as effective with
+ <quote>duplicates</quote> that contain a NULL value.
+ </para>
+ </note>
+ <para>
+ The deduplication process occurs lazily, when a new item is
+ inserted that cannot fit on an existing leaf page. This prevents
+ (or at least delays) leaf page splits. Unlike GIN posting list
+ tuples, B-Tree posting list tuples do not need to expand every time
+ a new duplicate is inserted; they are merely an alternative
+ physical representation of the original logical contents of the
+ leaf page. This design prioritizes consistent performance with
+ mixed read-write workloads. Most client applications will at least
+ see a moderate performance benefit from using deduplication.
+ Deduplication is enabled by default.
+ </para>
+ <para>
+ Write-heavy workloads that don't benefit from deduplication due to
+ having few or no duplicate values in indexes will incur a small,
+ fixed performance penalty (unless deduplication is explicitly
+ disabled). The <literal>deduplicate_items</literal> storage
+ parameter can be used to disable deduplication within individual
+ indexes. There is never any performance penalty with read-only
+ workloads, since reading posting list tuples is at least as
+ efficient as reading the standard tuple representation. Disabling
+ deduplication isn't usually helpful.
+ </para>
+ <para>
+ B-Tree indexes are not directly aware that under MVCC, there might
+ be multiple extant versions of the same logical table row; to an
+ index, each tuple is an independent object that needs its own index
+ entry. Thus, an update of a row always creates all-new index
+ entries for the row, even if the key values did not change. Some
+ workloads suffer from index bloat caused by these
+ implementation-level version duplicates (this is typically a
+ problem for <command>UPDATE</command>-heavy workloads that cannot
+ apply the <acronym>HOT</acronym> optimization due to modifying at
+ least one indexed column). B-Tree deduplication does not
+ distinguish between these implementation-level version duplicates
+ and conventional duplicates. Deduplication can nevertheless help
+ with controlling index bloat caused by implementation-level version
+ churn.
+ </para>
+ <tip>
+ <para>
+ A special heuristic is applied to determine whether a
+ deduplication pass in a unique index should take place. This often
+ lets the insertion skip straight to splitting a leaf page, avoiding a
+ performance penalty from wasting cycles on unhelpful deduplication
+ passes. If you're concerned about the overhead of deduplication,
+ consider setting <literal>deduplicate_items = off</literal>
+ selectively. Leaving deduplication enabled in unique indexes has
+ little downside.
+ </para>
+ </tip>
+ <para>
+ Deduplication cannot be used in all cases due to
+ implementation-level restrictions. Deduplication safety is
+ determined when <command>CREATE INDEX</command> or
+ <command>REINDEX</command> run.
+ </para>
+ <para>
+ Note that deduplication is deemed unsafe and cannot be used in the
+ following cases involving semantically significant differences
+ among equal datums:
+ </para>
+ <para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ <type>text</type>, <type>varchar</type>, and <type>char</type>
+ cannot use deduplication when a
+ <emphasis>nondeterministic</emphasis> collation is used. Case
+ and accent differences must be preserved among equal datums.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <type>numeric</type> cannot use deduplication. Numeric display
+ scale must be preserved among equal datums.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <type>jsonb</type> cannot use deduplication, since the
+ <type>jsonb</type> B-Tree operator class uses
+ <type>numeric</type> internally.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <type>float4</type> and <type>float8</type> cannot use
+ deduplication. These types have distinct representations for
+ <literal>-0</literal> and <literal>0</literal>, which are
+ nevertheless considered equal. This difference must be
+ preserved.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ There is one further implementation-level restriction that may be
+ lifted in a future version of
+ <productname>PostgreSQL</productname>:
+ </para>
+ <para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ Container types (such as composite types, arrays, or range
+ types) cannot use deduplication.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ There is one further implementation-level restriction that applies
+ regardless of the operator class or collation used:
+ </para>
<para>
- An introduction to the btree index implementation can be found in
- <filename>src/backend/access/nbtree/README</filename>.
+ <itemizedlist>
+ <listitem>
+ <para>
+ <literal>INCLUDE</literal> indexes can never use deduplication.
+ </para>
+ </listitem>
+ </itemizedlist>
</para>
+ </sect2>
</sect1>
</chapter>
nondeterministic collations give a more <quote>correct</quote> behavior,
especially when considering the full power of Unicode and its many
special cases, they also have some drawbacks. Foremost, their use leads
- to a performance penalty. Also, certain operations are not possible with
- nondeterministic collations, such as pattern matching operations.
- Therefore, they should be used only in cases where they are specifically
- wanted.
+ to a performance penalty. Note, in particular, that B-tree cannot use
+ deduplication with indexes that use a nondeterministic collation. Also,
+ certain operations are not possible with nondeterministic collations,
+ such as pattern matching operations. Therefore, they should be used
+ only in cases where they are specifically wanted.
</para>
</sect3>
</sect2>
<para>
<type>citext</type> is not as efficient as <type>text</type> because the
operator functions and the B-tree comparison functions must make copies
- of the data and convert it to lower case for comparisons. It is,
- however, slightly more efficient than using <function>lower</function> to get
- case-insensitive matching.
+ of the data and convert it to lower case for comparisons. Also, only
+ <type>text</type> can support B-Tree deduplication. However,
+ <type>citext</type> is slightly more efficient than using
+ <function>lower</function> to get case-insensitive matching.
</para>
</listitem>
rows. Two rows might have a different binary representation even
though comparisons of the two rows with the equality operator is true.
The ordering of rows under these comparison operators is deterministic
- but not otherwise meaningful. These operators are used internally for
- materialized views and might be useful for other specialized purposes
- such as replication but are not intended to be generally useful for
- writing queries.
+ but not otherwise meaningful. These operators are used internally
+ for materialized views and might be useful for other specialized
+ purposes such as replication and B-Tree deduplication (see <xref
+ linkend="btree-deduplication"/>). They are not intended to be
+ generally useful for writing queries, though.
</para>
</sect2>
</sect1>
maximum size allowed for the index type, data insertion will fail.
In any case, non-key columns duplicate data from the index's table
and bloat the size of the index, thus potentially slowing searches.
+ Furthermore, B-tree deduplication is never used with indexes
+ that have a non-key column.
</para>
<para>
</variablelist>
<para>
- B-tree indexes additionally accept this parameter:
+ B-tree indexes also accept these parameters:
</para>
<variablelist>
+ <varlistentry id="index-reloption-deduplication" xreflabel="deduplicate_items">
+ <term><literal>deduplicate_items</literal>
+ <indexterm>
+ <primary><varname>deduplicate_items</varname></primary>
+ <secondary>storage parameter</secondary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Controls usage of the B-tree deduplication technique described
+ in <xref linkend="btree-deduplication"/>. Set to
+ <literal>ON</literal> or <literal>OFF</literal> to enable or
+ disable the optimization. (Alternative spellings of
+ <literal>ON</literal> and <literal>OFF</literal> are allowed as
+ described in <xref linkend="config-setting"/>.) The default is
+ <literal>ON</literal>.
+ </para>
+
+ <note>
+ <para>
+ Turning <literal>deduplicate_items</literal> off via
+ <command>ALTER INDEX</command> prevents future insertions from
+ triggering deduplication, but does not in itself make existing
+ posting list tuples use the standard tuple representation.
+ </para>
+ </note>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="index-reloption-vacuum-cleanup-index-scale-factor" xreflabel="vacuum_cleanup_index_scale_factor">
<term><literal>vacuum_cleanup_index_scale_factor</literal>
<indexterm>
This setting controls usage of the fast update technique described in
<xref linkend="gin-fast-update"/>. It is a Boolean parameter:
<literal>ON</literal> enables fast update, <literal>OFF</literal> disables it.
- (Alternative spellings of <literal>ON</literal> and <literal>OFF</literal> are
- allowed as described in <xref linkend="config-setting"/>.) The
- default is <literal>ON</literal>.
+ The default is <literal>ON</literal>.
</para>
<note>
</programlisting>
</para>
+ <para>
+ To create a B-Tree index with deduplication disabled:
+<programlisting>
+CREATE INDEX title_idx ON films (title) WITH (deduplicate_items = off);
+</programlisting>
+ </para>
+
<para>
To create an index on the expression <literal>lower(title)</literal>,
allowing efficient case-insensitive searches:
},
true
},
+ {
+ {
+ "deduplicate_items",
+ "Enables \"deduplicate items\" feature for this btree index",
+ RELOPT_KIND_BTREE,
+ ShareUpdateExclusiveLock /* since it applies only to later
+ * inserts */
+ },
+ true
+ },
/* list terminator */
{{NULL}}
};
/*
* Get the latestRemovedXid from the table entries pointed at by the index
* tuples being deleted.
+ *
+ * Note: index access methods that don't consistently use the standard
+ * IndexTuple + heap TID item pointer representation will need to provide
+ * their own version of this function.
*/
TransactionId
index_compute_xid_horizon_for_tuples(Relation irel,
OBJS = \
nbtcompare.o \
+ nbtdedup.o \
nbtinsert.o \
nbtpage.o \
nbtree.o \
like a hint bit for a heap tuple), but physically removing tuples requires
exclusive lock. In the current code we try to remove LP_DEAD tuples when
we are otherwise faced with having to split a page to do an insertion (and
-hence have exclusive lock on it already).
+hence have exclusive lock on it already). Deduplication can also prevent
+a page split, but removing LP_DEAD tuples is the preferred approach.
+(Note that posting list tuples can only have their LP_DEAD bit set when
+every table TID within the posting list is known dead.)
This leaves the index in a state where it has no entry for a dead tuple
that still exists in the heap. This is not a problem for the current
the fallback strategy assumes that duplicates are mostly inserted in
ascending heap TID order. The page is split in a way that leaves the left
half of the page mostly full, and the right half of the page mostly empty.
+The overall effect is that leaf page splits gracefully adapt to inserts of
+large groups of duplicates, maximizing space utilization. Note also that
+"trapping" large groups of duplicates on the same leaf page like this makes
+deduplication more efficient. Deduplication can be performed infrequently,
+without merging together existing posting list tuples too often.
+
+Notes about deduplication
+-------------------------
+
+We deduplicate non-pivot tuples in non-unique indexes to reduce storage
+overhead, and to avoid (or at least delay) page splits. Note that the
+goals for deduplication in unique indexes are rather different; see later
+section for details. Deduplication alters the physical representation of
+tuples without changing the logical contents of the index, and without
+adding overhead to read queries. Non-pivot tuples are merged together
+into a single physical tuple with a posting list (a simple array of heap
+TIDs with the standard item pointer format). Deduplication is always
+applied lazily, at the point where it would otherwise be necessary to
+perform a page split. It occurs only after any LP_DEAD items have been
+removed, as our last line of defense against splitting a leaf page. We
+can set the LP_DEAD bit with posting list tuples, though only when all
+TIDs are known dead.
+
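+As a rough sketch of how this representation is read back (not code from
+this patch -- the helper name is invented, and it relies only on existing
+nbtree.h tuple accessors such as BTreeTupleIsPosting() and
+BTreeTupleGetPostingN()), plain and posting list tuples can be handled
+uniformly:
+
+ /*
+ * Illustrative only: return the nth heap TID stored in a non-pivot
+ * tuple, treating a plain tuple as a "posting list" of length one.
+ */
+ static ItemPointer
+ nth_heap_tid(IndexTuple itup, int n)
+ {
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (BTreeTupleIsPosting(itup))
+ {
+ Assert(n >= 0 && n < BTreeTupleGetNPosting(itup));
+ return BTreeTupleGetPostingN(itup, n);
+ }
+
+ /* Plain non-pivot tuple: its single heap TID is stored in t_tid */
+ Assert(n == 0);
+ return &itup->t_tid;
+ }
+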
+Our lazy approach to deduplication allows the page space accounting used
+during page splits to have absolutely minimal special case logic for
+posting lists. Posting lists can be thought of as extra payload that
+suffix truncation will reliably truncate away as needed during page
+splits, just like non-key columns from an INCLUDE index tuple.
+Incoming/new tuples can generally be treated as non-overlapping plain
+items (though see section on posting list splits for information about how
+overlapping new/incoming items are really handled).
+
+The representation of posting lists is almost identical to the posting
+lists used by GIN, so it would be straightforward to apply GIN's varbyte
+encoding compression scheme to individual posting lists. Posting list
+compression would break the assumptions made by posting list splits about
+page space accounting (see later section), so it's not clear how
+compression could be integrated with nbtree. Besides, posting list
+compression does not offer a compelling trade-off for nbtree, since in
+general nbtree is optimized for consistent performance with many
+concurrent readers and writers.
+
+A major goal of our lazy approach to deduplication is to limit the
+performance impact of deduplication with random updates. Even concurrent
+append-only inserts of the same key value will tend to have inserts of
+individual index tuples in an order that doesn't quite match heap TID
+order. Delaying deduplication minimizes page level fragmentation.
+
+Deduplication in unique indexes
+-------------------------------
+
+Very often, the range of values that can be placed on a given leaf page in
+a unique index is fixed and permanent. For example, a primary key on an
+identity column will usually only have page splits caused by the insertion
+of new logical rows within the rightmost leaf page. If there is a split
+of a non-rightmost leaf page, then the split must have been triggered by
+inserts associated with an UPDATE of an existing logical row. Splitting a
+leaf page purely to store multiple versions should be considered
+pathological, since it permanently degrades the index structure in order
+to absorb a temporary burst of duplicates. Deduplication in unique
+indexes helps to prevent these pathological page splits. Storing
+duplicates in a space efficient manner is not the goal, since in the long
+run there won't be any duplicates anyway. Rather, we're buying time for
+standard garbage collection mechanisms to run before a page split is
+needed.
+
+Unique index leaf pages only get a deduplication pass when an insertion
+(that might have to split the page) observed an existing duplicate on the
+page in passing. This is based on the assumption that deduplication will
+only work out when _all_ new insertions are duplicates from UPDATEs. This
+may mean that we miss an opportunity to delay a page split, but that's
+okay because our ultimate goal is to delay leaf page splits _indefinitely_
+(i.e. to prevent them altogether). There is little point in trying to
+delay a split that is probably inevitable anyway. This allows us to avoid
+the overhead of attempting to deduplicate with unique indexes that always
+have few or no duplicates.
+
+Posting list splits
+-------------------
+
+When the incoming tuple happens to overlap with an existing posting list,
+a posting list split is performed. Like a page split, a posting list
+split resolves a situation where a new/incoming item "won't fit", while
+inserting the incoming item in passing (i.e. as part of the same atomic
+action). It's possible (though not particularly likely) that an insert of
+a new item on to an almost-full page will overlap with a posting list,
+resulting in both a posting list split and a page split. Even then, the
+atomic action that splits the posting list also inserts the new item
+(since page splits always insert the new item in passing). Including the
+posting list split in the same atomic action as the insert avoids problems
+caused by concurrent inserts into the same posting list -- the exact
+details of how we change the posting list depend upon the new item, and
+vice-versa. A single atomic action also minimizes the volume of extra
+WAL required for a posting list split, since we don't have to explicitly
+WAL-log the original posting list tuple.
+
+Despite piggy-backing on the same atomic action that inserts a new tuple,
+posting list splits can be thought of as a separate, extra action to the
+insert itself (or to the page split itself). Posting list splits
+conceptually "rewrite" an insert that overlaps with an existing posting
+list into an insert that adds its final new item just to the right of the
+posting list instead. The size of the posting list won't change, and so
+page space accounting code does not need to care about posting list splits
+at all. This is an important upside of our design; the page split point
+choice logic is very subtle even without it needing to deal with posting
+list splits.
+
+Only a few isolated extra steps are required to preserve the illusion that
+the new item never overlapped with an existing posting list in the first
+place: the heap TID of the incoming tuple is swapped with the rightmost/max
+heap TID from the existing/originally overlapping posting list. Also, the
+posting-split-with-page-split case must generate a new high key based on
+an imaginary version of the original page that has both the final new item
+and the after-list-split posting tuple (page splits usually just operate
+against an imaginary version that contains the new item/item that won't
+fit).
+
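+As a conceptual sketch only (this is not the actual nbtinsert.c code; the
+function name is invented, and the real code works on a palloc()'d copy of
+the overlapping posting list tuple), the TID swap amounts to the following:
+
+ /*
+ * Conceptual posting list split: give the posting list's rightmost heap
+ * TID to the incoming tuple, and place the incoming tuple's original
+ * heap TID at its sorted position within the posting list ("postingoff",
+ * as determined by binary search).
+ */
+ static void
+ conceptual_posting_split(IndexTuple posting, IndexTuple newitem,
+ int postingoff)
+ {
+ ItemPointer htids = BTreeTupleGetPosting(posting);
+ int nhtids = BTreeTupleGetNPosting(posting);
+ ItemPointerData origtid;
+
+ /* remember the incoming tuple's TID; it belongs at postingoff */
+ ItemPointerCopy(&newitem->t_tid, &origtid);
+ /* incoming tuple takes over the posting list's rightmost TID */
+ ItemPointerCopy(&htids[nhtids - 1], &newitem->t_tid);
+ /* shift TIDs right to open the postingoff slot (old max is dropped) */
+ memmove(htids + postingoff + 1, htids + postingoff,
+ sizeof(ItemPointerData) * (nhtids - postingoff - 1));
+ ItemPointerCopy(&origtid, &htids[postingoff]);
+ }
+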
+This approach avoids inventing an "eager" atomic posting split operation
+that splits the posting list without simultaneously finishing the insert
+of the incoming item. This alternative design might seem cleaner, but it
+creates subtle problems for page space accounting. In general, there
+might not be enough free space on the page to split a posting list such
+that the incoming/new item no longer overlaps with either posting list
+half --- the operation could fail before the actual retail insert of the
+new item even begins. We'd end up having to handle posting list splits
+that need a page split anyway. Besides, supporting variable "split points"
+while splitting posting lists won't actually improve overall space
+utilization.
Notes About Data Representation
-------------------------------
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * nbtdedup.c
+ * Deduplicate items in Postgres btrees.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtdedup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "miscadmin.h"
+#include "utils/rel.h"
+
+static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
+ OffsetNumber minoff, IndexTuple newitem);
+static void _bt_singleval_fillfactor(Page page, BTDedupState state,
+ Size newitemsz);
+#ifdef USE_ASSERT_CHECKING
+static bool _bt_posting_valid(IndexTuple posting);
+#endif
+
+/*
+ * Deduplicate items on a leaf page. The page will have to be split by caller
+ * if we cannot successfully free at least newitemsz (we also need space for
+ * newitem's line pointer, which isn't included in caller's newitemsz).
+ *
+ * The general approach taken here is to perform as much deduplication as
+ * possible to free as much space as possible. Note, however, that "single
+ * value" strategy is sometimes used for !checkingunique callers, in which
+ * case deduplication will leave a few tuples untouched at the end of the
+ * page. The general idea is to prepare the page for an anticipated page
+ * split that uses nbtsplitloc.c's "single value" strategy to determine a
+ * split point. (There is no reason to deduplicate items that will end up on
+ * the right half of the page after the anticipated page split; better to
+ * handle those if and when the anticipated right half page gets its own
+ * deduplication pass, following further inserts of duplicates.)
+ *
+ * This function should be called during insertion, when the page doesn't have
+ * enough space to fit an incoming newitem. If the BTP_HAS_GARBAGE page flag
+ * was set, caller should have removed any LP_DEAD items by calling
+ * _bt_vacuum_one_page() before calling here. We may still have to kill
+ * LP_DEAD items here when the page's BTP_HAS_GARBAGE hint is falsely unset,
+ * but that should be rare. Also, _bt_vacuum_one_page() won't unset the
+ * BTP_HAS_GARBAGE flag when it finds no LP_DEAD items, so a successful
+ * deduplication pass will always clear it, just to keep things tidy.
+ */
+void
+_bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
+ IndexTuple newitem, Size newitemsz, bool checkingunique)
+{
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque;
+ Page newpage;
+ int newpagendataitems = 0;
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ BTDedupState state;
+ int ndeletable = 0;
+ Size pagesaving = 0;
+ bool singlevalstrat = false;
+ int natts = IndexRelationGetNumberOfAttributes(rel);
+
+ /*
+ * We can't assume that there are no LP_DEAD items. For one thing, VACUUM
+ * will clear the BTP_HAS_GARBAGE hint without reliably removing items
+ * that are marked LP_DEAD. We don't want to unnecessarily unset LP_DEAD
+ * bits when deduplicating items. Allowing it would be correct, though
+ * wasteful.
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemid))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ {
+ _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
+
+ /*
+ * Return when a split will be avoided. This is equivalent to
+ * avoiding a split using the usual _bt_vacuum_one_page() path.
+ */
+ if (PageGetFreeSpace(page) >= newitemsz)
+ return;
+
+ /*
+ * Reconsider number of items on page, in case _bt_delitems_delete()
+ * managed to delete an item or two
+ */
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ }
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+
+ /*
+ * By here, it's clear that deduplication will definitely be attempted.
+ * Initialize deduplication state.
+ *
+ * It would be possible for maxpostingsize (limit on posting list tuple
+ * size) to be set to one third of the page. However, it seems like a
+ * good idea to limit the size of posting lists to one sixth of a page.
+ * That ought to leave us with a good split point when pages full of
+ * duplicates can be split several times.
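+ * (BTMaxItemSize() works out to about one third of the page, so halving
+ * it lands close to the one sixth target.)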
+ */
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true;
+ state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
+ /* Metadata about base tuple of current pending posting list */
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ /* Metadata about current pending posting list TIDs */
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ /* Size of all physical tuples to be replaced by pending posting list */
+ state->phystupsize = 0;
+ /* nintervals should be initialized to zero */
+ state->nintervals = 0;
+
+ /* Determine if "single value" strategy should be used */
+ if (!checkingunique)
+ singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
+
+ /*
+ * Deduplicate items from page, and write them to newpage.
+ *
+ * Copy the original page's LSN into newpage copy. This will become the
+ * updated version of the page. We need this because XLogInsert will
+ * examine the LSN and possibly dump it in a page image.
+ */
+ newpage = PageGetTempPageCopySpecial(page);
+ PageSetLSN(newpage, PageGetLSN(page));
+
+ /* Copy high key, if any */
+ if (!P_RIGHTMOST(opaque))
+ {
+ ItemId hitemid = PageGetItemId(page, P_HIKEY);
+ Size hitemsz = ItemIdGetLength(hitemid);
+ IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
+
+ if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add highkey");
+ }
+
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(!ItemIdIsDead(itemid));
+
+ if (offnum == minoff)
+ {
+ /*
+ * No previous/base tuple for the data item -- use the data item
+ * as base tuple of pending posting list
+ */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ else if (state->deduplicate &&
+ _bt_keep_natts_fast(rel, state->base, itup) > natts &&
+ _bt_dedup_save_htid(state, itup))
+ {
+ /*
+ * Tuple is equal to base tuple of pending posting list. Heap
+ * TID(s) for itup have been saved in state.
+ */
+ }
+ else
+ {
+ /*
+ * Tuple is not equal to pending posting list tuple, or
+ * _bt_dedup_save_htid() opted to not merge current item into
+ * pending posting list for some other reason (e.g., adding more
+ * TIDs would have caused posting list to exceed current
+ * maxpostingsize).
+ *
+ * If state contains pending posting list with more than one item,
+ * form new posting tuple, and actually update the page. Else
+ * reset the state and move on without modifying the page.
+ */
+ pagesaving += _bt_dedup_finish_pending(newpage, state);
+ newpagendataitems++;
+
+ if (singlevalstrat)
+ {
+ /*
+ * Single value strategy's extra steps.
+ *
+ * Lower maxpostingsize for sixth and final item that might be
+ * deduplicated by current deduplication pass. When sixth
+ * item formed/observed, stop deduplicating items.
+ *
+ * Note: It's possible that this will be reached even when
+ * current deduplication pass has yet to merge together some
+ * existing items. It doesn't matter whether or not the
+ * current call generated the maxpostingsize-capped duplicate
+ * tuples at the start of the page.
+ */
+ if (newpagendataitems == 5)
+ _bt_singleval_fillfactor(page, state, newitemsz);
+ else if (newpagendataitems == 6)
+ {
+ state->deduplicate = false;
+ singlevalstrat = false; /* won't be back here */
+ }
+ }
+
+ /* itup starts new pending posting list */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+
+ /* Handle the last item */
+ pagesaving += _bt_dedup_finish_pending(newpage, state);
+ newpagendataitems++;
+
+ /*
+ * If no items suitable for deduplication were found, newpage must be
+ * exactly the same as the original page, so just return from function.
+ *
+ * We could determine whether or not to proceed on the basis the space
+ * savings being sufficient to avoid an immediate page split instead. We
+ * don't do that because there is some small value in nbtsplitloc.c always
+ * operating against a page that is fully deduplicated (apart from
+ * newitem). Besides, most of the cost has already been paid.
+ */
+ if (state->nintervals == 0)
+ {
+ /* cannot leak memory here */
+ pfree(newpage);
+ pfree(state->htids);
+ pfree(state);
+ return;
+ }
+
+ /*
+ * By here, it's clear that deduplication will definitely go ahead.
+ *
+ * Clear the BTP_HAS_GARBAGE page flag in the unlikely event that it is
+ * still falsely set, just to keep things tidy. (We can't rely on
+ * _bt_vacuum_one_page() having done this already, and we can't rely on a
+ * page split or VACUUM getting to it in the near future.)
+ */
+ if (P_HAS_GARBAGE(opaque))
+ {
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
+
+ nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+ }
+
+ START_CRIT_SECTION();
+
+ PageRestoreTempPage(newpage, page);
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+ xl_btree_dedup xlrec_dedup;
+
+ xlrec_dedup.nintervals = state->nintervals;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
+
+ /*
+ * The intervals array is not in the buffer, but pretend that it is.
+ * When XLogInsert stores the whole buffer, the array need not be
+ * stored too.
+ */
+ XLogRegisterBufData(0, (char *) state->intervals,
+ state->nintervals * sizeof(BTDedupInterval));
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Local space accounting should agree with page accounting */
+ Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
+
+ /* cannot leak memory here */
+ pfree(state->htids);
+ pfree(state);
+}
+
+/*
+ * Create a new pending posting list tuple based on caller's base tuple.
+ *
+ * Every tuple processed by deduplication either becomes the base tuple for a
+ * posting list, or gets its heap TID(s) accepted into a pending posting list.
+ * A tuple that starts out as the base tuple for a posting list will only
+ * actually be rewritten within _bt_dedup_finish_pending() when it turns out
+ * that there are duplicates that can be merged into the base tuple.
+ */
+void
+_bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+ OffsetNumber baseoff)
+{
+ Assert(state->nhtids == 0);
+ Assert(state->nitems == 0);
+ Assert(!BTreeTupleIsPivot(base));
+
+ /*
+ * Copy heap TID(s) from new base tuple for new candidate posting list
+ * into working state's array
+ */
+ if (!BTreeTupleIsPosting(base))
+ {
+ memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
+ state->nhtids = 1;
+ state->basetupsize = IndexTupleSize(base);
+ }
+ else
+ {
+ int nposting;
+
+ nposting = BTreeTupleGetNPosting(base);
+ memcpy(state->htids, BTreeTupleGetPosting(base),
+ sizeof(ItemPointerData) * nposting);
+ state->nhtids = nposting;
+ /* basetupsize should not include existing posting list */
+ state->basetupsize = BTreeTupleGetPostingOffset(base);
+ }
+
+ /*
+ * Save new base tuple itself -- it'll be needed if we actually create a
+ * new posting list from new pending posting list.
+ *
+ * Must maintain physical size of all existing tuples (including line
+ * pointer overhead) so that we can calculate space savings on page.
+ */
+ state->nitems = 1;
+ state->base = base;
+ state->baseoff = baseoff;
+ state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
+ /* Also save baseoff in pending state for interval */
+ state->intervals[state->nintervals].baseoff = state->baseoff;
+}
+
+/*
+ * Save itup heap TID(s) into pending posting list where possible.
+ *
+ * Returns bool indicating if the pending posting list managed by state now
+ * includes itup's heap TID(s).
+ */
+bool
+_bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
+{
+ int nhtids;
+ ItemPointer htids;
+ Size mergedtupsz;
+
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ nhtids = 1;
+ htids = &itup->t_tid;
+ }
+ else
+ {
+ nhtids = BTreeTupleGetNPosting(itup);
+ htids = BTreeTupleGetPosting(itup);
+ }
+
+ /*
+ * Don't append (have caller finish pending posting list as-is) if
+ * appending heap TID(s) from itup would put us over maxpostingsize limit.
+ *
+ * This calculation needs to match the code used within _bt_form_posting()
+ * for new posting list tuples.
+ */
+ mergedtupsz = MAXALIGN(state->basetupsize +
+ (state->nhtids + nhtids) * sizeof(ItemPointerData));
+
+ if (mergedtupsz > state->maxpostingsize)
+ return false;
+
+ /*
+ * Save heap TIDs to pending posting list tuple -- itup can be merged into
+ * pending posting list
+ */
+ state->nitems++;
+ memcpy(state->htids + state->nhtids, htids,
+ sizeof(ItemPointerData) * nhtids);
+ state->nhtids += nhtids;
+ state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+
+ return true;
+}
+
+/*
+ * Finalize pending posting list tuple, and add it to the page. Final tuple
+ * is based on saved base tuple, and saved list of heap TIDs.
+ *
+ * Returns space saving from deduplicating to make a new posting list tuple.
+ * Note that this includes line pointer overhead. This is zero in the case
+ * where no deduplication was possible.
+ */
+Size
+_bt_dedup_finish_pending(Page newpage, BTDedupState state)
+{
+ OffsetNumber tupoff;
+ Size tuplesz;
+ Size spacesaving;
+
+ Assert(state->nitems > 0);
+ Assert(state->nitems <= state->nhtids);
+ Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
+
+ tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
+ if (state->nitems == 1)
+ {
+ /* Use original, unchanged base tuple */
+ tuplesz = IndexTupleSize(state->base);
+ if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add tuple to page");
+
+ spacesaving = 0;
+ }
+ else
+ {
+ IndexTuple final;
+
+ /* Form a tuple with a posting list */
+ final = _bt_form_posting(state->base, state->htids, state->nhtids);
+ tuplesz = IndexTupleSize(final);
+ Assert(tuplesz <= state->maxpostingsize);
+
+ /* Save final number of items for posting list */
+ state->intervals[state->nintervals].nitems = state->nitems;
+
+ Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
+ if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
+ false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add tuple to page");
+
+ pfree(final);
+ spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
+ /* Increment nintervals, since we wrote a new posting list tuple */
+ state->nintervals++;
+ Assert(spacesaving > 0 && spacesaving < BLCKSZ);
+ }
+
+ /* Reset state for next pending posting list */
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+
+ return spacesaving;
+}
+
+/*
+ * Determine if page non-pivot tuples (data items) are all duplicates of the
+ * same value -- if they are, deduplication's "single value" strategy should
+ * be applied. The general goal of this strategy is to ensure that
+ * nbtsplitloc.c (which uses its own single value strategy) will find a useful
+ * split point as further duplicates are inserted, and successive rightmost
+ * page splits occur among pages that store the same duplicate value. When
+ * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
+ * just like it would if deduplication were disabled.
+ *
+ * We expect that affected workloads will require _several_ single value
+ * strategy deduplication passes (over a page that only stores duplicates)
+ * before the page is finally split. The first deduplication pass should only
+ * find regular non-pivot tuples. Later deduplication passes will find
+ * existing maxpostingsize-capped posting list tuples, which must be skipped
+ * over. The penultimate pass is generally the first pass that actually
+ * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
+ * few untouched non-pivot tuples. The final deduplication pass won't free
+ * any space -- it will skip over everything without merging anything (it
+ * retraces the steps of the penultimate pass).
+ *
+ * Fortunately, having several passes isn't too expensive. Each pass (after
+ * the first pass) won't spend many cycles on the large posting list tuples
+ * left by previous passes. Each pass will find a large contiguous group of
+ * smaller duplicate tuples to merge together at the end of the page.
+ *
+ * Note: We deliberately don't bother checking if the high key is a distinct
+ * value (prior to the TID tiebreaker column) before proceeding, unlike
+ * nbtsplitloc.c. Its single value strategy only gets applied on the
+ * rightmost page of duplicates of the same value (other leaf pages full of
+ * duplicates will get a simple 50:50 page split instead of splitting towards
+ * the end of the page). There is little point in making the same distinction
+ * here.
+ */
+static bool
+_bt_do_singleval(Relation rel, Page page, BTDedupState state,
+ OffsetNumber minoff, IndexTuple newitem)
+{
+ int natts = IndexRelationGetNumberOfAttributes(rel);
+ ItemId itemid;
+ IndexTuple itup;
+
+ itemid = PageGetItemId(page, minoff);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (_bt_keep_natts_fast(rel, newitem, itup) > natts)
+ {
+ itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (_bt_keep_natts_fast(rel, newitem, itup) > natts)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
+ * and final maxpostingsize-capped tuple. The sixth and final posting list
+ * tuple will end up somewhat smaller than the first five. (Note: The first
+ * five tuples could actually just be very large duplicate tuples that
+ * couldn't be merged together at all. Deduplication will simply not modify
+ * the page when that happens.)
+ *
+ * When there are six posting lists on the page (after current deduplication
+ * pass goes on to create/observe a sixth very large tuple), caller should end
+ * its deduplication pass. It isn't useful to try to deduplicate items that
+ * are supposed to end up on the new right sibling page following the
+ * anticipated page split. A future deduplication pass of future right
+ * sibling page might take care of it. (This is why the first single value
+ * strategy deduplication pass for a given leaf page will generally find only
+ * plain non-pivot tuples -- see _bt_do_singleval() comments.)
+ */
+static void
+_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
+{
+ Size leftfree;
+ int reduction;
+
+ /* This calculation needs to match nbtsplitloc.c */
+ leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
+ MAXALIGN(sizeof(BTPageOpaqueData));
+ /* Subtract size of new high key (includes pivot heap TID space) */
+ leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
+
+ /*
+ * Reduce maxpostingsize by an amount equal to target free space on left
+ * half of page
+ */
+ reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
+ if (state->maxpostingsize > reduction)
+ state->maxpostingsize -= reduction;
+ else
+ state->maxpostingsize = 0;
+}
+
+/*
+ * Build a posting list tuple based on caller's "base" index tuple and list of
+ * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
+ * posting list. (Posting list tuples can never have a single heap TID, partly
+ * because that ensures that deduplication always reduces final MAXALIGN()'d
+ * size of entire tuple.)
+ *
+ * Convention is that posting list starts at a MAXALIGN()'d offset (rather
+ * than a SHORTALIGN()'d offset), in line with the approach taken when
+ * appending a heap TID to new pivot tuple/high key during suffix truncation.
+ * This sometimes wastes a little space that was only needed as alignment
+ * padding in the original tuple. Following this convention simplifies the
+ * space accounting used when deduplicating a page (the same convention
+ * simplifies the accounting for choosing a point to split a page at).
+ *
+ * Note: The TIDs in caller's "htids" array must be unique and already in
+ * ascending order. Any existing heap TIDs from "base" won't automatically
+ * appear in the returned posting list tuple (they must be included in the
+ * htids array).
+ */
+IndexTuple
+_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
+{
+ uint32 keysize,
+ newsize;
+ IndexTuple itup;
+
+ if (BTreeTupleIsPosting(base))
+ keysize = BTreeTupleGetPostingOffset(base);
+ else
+ keysize = IndexTupleSize(base);
+
+ Assert(!BTreeTupleIsPivot(base));
+ Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
+ Assert(keysize == MAXALIGN(keysize));
+
+ /* Determine final size of new tuple */
+ if (nhtids > 1)
+ newsize = MAXALIGN(keysize +
+ nhtids * sizeof(ItemPointerData));
+ else
+ newsize = keysize;
+
+ Assert(newsize <= INDEX_SIZE_MASK);
+ Assert(newsize == MAXALIGN(newsize));
+
+ /* Allocate memory using palloc0() (matches index_form_tuple()) */
+ itup = palloc0(newsize);
+ memcpy(itup, base, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+ if (nhtids > 1)
+ {
+ /* Form posting list tuple */
+ BTreeTupleSetPosting(itup, nhtids, keysize);
+ memcpy(BTreeTupleGetPosting(itup), htids,
+ sizeof(ItemPointerData) * nhtids);
+ Assert(_bt_posting_valid(itup));
+ }
+ else
+ {
+ /* Form standard non-pivot tuple */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ ItemPointerCopy(htids, &itup->t_tid);
+ Assert(ItemPointerIsValid(&itup->t_tid));
+ }
+
+ return itup;
+}
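+
+/*
+ * To illustrate the sizing rules above with made-up numbers: given a
+ * MAXALIGN()'d keysize of 16 bytes and five heap TIDs
+ * (sizeof(ItemPointerData) is 6, and the MAXALIGN quantum is 8 on typical
+ * 64-bit builds),
+ *
+ *     newsize = MAXALIGN(16 + 5 * 6) = MAXALIGN(46) = 48 bytes
+ *
+ * versus 5 * 16 = 80 bytes (plus four extra line pointers) for the five
+ * equivalent plain tuples.  Since the nhtids == 1 case produces a plain
+ * tuple of exactly keysize bytes, deduplication never increases the
+ * MAXALIGN()'d size of any tuple it outputs.
+ */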
+
+/*
+ * Generate a replacement tuple by "updating" a posting list tuple so that it
+ * no longer has TIDs that need to be deleted.
+ *
+ * Used by VACUUM. Caller's vacposting argument points to the existing
+ * posting list tuple to be updated.
+ *
+ * On return, caller's vacposting argument will point to final "updated"
+ * tuple, which will be palloc()'d in caller's memory context.
+ */
+void
+_bt_update_posting(BTVacuumPosting vacposting)
+{
+ IndexTuple origtuple = vacposting->itup;
+ uint32 keysize,
+ newsize;
+ IndexTuple itup;
+ int nhtids;
+ int ui,
+ d;
+ ItemPointer htids;
+
+ nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
+
+ Assert(_bt_posting_valid(origtuple));
+ Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
+
+ if (BTreeTupleIsPosting(origtuple))
+ keysize = BTreeTupleGetPostingOffset(origtuple);
+ else
+ keysize = IndexTupleSize(origtuple);
+
+ /*
+ * Determine final size of new tuple.
+ *
+ * This calculation needs to match the code used within _bt_form_posting()
+ * for new posting list tuples. We avoid calling _bt_form_posting() here
+ * to save ourselves a second memory allocation for a htids workspace.
+ */
+ if (nhtids > 1)
+ newsize = MAXALIGN(keysize +
+ nhtids * sizeof(ItemPointerData));
+ else
+ newsize = keysize;
+
+ /* Allocate memory using palloc0() (matches index_form_tuple()) */
+ itup = palloc0(newsize);
+ memcpy(itup, origtuple, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+
+ if (nhtids > 1)
+ {
+ /* Form posting list tuple */
+ BTreeTupleSetPosting(itup, nhtids, keysize);
+ htids = BTreeTupleGetPosting(itup);
+ }
+ else
+ {
+ /* Form standard non-pivot tuple */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ htids = &itup->t_tid;
+ }
+
+ ui = 0;
+ d = 0;
+ for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
+ {
+ if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
+ {
+ d++;
+ continue;
+ }
+ htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
+ }
+ Assert(ui == nhtids);
+ Assert(d == vacposting->ndeletedtids);
+ Assert(nhtids == 1 || _bt_posting_valid(itup));
+
+ /* vacposting arg's itup will now point to updated version */
+ vacposting->itup = itup;
+}
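+
+/*
+ * For example (purely illustrative values): if origtuple's posting list
+ * holds the heap TIDs t0, t1, t2 and t3, and vacposting->deletetids[] is
+ * {1, 3} (ndeletedtids == 2), the replacement built here is a posting list
+ * tuple containing just t0 and t2.  Had deletetids[] been {0, 1, 2}
+ * instead, nhtids would be 1 and the replacement would be a plain
+ * non-pivot tuple whose t_tid is t3.
+ */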
+
+/*
+ * Prepare for a posting list split by swapping heap TID in newitem with heap
+ * TID from original posting list (the 'oposting' heap TID located at offset
+ * 'postingoff'). Modifies newitem, so caller should pass their own private
+ * copy that can safely be modified.
+ *
+ * Returns new posting list tuple, which is palloc()'d in caller's context.
+ * This is guaranteed to be the same size as 'oposting'. Modified newitem is
+ * what caller actually inserts. (This happens inside the same critical
+ * section that performs an in-place update of old posting list using new
+ * posting list returned here.)
+ *
+ * While the keys from newitem and oposting must be opclass equal, and must
+ * generate identical output when run through the underlying type's output
+ * function, it doesn't follow that their representations match exactly.
+ * Caller must avoid assuming that there can't be representational differences
+ * that make datums from oposting bigger or smaller than the corresponding
+ * datums from newitem. For example, differences in TOAST input state might
+ * break a faulty assumption about tuple size (the executor is entitled to
+ * apply TOAST compression based on its own criteria). It also seems possible
+ * that further representational variation will be introduced in the future,
+ * in order to support nbtree features like page-level prefix compression.
+ *
+ * See nbtree/README for details on the design of posting list splits.
+ */
+IndexTuple
+_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
+{
+ int nhtids;
+ char *replacepos;
+ char *replaceposright;
+ Size nmovebytes;
+ IndexTuple nposting;
+
+ nhtids = BTreeTupleGetNPosting(oposting);
+ Assert(_bt_posting_valid(oposting));
+ Assert(postingoff > 0 && postingoff < nhtids);
+
+ /*
+ * Move item pointers in posting list to make a gap for the new item's
+ * heap TID. We shift TIDs one place to the right, losing original
+ * rightmost TID. (nmovebytes must not include TIDs to the left of
+ * postingoff, nor the existing rightmost/max TID that gets overwritten.)
+ */
+ nposting = CopyIndexTuple(oposting);
+ replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
+ replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
+ nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
+ memmove(replaceposright, replacepos, nmovebytes);
+
+ /* Fill the gap at postingoff with TID of new item (original new TID) */
+ Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
+ ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
+
+ /* Now copy oposting's rightmost/max TID into new item (final new TID) */
+ ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
+
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
+ BTreeTupleGetHeapTID(newitem)) < 0);
+ Assert(_bt_posting_valid(nposting));
+
+ return nposting;
+}
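+
+/*
+ * A concrete illustration of the swap (example TIDs only): suppose
+ * oposting holds the heap TIDs (1,1), (1,3) and (1,5), newitem's TID is
+ * (1,2), and postingoff is 1.  The memmove() shifts (1,3) into the final
+ * slot (overwriting (1,5)), (1,2) is copied into slot 1, and newitem is
+ * given the old maximum TID (1,5).  The result is nposting = (1,1), (1,2),
+ * (1,3), while newitem -- now carrying (1,5) -- is inserted by caller
+ * immediately after the overwritten posting list.
+ */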
+
+/*
+ * Verify posting list invariants for "posting", which must be a posting list
+ * tuple. Used within assertions.
+ */
+#ifdef USE_ASSERT_CHECKING
+static bool
+_bt_posting_valid(IndexTuple posting)
+{
+ ItemPointerData last;
+ ItemPointer htid;
+
+ if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
+ return false;
+
+ /* Remember first heap TID for loop */
+ ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
+ if (!ItemPointerIsValid(&last))
+ return false;
+
+ /* Iterate, starting from second TID */
+ for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
+ {
+ htid = BTreeTupleGetPostingN(posting, i);
+
+ if (!ItemPointerIsValid(htid))
+ return false;
+ if (ItemPointerCompare(htid, &last) <= 0)
+ return false;
+ ItemPointerCopy(htid, &last);
+ }
+
+ return true;
+}
+#endif
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
+ int postingoff,
bool split_only_page);
static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
- IndexTuple newitem);
+ IndexTuple newitem, IndexTuple orignewitem,
+ IndexTuple nposting, uint16 postingoff);
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool is_root, bool is_only);
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
insertstate.itup_key = itup_key;
insertstate.bounds_valid = false;
insertstate.buf = InvalidBuffer;
+ insertstate.postingoff = 0;
/*
* It's very common to have an index on an auto-incremented or
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
stack, heapRel);
_bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
- itup, newitemoff, false);
+ itup, newitemoff, insertstate.postingoff, false);
}
else
{
uint32 *speculativeToken)
{
IndexTuple itup = insertstate->itup;
+ IndexTuple curitup;
+ ItemId curitemid;
BTScanInsert itup_key = insertstate->itup_key;
SnapshotData SnapshotDirty;
OffsetNumber offset;
BTPageOpaque opaque;
Buffer nbuf = InvalidBuffer;
bool found = false;
+ bool inposting = false;
+ bool prevalldead = true;
+ int curposti = 0;
/* Assume unique until we find a duplicate */
*is_unique = true;
Assert(itup_key->scantid == NULL);
for (;;)
{
- ItemId curitemid;
- IndexTuple curitup;
- BlockNumber nblkno;
-
/*
- * make sure the offset points to an actual item before trying to
- * examine it...
+ * Each iteration of the loop processes one heap TID, not one index
+ * tuple. Current offset number for page isn't usually advanced on
+ * iterations that process heap TIDs from posting list tuples.
+ *
+ * "inposting" state is set when _inside_ a posting list --- not when
+ * we're at the start (or end) of a posting list. We advance curposti
+ * at the end of the iteration when inside a posting list tuple. In
+ * general, every loop iteration either advances the page offset or
+ * advances curposti --- an iteration that handles the rightmost/max
+ * heap TID in a posting list finally advances the page offset (and
+ * unsets "inposting").
+ *
+ * Make sure the offset points to an actual index tuple before trying
+ * to examine it...
*/
if (offset <= maxoff)
{
break;
}
- curitemid = PageGetItemId(page, offset);
-
/*
- * We can skip items that are marked killed.
+ * We can skip items that are already marked killed.
*
* In the presence of heavy update activity an index may contain
* many killed items with the same key; running _bt_compare() on
* each killed item gets expensive. Just advance over killed
* items as quickly as we can. We only apply _bt_compare() when
- * we get to a non-killed item. Even those comparisons could be
- * avoided (in the common case where there is only one page to
- * visit) by reusing bounds, but just skipping dead items is fast
- * enough.
+ * we get to a non-killed item. We could reuse the bounds to
+ * avoid _bt_compare() calls for known equal tuples, but it
+ * doesn't seem worth it. Workloads with heavy update activity
+ * tend to have many deduplication passes, so we'll often avoid
+ * most of those comparisons, too (we call _bt_compare() when the
+ * posting list tuple is initially encountered, though not when
+ * processing later TIDs from the same tuple).
*/
- if (!ItemIdIsDead(curitemid))
+ if (!inposting)
+ curitemid = PageGetItemId(page, offset);
+ if (inposting || !ItemIdIsDead(curitemid))
{
ItemPointerData htid;
bool all_dead;
- if (_bt_compare(rel, itup_key, page, offset) != 0)
- break; /* we're past all the equal tuples */
+ if (!inposting)
+ {
+ /* Plain tuple, or first TID in posting list tuple */
+ if (_bt_compare(rel, itup_key, page, offset) != 0)
+ break; /* we're past all the equal tuples */
- /* okay, we gotta fetch the heap tuple ... */
- curitup = (IndexTuple) PageGetItem(page, curitemid);
- htid = curitup->t_tid;
+ /* Advance curitup */
+ curitup = (IndexTuple) PageGetItem(page, curitemid);
+ Assert(!BTreeTupleIsPivot(curitup));
+ }
+
+ /* okay, we gotta fetch the heap tuple using htid ... */
+ if (!BTreeTupleIsPosting(curitup))
+ {
+ /* ... htid is from simple non-pivot tuple */
+ Assert(!inposting);
+ htid = curitup->t_tid;
+ }
+ else if (!inposting)
+ {
+ /* ... htid is first TID in new posting list */
+ inposting = true;
+ prevalldead = true;
+ curposti = 0;
+ htid = *BTreeTupleGetPostingN(curitup, 0);
+ }
+ else
+ {
+ /* ... htid is second or subsequent TID in posting list */
+ Assert(curposti > 0);
+ htid = *BTreeTupleGetPostingN(curitup, curposti);
+ }
/*
* If we are doing a recheck, we expect to find the tuple we
* not part of this chain because it had a different index
* entry.
*/
- htid = itup->t_tid;
- if (table_index_fetch_tuple_check(heapRel, &htid,
+ if (table_index_fetch_tuple_check(heapRel, &itup->t_tid,
SnapshotSelf, NULL))
{
/* Normal case --- it's still live */
RelationGetRelationName(rel))));
}
}
- else if (all_dead)
+ else if (all_dead && (!inposting ||
+ (prevalldead &&
+ curposti == BTreeTupleGetNPosting(curitup) - 1)))
{
/*
- * The conflicting tuple (or whole HOT chain) is dead to
- * everyone, so we may as well mark the index entry
- * killed.
+ * The conflicting tuple (or all HOT chains pointed to by
+ * all posting list TIDs) is dead to everyone, so mark the
+ * index entry killed.
*/
ItemIdMarkDead(curitemid);
opaque->btpo_flags |= BTP_HAS_GARBAGE;
else
MarkBufferDirtyHint(insertstate->buf, true);
}
+
+ /*
+ * Remember if posting list tuple has even a single HOT chain
+ * whose members are not all dead
+ */
+ if (!all_dead && inposting)
+ prevalldead = false;
}
}
- /*
- * Advance to next tuple to continue checking.
- */
- if (offset < maxoff)
+ if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
+ {
+ /* Advance to next TID in same posting list */
+ curposti++;
+ continue;
+ }
+ else if (offset < maxoff)
+ {
+ /* Advance to next tuple */
+ curposti = 0;
+ inposting = false;
offset = OffsetNumberNext(offset);
+ }
else
{
int highkeycmp;
/* Advance to next non-dead page --- there must be one */
for (;;)
{
- nblkno = opaque->btpo_next;
+ BlockNumber nblkno = opaque->btpo_next;
+
nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
page = BufferGetPage(nbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
}
+ /* Will also advance to next tuple */
+ curposti = 0;
+ inposting = false;
maxoff = PageGetMaxOffsetNumber(page);
offset = P_FIRSTDATAKEY(opaque);
/* Don't invalidate binary search bounds */
BTScanInsert itup_key = insertstate->itup_key;
Page page = BufferGetPage(insertstate->buf);
BTPageOpaque lpageop;
+ OffsetNumber newitemoff;
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(!insertstate->bounds_valid || checkingunique);
Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
+ Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
if (itup_key->heapkeyspace)
{
+ /* Keep track of whether checkingunique duplicate seen */
+ bool uniquedup = false;
+
/*
* If we're inserting into a unique index, we may have to walk right
* through leaf pages to find the one leaf page that we must insert on
*/
if (checkingunique)
{
+ if (insertstate->low < insertstate->stricthigh)
+ {
+ /* Encountered a duplicate in _bt_check_unique() */
+ Assert(insertstate->bounds_valid);
+ uniquedup = true;
+ }
+
for (;;)
{
/*
/* Update local state after stepping right */
page = BufferGetPage(insertstate->buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* Assume duplicates (if checkingunique) */
+ uniquedup = true;
}
}
/*
* If the target page is full, see if we can obtain enough space by
- * erasing LP_DEAD items
+ * erasing LP_DEAD items. If that fails to free enough space, see if
+ * we can avoid a page split by performing a deduplication pass over
+ * the page.
+ *
+ * We only perform a deduplication pass for a checkingunique caller
+ * when the incoming item is a duplicate of an existing item on the
+ * leaf page. This heuristic avoids wasting cycles -- we only expect
+ * to benefit from deduplicating a unique index page when most or all
+ * recently added items are duplicates. See nbtree/README.
*/
- if (PageGetFreeSpace(page) < insertstate->itemsz &&
- P_HAS_GARBAGE(lpageop))
+ if (PageGetFreeSpace(page) < insertstate->itemsz)
{
- _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
- insertstate->bounds_valid = false;
+ if (P_HAS_GARBAGE(lpageop))
+ {
+ _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
+ insertstate->bounds_valid = false;
+
+ /* Might as well assume duplicates (if checkingunique) */
+ uniquedup = true;
+ }
+
+ if (itup_key->allequalimage && BTGetDeduplicateItems(rel) &&
+ (!checkingunique || uniquedup) &&
+ PageGetFreeSpace(page) < insertstate->itemsz)
+ {
+ _bt_dedup_one_page(rel, insertstate->buf, heapRel,
+ insertstate->itup, insertstate->itemsz,
+ checkingunique);
+ insertstate->bounds_valid = false;
+ }
}
}
else
Assert(P_RIGHTMOST(lpageop) ||
_bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
- return _bt_binsrch_insert(rel, insertstate);
+ newitemoff = _bt_binsrch_insert(rel, insertstate);
+
+ if (insertstate->postingoff == -1)
+ {
+ /*
+ * There is an overlapping posting list tuple with its LP_DEAD bit
+ * set. We don't want to unnecessarily unset its LP_DEAD bit while
+ * performing a posting list split, so delete all LP_DEAD items early.
+ * This is the only case where LP_DEAD deletes happen even though
+ * there is space for newitem on the page.
+ */
+ _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
+
+ /*
+ * Do new binary search. New insert location cannot overlap with any
+ * posting list now.
+ */
+ insertstate->bounds_valid = false;
+ insertstate->postingoff = 0;
+ newitemoff = _bt_binsrch_insert(rel, insertstate);
+ Assert(insertstate->postingoff == 0);
+ }
+
+ return newitemoff;
}
/*
*
* This recursive procedure does the following things:
*
+ * + if postingoff != 0, splits existing posting list tuple
+ * (since it overlaps with new 'itup' tuple).
* + if necessary, splits the target page, using 'itup_key' for
* suffix truncation on leaf pages (caller passes NULL for
* non-leaf pages).
- * + inserts the tuple.
+ * + inserts the new tuple (might be split from posting list).
* + if the page was split, pops the parent stack, and finds the
* right place to insert the new child pointer (by walking
* right using information stored in the parent stack).
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
+ int postingoff,
bool split_only_page)
{
Page page;
BTPageOpaque lpageop;
Size itemsz;
+ IndexTuple oposting;
+ IndexTuple origitup = NULL;
+ IndexTuple nposting = NULL;
page = BufferGetPage(buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISLEAF(lpageop) ||
BTreeTupleGetNAtts(itup, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(!BTreeTupleIsPosting(itup));
/* The caller should've finished any incomplete splits already. */
if (P_INCOMPLETE_SPLIT(lpageop))
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
+ /*
+ * Do we need to split an existing posting list item?
+ */
+ if (postingoff != 0)
+ {
+ ItemId itemid = PageGetItemId(page, newitemoff);
+
+ /*
+ * The new tuple is a duplicate with a heap TID that falls inside the
+ * range of an existing posting list tuple on a leaf page. Prepare to
+ * split an existing posting list. Overwriting the posting list with
+ * its post-split version is treated as an extra step in either the
+ * insert or page split critical section.
+ */
+ Assert(P_ISLEAF(lpageop) && !ItemIdIsDead(itemid));
+ Assert(itup_key->heapkeyspace && itup_key->allequalimage);
+ oposting = (IndexTuple) PageGetItem(page, itemid);
+
+ /* use a mutable copy of itup as our itup from here on */
+ origitup = itup;
+ itup = CopyIndexTuple(origitup);
+ nposting = _bt_swap_posting(itup, oposting, postingoff);
+ /* itup now contains rightmost/max TID from oposting */
+
+ /* Alter offset so that newitem goes after posting list */
+ newitemoff = OffsetNumberNext(newitemoff);
+ }
+
/*
* Do we need to split the page to fit the item on it?
*
BlockNumberIsValid(RelationGetTargetBlock(rel))));
/* split the buffer into left and right halves */
- rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup);
+ rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
+ origitup, nposting, postingoff);
PredicateLockPageSplit(rel,
BufferGetBlockNumber(buf),
BufferGetBlockNumber(rbuf));
/* Do the update. No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
+ if (postingoff != 0)
+ memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
+
if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
elog(PANIC, "failed to add new item to block %u in index \"%s\"",
itup_blkno, RelationGetRelationName(rel));
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
- if (P_ISLEAF(lpageop))
+ if (P_ISLEAF(lpageop) && postingoff == 0)
+ {
+ /* Simple leaf insert */
xlinfo = XLOG_BTREE_INSERT_LEAF;
+ }
+ else if (postingoff != 0)
+ {
+ /*
+ * Leaf insert with posting list split. Must include
+ * postingoff field before newitem/orignewitem.
+ */
+ xlinfo = XLOG_BTREE_INSERT_POST;
+ }
else
{
/*
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples =
metad->btm_last_cleanup_num_heap_tuples;
+ xlmeta.allequalimage = metad->btm_allequalimage;
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
}
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
- XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
+ if (postingoff == 0)
+ {
+ /* Simple, common case -- log itup from caller */
+ XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
+ }
+ else
+ {
+ /*
+ * Insert with posting list split (XLOG_BTREE_INSERT_POST
+ * record) case.
+ *
+ * Log postingoff. Also log origitup, not itup. REDO routine
+ * must reconstruct final itup (as well as nposting) using
+ * _bt_swap_posting().
+ */
+ uint16 upostingoff = postingoff;
+
+ XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
+ XLogRegisterBufData(0, (char *) origitup,
+ IndexTupleSize(origitup));
+ }
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
_bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
RelationSetTargetBlock(rel, cachedBlock);
}
+
+ /* be tidy */
+ if (postingoff != 0)
+ {
+ /* itup is actually a modified copy of caller's original */
+ pfree(nposting);
+ pfree(itup);
+ }
}
/*
* This function will clear the INCOMPLETE_SPLIT flag on it, and
* release the buffer.
*
+ * orignewitem, nposting, and postingoff are needed when an insert of
+ * orignewitem results in both a posting list split and a page split.
+ * These extra posting list split details are used here in the same
+ * way as they are used in the more common case where a posting list
+ * split does not coincide with a page split. We need to deal with
+ * posting list splits directly in order to ensure that everything
+ * that follows from the insert of orignewitem is handled as a single
+ * atomic operation (though caller's insert of a new pivot/downlink
+ * into parent page will still be a separate operation). See
+ * nbtree/README for details on the design of posting list splits.
+ *
* Returns the new right sibling of buf, pinned and write-locked.
* The pin and lock on buf are maintained.
*/
static Buffer
_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
- OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem)
+ OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+ IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
{
Buffer rbuf;
Page origpage;
OffsetNumber leftoff,
rightoff;
OffsetNumber firstright;
+ OffsetNumber origpagepostingoff;
OffsetNumber maxoff;
OffsetNumber i;
bool newitemonleft,
PageSetLSN(leftpage, PageGetLSN(origpage));
isleaf = P_ISLEAF(oopaque);
+ /*
+ * Determine page offset number of existing overlapped-with-orignewitem
+ * posting list when it is necessary to perform a posting list split in
+ * passing. Note that newitem was already changed by caller (newitem no
+ * longer has the orignewitem TID).
+ *
+ * This page offset number (origpagepostingoff) will be used to pretend
+ * that the posting split has already taken place, even though the
+ * required modifications to origpage won't occur until we reach the
+ * critical section. The lastleft and firstright tuples of our page split
+ * point should, in effect, come from an imaginary version of origpage
+ * that has the nposting tuple instead of the original posting list tuple.
+ *
+ * Note: _bt_findsplitloc() should have compensated for coinciding posting
+ * list splits in just the same way, at least in theory. It doesn't
+ * bother with that, though. In practice it won't affect its choice of
+ * split point.
+ */
+ origpagepostingoff = InvalidOffsetNumber;
+ if (postingoff != 0)
+ {
+ Assert(isleaf);
+ Assert(ItemPointerCompare(&orignewitem->t_tid,
+ &newitem->t_tid) < 0);
+ Assert(BTreeTupleIsPosting(nposting));
+ origpagepostingoff = OffsetNumberPrev(newitemoff);
+ }
+
/*
* The "high key" for the new left page will be the first key that's going
* to go into the new right page, or a truncated version if this is a leaf
itemid = PageGetItemId(origpage, firstright);
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
+ if (firstright == origpagepostingoff)
+ item = nposting;
}
/*
Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
itemid = PageGetItemId(origpage, lastleftoff);
lastleft = (IndexTuple) PageGetItem(origpage, itemid);
+ if (lastleftoff == origpagepostingoff)
+ lastleft = nposting;
}
Assert(lastleft != item);
*/
leftoff = P_HIKEY;
+ Assert(BTreeTupleIsPivot(lefthikey) || !itup_key->heapkeyspace);
Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
+ Assert(BTreeTupleIsPivot(item) || !itup_key->heapkeyspace);
Assert(BTreeTupleGetNAtts(item, rel) > 0);
Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
+ /* replace original item with nposting due to posting split? */
+ if (i == origpagepostingoff)
+ {
+ Assert(BTreeTupleIsPosting(item));
+ Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
+ item = nposting;
+ }
+
/* does new item belong before this one? */
- if (i == newitemoff)
+ else if (i == newitemoff)
{
if (newitemonleft)
{
XLogRecPtr recptr;
xlrec.level = ropaque->btpo.level;
+ /* See comments below on newitem, orignewitem, and posting lists */
xlrec.firstright = firstright;
xlrec.newitemoff = newitemoff;
+ xlrec.postingoff = 0;
+ if (postingoff != 0 && origpagepostingoff < firstright)
+ xlrec.postingoff = postingoff;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
* because it's included with all the other items on the right page.)
* Show the new item as belonging to the left page buffer, so that it
* is not stored if XLogInsert decides it needs a full-page image of
- * the left page. We store the offset anyway, though, to support
- * archive compression of these records.
+ * the left page. We always store newitemoff in the record, though.
+ *
+ * The details are sometimes slightly different for page splits that
+ * coincide with a posting list split. If both the replacement
+ * posting list and newitem go on the right page, then we don't need
+ * to log anything extra, just like the simple !newitemonleft
+ * no-posting-split case (postingoff is set to zero in the WAL record,
+ * so recovery doesn't need to process a posting list split at all).
+ * Otherwise, we set postingoff and log orignewitem instead of
+ * newitem, despite having actually inserted newitem. REDO routine
+ * must reconstruct nposting and newitem using _bt_swap_posting().
+ *
+ * Note: It's possible that our page split point is the point that
+ * makes the posting list lastleft and newitem firstright. This is
+ * the only case where we log orignewitem/newitem despite newitem
+ * going on the right page. If XLogInsert decides that it can omit
+ * orignewitem due to logging a full-page image of the left page,
+ * everything still works out, since recovery only needs orignewitem in
+ * order to restore items on the left page (just like the regular
+ * newitem-logged case).
*/
- if (newitemonleft)
+ if (newitemonleft && xlrec.postingoff == 0)
XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
+ else if (xlrec.postingoff != 0)
+ {
+ Assert(newitemonleft || firstright == newitemoff);
+ Assert(MAXALIGN(newitemsz) == IndexTupleSize(orignewitem));
+ XLogRegisterBufData(0, (char *) orignewitem, MAXALIGN(newitemsz));
+ }
/* Log the left page's new high key */
itemid = PageGetItemId(origpage, P_HIKEY);
/* Recursively insert into the parent */
_bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
- new_item, stack->bts_offset + 1,
+ new_item, stack->bts_offset + 1, 0,
is_only);
/* be tidy */
md.fastlevel = metad->btm_level;
md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
+ md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
static void
_bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
{
- OffsetNumber deletable[MaxOffsetNumber];
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
OffsetNumber offnum,
minoff,
* Note: if we didn't find any LP_DEAD items, then the page's
* BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
* separate write to clear it, however. We will clear it when we split
- * the page.
+ * the page, or when deduplication runs.
*/
}
#include "access/nbtree.h"
#include "access/nbtxlog.h"
+#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
bool *rightsib_empty);
+static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
+ OffsetNumber *deletable, int ndeletable);
static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
BTStack stack, Buffer *topparent, OffsetNumber *topoff,
BlockNumber *target, BlockNumber *rightsib);
* _bt_initmetapage() -- Fill a page buffer with a correct metapage image
*/
void
-_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
+_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
+ bool allequalimage)
{
BTMetaPageData *metad;
BTPageOpaque metaopaque;
metad->btm_fastlevel = level;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
+ metad->btm_allequalimage = allequalimage;
metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
metaopaque->btpo_flags = BTP_META;
metad->btm_version = BTREE_NOVAC_VERSION;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
+ /* Only a REINDEX can set this field */
+ Assert(!metad->btm_allequalimage);
+ metad->btm_allequalimage = false;
/* Adjust pd_lower (see _bt_initmetapage() for details) */
((PageHeader) page)->pd_lower =
md.fastlevel = metad->btm_fastlevel;
md.oldest_btpo_xact = oldestBtpoXact;
md.last_cleanup_num_heap_tuples = numHeapTuples;
+ md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
Assert(metad->btm_root != P_NONE);
rootblkno = metad->btm_fastroot;
md.fastlevel = 0;
md.oldest_btpo_xact = InvalidTransactionId;
md.last_cleanup_num_heap_tuples = -1.0;
+ md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
Assert(metad->btm_fastroot != P_NONE);
return metad->btm_fastlevel;
}
/*
- * _bt_heapkeyspace() -- is heap TID being treated as a key?
+ * _bt_metaversion() -- Get version/status info from metapage.
+ *
+ * Sets caller's *heapkeyspace and *allequalimage arguments using data
+ * from the B-Tree metapage (could be locally-cached version). This
+ * information needs to be stashed in insertion scankey, so we provide a
+ * single function that fetches both at once.
*
* This is used to determine the rules that must be used to descend a
* btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
* pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
* performance when inserting a new BTScanInsert-wise duplicate tuple
* among many leaf pages already full of such duplicates.
+ *
+ * Also sets allequalimage field, which indicates whether or not it is
+ * safe to apply deduplication. We rely on the assumption that
+ * btm_allequalimage will be zero'ed on heapkeyspace indexes that were
+ * pg_upgrade'd from Postgres 12.
*/
-bool
-_bt_heapkeyspace(Relation rel)
+void
+_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
BTMetaPageData *metad;
*/
if (metad->btm_root == P_NONE)
{
- uint32 btm_version = metad->btm_version;
+ *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
+ *allequalimage = metad->btm_allequalimage;
_bt_relbuf(rel, metabuf);
- return btm_version > BTREE_NOVAC_VERSION;
+ return;
}
/*
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
Assert(metad->btm_fastroot != P_NONE);
- return metad->btm_version > BTREE_NOVAC_VERSION;
+ *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
+ *allequalimage = metad->btm_allequalimage;
}
/*
* Delete item(s) from a btree leaf page during VACUUM.
*
* This routine assumes that the caller has a super-exclusive write lock on
- * the buffer. Also, the given deletable array *must* be sorted in ascending
- * order.
+ * the buffer. Also, the given deletable and updatable arrays *must* be
+ * sorted in ascending order.
+ *
+ * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
+ * in an existing posting list item are to be removed by VACUUM. This works
+ * by updating/overwriting an existing item with caller's new version of the
+ * item (a version that lacks the TIDs that are to be deleted).
*
* We record VACUUMs and b-tree deletes differently in WAL. Deletes must
* generate their own latestRemovedXid by accessing the heap directly, whereas
- * VACUUMs rely on the initial heap scan taking care of it indirectly.
+ * VACUUMs rely on the initial heap scan taking care of it indirectly. Also,
+ * only VACUUM can perform granular deletes of individual TIDs in posting list
+ * tuples.
*/
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
- OffsetNumber *deletable, int ndeletable)
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable)
{
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
+ Size itemsz;
+ char *updatedbuf = NULL;
+ Size updatedbuflen = 0;
+ OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
/* Shouldn't be called unless there's something to do */
- Assert(ndeletable > 0);
+ Assert(ndeletable > 0 || nupdatable > 0);
+
+ for (int i = 0; i < nupdatable; i++)
+ {
+ /* Replace work area IndexTuple with updated version */
+ _bt_update_posting(updatable[i]);
+
+ /* Maintain array of updatable page offsets for WAL record */
+ updatedoffsets[i] = updatable[i]->updatedoffset;
+ }
+
+ /* XLOG stuff -- allocate and fill buffer before critical section */
+ if (nupdatable > 0 && RelationNeedsWAL(rel))
+ {
+ Size offset = 0;
+
+ for (int i = 0; i < nupdatable; i++)
+ {
+ BTVacuumPosting vacposting = updatable[i];
+
+ itemsz = SizeOfBtreeUpdate +
+ vacposting->ndeletedtids * sizeof(uint16);
+ updatedbuflen += itemsz;
+ }
+
+ updatedbuf = palloc(updatedbuflen);
+ for (int i = 0; i < nupdatable; i++)
+ {
+ BTVacuumPosting vacposting = updatable[i];
+ xl_btree_update update;
+
+ update.ndeletedtids = vacposting->ndeletedtids;
+ memcpy(updatedbuf + offset, &update.ndeletedtids,
+ SizeOfBtreeUpdate);
+ offset += SizeOfBtreeUpdate;
+
+ itemsz = update.ndeletedtids * sizeof(uint16);
+ memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
+ offset += itemsz;
+ }
+ }
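+
+ /*
+ * The updatedbuf image built above is a packed sequence of variable-length
+ * entries, one per updated posting list tuple: an xl_btree_update header
+ * (just ndeletedtids) followed by ndeletedtids uint16 offsets into the
+ * original tuple's posting list.  For instance, two updated tuples whose
+ * dead TIDs sit at posting list offsets {1, 3} and {0} respectively are
+ * serialized as the uint16 values 2, 1, 3, 1, 0 (assuming SizeOfBtreeUpdate
+ * carries no alignment padding).
+ */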
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
- /* Fix the page */
- PageIndexMultiDelete(page, deletable, ndeletable);
+ /*
+ * Handle posting tuple updates.
+ *
+ * Deliberately do this before handling simple deletes. If we did it the
+ * other way around (i.e. WAL record order -- simple deletes before
+ * updates) then we'd have to make compensating changes to the 'updatable'
+ * array of offset numbers.
+ *
+ * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
+ * happens to already be set. Although we unset the BTP_HAS_GARBAGE page
+ * level flag, unsetting individual LP_DEAD bits should still be avoided.
+ */
+ for (int i = 0; i < nupdatable; i++)
+ {
+ OffsetNumber updatedoffset = updatedoffsets[i];
+ IndexTuple itup;
+
+ itup = updatable[i]->itup;
+ itemsz = MAXALIGN(IndexTupleSize(itup));
+ if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
+ itemsz))
+ elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
+ BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+ }
+
+ /* Now handle simple deletes of entire tuples */
+ if (ndeletable > 0)
+ PageIndexMultiDelete(page, deletable, ndeletable);
/*
* We can clear the vacuum cycle ID since this page has certainly been
* limited, since we never falsely unset an LP_DEAD bit. Workloads that
* are particularly dependent on LP_DEAD bits being set quickly will
* usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
- * again anyway.
+ * again anyway. Furthermore, attempting a deduplication pass will remove
+ * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
+ * is set or not.
*/
opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
xl_btree_vacuum xlrec_vacuum;
xlrec_vacuum.ndeleted = ndeletable;
+ xlrec_vacuum.nupdated = nupdatable;
XLogBeginInsert();
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
- /*
- * The deletable array is not in the buffer, but pretend that it is.
- * When XLogInsert stores the whole buffer, the array need not be
- * stored too.
- */
- XLogRegisterBufData(0, (char *) deletable,
- ndeletable * sizeof(OffsetNumber));
+ if (ndeletable > 0)
+ XLogRegisterBufData(0, (char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ if (nupdatable > 0)
+ {
+ XLogRegisterBufData(0, (char *) updatedoffsets,
+ nupdatable * sizeof(OffsetNumber));
+ XLogRegisterBufData(0, updatedbuf, updatedbuflen);
+ }
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
}
END_CRIT_SECTION();
+
+ /* can't leak memory here */
+ if (updatedbuf != NULL)
+ pfree(updatedbuf);
+ /* free tuples generated by calling _bt_update_posting() */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]->itup);
}
/*
* This is nearly the same as _bt_delitems_vacuum as far as what it does to
* the page, but it needs to generate its own latestRemovedXid by accessing
* the heap. This is used by the REDO routine to generate recovery conflicts.
+ * Also, it doesn't handle posting list tuples unless the entire tuple can be
+ * deleted as a whole (since there is only one LP_DEAD bit per line pointer).
*/
void
_bt_delitems_delete(Relation rel, Buffer buf,
if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
latestRemovedXid =
- index_compute_xid_horizon_for_tuples(rel, heapRel, buf,
- deletable, ndeletable);
+ _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
END_CRIT_SECTION();
}
+/*
+ * Get the latestRemovedXid from the table entries pointed to by the non-pivot
+ * tuples being deleted.
+ *
+ * This is a specialized version of index_compute_xid_horizon_for_tuples().
+ * It's needed because btree tuples don't always store table TID using the
+ * standard index tuple header field.
+ */
+static TransactionId
+_bt_xid_horizon(Relation rel, Relation heapRel, Page page,
+ OffsetNumber *deletable, int ndeletable)
+{
+ TransactionId latestRemovedXid = InvalidTransactionId;
+ int spacenhtids;
+ int nhtids;
+ ItemPointer htids;
+
+ /* Array will grow iff there are posting list tuples to consider */
+ spacenhtids = ndeletable;
+ nhtids = 0;
+ htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids);
+ for (int i = 0; i < ndeletable; i++)
+ {
+ ItemId itemid;
+ IndexTuple itup;
+
+ itemid = PageGetItemId(page, deletable[i]);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(ItemIdIsDead(itemid));
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ if (nhtids + 1 > spacenhtids)
+ {
+ spacenhtids *= 2;
+ htids = (ItemPointer)
+ repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
+ }
+
+ Assert(ItemPointerIsValid(&itup->t_tid));
+ ItemPointerCopy(&itup->t_tid, &htids[nhtids]);
+ nhtids++;
+ }
+ else
+ {
+ int nposting = BTreeTupleGetNPosting(itup);
+
+ if (nhtids + nposting > spacenhtids)
+ {
+ spacenhtids = Max(spacenhtids * 2, nhtids + nposting);
+ htids = (ItemPointer)
+ repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
+ }
+
+ for (int j = 0; j < nposting; j++)
+ {
+ ItemPointer htid = BTreeTupleGetPostingN(itup, j);
+
+ Assert(ItemPointerIsValid(htid));
+ ItemPointerCopy(htid, &htids[nhtids]);
+ nhtids++;
+ }
+ }
+ }
+
+ Assert(nhtids >= ndeletable);
+
+ latestRemovedXid =
+ table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids);
+
+ pfree(htids);
+
+ return latestRemovedXid;
+}
+
/*
* Returns true, if the given block has the half-dead flag set.
*/
xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
+ xlmeta.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
+static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
+ IndexTuple posting,
+ OffsetNumber updatedoffset,
+ int *nremaining);
/*
/* Construct metapage. */
metapage = (Page) palloc(BLCKSZ);
- _bt_initmetapage(metapage, P_NONE, 0);
+ _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
/*
* Write the page and log it. It might seem that an immediate sync would
*/
if (so->killedItems == NULL)
so->killedItems = (int *)
- palloc(MaxIndexTuplesPerPage * sizeof(int));
- if (so->numKilled < MaxIndexTuplesPerPage)
+ palloc(MaxTIDsPerBTreePage * sizeof(int));
+ if (so->numKilled < MaxTIDsPerBTreePage)
so->killedItems[so->numKilled++] = so->currPos.itemIndex;
}
}
else if (P_ISLEAF(opaque))
{
- OffsetNumber deletable[MaxOffsetNumber];
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable;
+ BTVacuumPosting updatable[MaxIndexTuplesPerPage];
+ int nupdatable;
OffsetNumber offnum,
minoff,
maxoff;
+ int nhtidsdead,
+ nhtidslive;
/*
* Trade in the initial read lock for a super-exclusive write lock on
* point using callback.
*/
ndeletable = 0;
+ nupdatable = 0;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
+ nhtidsdead = 0;
+ nhtidslive = 0;
if (callback)
{
for (offnum = minoff;
offnum = OffsetNumberNext(offnum))
{
IndexTuple itup;
- ItemPointer htup;
itup = (IndexTuple) PageGetItem(page,
PageGetItemId(page, offnum));
- htup = &(itup->t_tid);
/*
* Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
* simple, and allows us to always avoid generating our own
* conflicts.
*/
- if (callback(htup, callback_state))
- deletable[ndeletable++] = offnum;
+ Assert(!BTreeTupleIsPivot(itup));
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Regular tuple, standard table TID representation */
+ if (callback(&itup->t_tid, callback_state))
+ {
+ deletable[ndeletable++] = offnum;
+ nhtidsdead++;
+ }
+ else
+ nhtidslive++;
+ }
+ else
+ {
+ BTVacuumPosting vacposting;
+ int nremaining;
+
+ /* Posting list tuple */
+ vacposting = btreevacuumposting(vstate, itup, offnum,
+ &nremaining);
+ if (vacposting == NULL)
+ {
+ /*
+ * All table TIDs from the posting tuple remain, so no
+ * delete or update required
+ */
+ Assert(nremaining == BTreeTupleGetNPosting(itup));
+ }
+ else if (nremaining > 0)
+ {
+ /*
+ * Store metadata about posting list tuple in
+ * updatable array for entire page. Existing tuple
+ * will be updated during the later call to
+ * _bt_delitems_vacuum().
+ */
+ Assert(nremaining < BTreeTupleGetNPosting(itup));
+ updatable[nupdatable++] = vacposting;
+ nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
+ }
+ else
+ {
+ /*
+ * All table TIDs from the posting list must be
+ * deleted. We'll delete the index tuple completely
+ * (no update required).
+ */
+ Assert(nremaining == 0);
+ deletable[ndeletable++] = offnum;
+ nhtidsdead += BTreeTupleGetNPosting(itup);
+ pfree(vacposting);
+ }
+
+ nhtidslive += nremaining;
+ }
}
}
/*
- * Apply any needed deletes. We issue just one _bt_delitems_vacuum()
- * call per page, so as to minimize WAL traffic.
+ * Apply any needed deletes or updates. We issue just one
+ * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
*/
- if (ndeletable > 0)
+ if (ndeletable > 0 || nupdatable > 0)
{
- _bt_delitems_vacuum(rel, buf, deletable, ndeletable);
+ Assert(nhtidsdead >= Max(ndeletable, 1));
+ _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
+ nupdatable);
- stats->tuples_removed += ndeletable;
+ stats->tuples_removed += nhtidsdead;
/* must recompute maxoff */
maxoff = PageGetMaxOffsetNumber(page);
+
+ /* can't leak memory here */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]);
}
else
{
* We treat this like a hint-bit update because there's no need to
* WAL-log it.
*/
+ Assert(nhtidsdead == 0);
if (vstate->cycleid != 0 &&
opaque->btpo_cycleid == vstate->cycleid)
{
}
/*
- * If it's now empty, try to delete; else count the live tuples. We
- * don't delete when recursing, though, to avoid putting entries into
- * freePages out-of-order (doesn't seem worth any extra code to handle
- * the case).
+ * If it's now empty, try to delete; else count the live tuples (live
+ * table TIDs in posting lists are counted as separate live tuples).
+ * We don't delete when recursing, though, to avoid putting entries
+ * into freePages out-of-order (doesn't seem worth any extra code to
+ * handle the case).
*/
if (minoff > maxoff)
delete_now = (blkno == orig_blkno);
else
- stats->num_index_tuples += maxoff - minoff + 1;
+ stats->num_index_tuples += nhtidslive;
+
+ Assert(!delete_now || nhtidslive == 0);
}
if (delete_now)
/*
* This is really tail recursion, but if the compiler is too stupid to
* optimize it as such, we'd eat an uncomfortably large amount of stack
- * space per recursion level (due to the deletable[] array). A failure is
- * improbable since the number of levels isn't likely to be large ... but
- * just in case, let's hand-optimize into a loop.
+ * space per recursion level (due to the arrays used to track details of
+ * deletable/updatable items). A failure is improbable since the number
+ * of levels isn't likely to be large ... but just in case, let's
+ * hand-optimize into a loop.
*/
if (recurse_to != P_NONE)
{
}
}
+/*
+ * btreevacuumposting --- determine TIDs still needed in posting list
+ *
+ * Returns metadata describing how to build replacement tuple without the TIDs
+ * that VACUUM needs to delete. Returned value is NULL in the common case
+ * where no changes are needed to caller's posting list tuple (we avoid
+ * allocating memory here as an optimization).
+ *
+ * The number of TIDs that should remain in the posting list tuple is set for
+ * caller in *nremaining.
+ */
+static BTVacuumPosting
+btreevacuumposting(BTVacState *vstate, IndexTuple posting,
+ OffsetNumber updatedoffset, int *nremaining)
+{
+ int live = 0;
+ int nitem = BTreeTupleGetNPosting(posting);
+ ItemPointer items = BTreeTupleGetPosting(posting);
+ BTVacuumPosting vacposting = NULL;
+
+ for (int i = 0; i < nitem; i++)
+ {
+ if (!vstate->callback(items + i, vstate->callback_state))
+ {
+ /* Live table TID */
+ live++;
+ }
+ else if (vacposting == NULL)
+ {
+ /*
+ * First dead table TID encountered.
+ *
+ * It's now clear that we need to delete one or more dead table
+ * TIDs, so start maintaining metadata describing how to update
+ * existing posting list tuple.
+ */
+ vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
+ nitem * sizeof(uint16));
+
+ vacposting->itup = posting;
+ vacposting->updatedoffset = updatedoffset;
+ vacposting->ndeletedtids = 0;
+ vacposting->deletetids[vacposting->ndeletedtids++] = i;
+ }
+ else
+ {
+ /* Second or subsequent dead table TID */
+ vacposting->deletetids[vacposting->ndeletedtids++] = i;
+ }
+ }
+
+ *nremaining = live;
+ return vacposting;
+}
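+
+/*
+ * As an example of the contract above (illustrative values): given a
+ * posting list tuple with four TIDs where the callback reports the TIDs at
+ * posting list offsets 1 and 2 as dead, the returned BTVacuumPosting has
+ * ndeletedtids == 2, deletetids[] == {1, 2}, and *nremaining set to 2;
+ * btvacuumpage() then queues it in its updatable[] array for the later
+ * _bt_delitems_vacuum() call.  When no TID is dead, NULL is returned and
+ * no memory is allocated at all.
+ */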
+
/*
* btcanreturn() -- Check whether btree indexes support index-only scans.
*
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
+static int _bt_binsrch_posting(BTScanInsert key, Page page,
+ OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup);
+static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, ItemPointer heapTid,
+ IndexTuple itup);
+static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
offnum = _bt_binsrch(rel, key, *bufP);
itemid = PageGetItemId(page, offnum);
itup = (IndexTuple) PageGetItem(page, itemid);
+ Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
blkno = BTreeTupleGetDownLink(itup);
par_blkno = BufferGetBlockNumber(*bufP);
* low) makes bounds invalid.
*
* Caller is responsible for invalidating bounds when it modifies the page
- * before calling here a second time.
+ * before calling here a second time, and for dealing with posting list
+ * tuple matches (callers can use insertstate's postingoff field to
+ * determine which existing heap TID will need to be replaced by a posting
+ * list split).
*/
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Assert(P_ISLEAF(opaque));
Assert(!key->nextkey);
+ Assert(insertstate->postingoff == 0);
if (!insertstate->bounds_valid)
{
if (result != 0)
stricthigh = high;
}
+
+ /*
+ * If tuple at offset located by binary search is a posting list whose
+ * TID range overlaps with caller's scantid, perform posting list
+ * binary search to set postingoff for caller. Caller must split the
+ * posting list when postingoff is set. This should happen
+ * infrequently.
+ */
+ if (unlikely(result == 0 && key->scantid != NULL))
+ insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
}
/*
return low;
}
+/*----------
+ * _bt_binsrch_posting() -- posting list binary search.
+ *
+ * Helper routine for _bt_binsrch_insert().
+ *
+ * Returns offset into posting list where caller's scantid belongs.
+ *----------
+ */
+static int
+_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
+{
+ IndexTuple itup;
+ ItemId itemid;
+ int low,
+ high,
+ mid,
+ res;
+
+ /*
+ * If this isn't a posting tuple, then the index must be corrupt (if it is
+ * an ordinary non-pivot tuple then there must be an existing tuple with a
+ * heap TID that equals inserter's new heap TID/scantid). Defensively
+ * check that tuple is a posting list tuple whose posting list range
+ * includes caller's scantid.
+ *
+ * (This is also needed because contrib/amcheck's rootdescend option needs
+ * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
+ */
+ itemid = PageGetItemId(page, offnum);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ if (!BTreeTupleIsPosting(itup))
+ return 0;
+
+ Assert(key->heapkeyspace && key->allequalimage);
+
+ /*
+ * In the event that posting list tuple has LP_DEAD bit set, indicate this
+ * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
+ * second call to _bt_binsrch_insert() can take place when its caller has
+ * removed the dead item.
+ */
+ if (ItemIdIsDead(itemid))
+ return -1;
+
+ /* "high" is past end of posting list for loop invariant */
+ low = 0;
+ high = BTreeTupleGetNPosting(itup);
+ Assert(high >= 2);
+
+ while (high > low)
+ {
+ mid = low + ((high - low) / 2);
+ res = ItemPointerCompare(key->scantid,
+ BTreeTupleGetPostingN(itup, mid));
+
+ if (res > 0)
+ low = mid + 1;
+ else if (res < 0)
+ high = mid;
+ else
+ return mid;
+ }
+
+ /* Exact match not found */
+ return low;
+}
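+
+/*
+ * For example (illustrative TIDs only): if the posting list at offnum
+ * holds the heap TIDs (10,1), (10,4) and (10,9) and caller's scantid is
+ * (10,5), the search converges on low == 2, so _bt_binsrch_insert() sets
+ * postingoff to 2 and its caller performs a posting list split: (10,5)
+ * ends up inside the posting list, while (10,9) becomes the new item's
+ * heap TID (see _bt_swap_posting()).
+ */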
+
/*----------
* _bt_compare() -- Compare insertion-type scankey to tuple on a page.
*
* <0 if scankey < tuple at offnum;
* 0 if scankey == tuple at offnum;
* >0 if scankey > tuple at offnum.
- * NULLs in the keys are treated as sortable values. Therefore
- * "equality" does not necessarily mean that the item should be
- * returned to the caller as a matching key!
+ *
+ * NULLs in the keys are treated as sortable values. Therefore
+ * "equality" does not necessarily mean that the item should be returned
+ * to the caller as a matching key. Similarly, an insertion scankey
+ * with its scantid set is treated as equal to a posting tuple whose TID
+ * range overlaps with that scantid. There generally won't be a
+ * matching TID in the posting tuple, which the caller must handle for
+ * itself (e.g., by splitting the posting list tuple).
*
* CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
* "minus infinity": this routine will always claim it is less than the
ScanKey scankey;
int ncmpkey;
int ntupatts;
+ int32 result;
Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
ncmpkey = Min(ntupatts, key->keysz);
Assert(key->heapkeyspace || ncmpkey == key->keysz);
+ Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
scankey = key->scankeys;
for (int i = 1; i <= ncmpkey; i++)
{
Datum datum;
bool isNull;
- int32 result;
datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
if (heapTid == NULL)
return 1;
+ /*
+ * Scankey must be treated as equal to a posting list tuple if its scantid
+ * value falls within the range of the posting list. In all other cases
+ * there can only be a single heap TID value, which is compared directly
+ * with scantid.
+ */
Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
- return ItemPointerCompare(key->scantid, heapTid);
+ result = ItemPointerCompare(key->scantid, heapTid);
+ if (result <= 0 || !BTreeTupleIsPosting(itup))
+ return result;
+ else
+ {
+ result = ItemPointerCompare(key->scantid,
+ BTreeTupleGetMaxHeapTID(itup));
+ if (result > 0)
+ return 1;
+ }
+
+ return 0;
}
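+
+/*
+ * To make the posting list case above concrete (TIDs are illustrative):
+ * if the tuple at offnum is a posting list covering heap TIDs (10,1)
+ * through (10,9) and all key columns compare equal, a scantid of (10,5)
+ * yields 0 ("equal"), a scantid of (10,12) yields 1, and a scantid at or
+ * below (10,1) simply yields the result of the direct comparison against
+ * the tuple's first heap TID.
+ */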
/*
}
/* Initialize remaining insertion scan key fields */
- inskey.heapkeyspace = _bt_heapkeyspace(rel);
+ _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
inskey.anynullkeys = false; /* unused */
inskey.nextkey = nextkey;
inskey.pivotsearch = false;
if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
{
- /* tuple passes all scan key conditions, so remember it */
- _bt_saveitem(so, itemIndex, offnum, itup);
- itemIndex++;
+ /* tuple passes all scan key conditions */
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Remember it */
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ itemIndex++;
+ }
+ else
+ {
+ int tupleOffset;
+
+ /*
+ * Set up state to return posting list, and remember first
+ * TID
+ */
+ tupleOffset =
+ _bt_setuppostingitems(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, 0),
+ itup);
+ itemIndex++;
+ /* Remember additional TIDs */
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ tupleOffset);
+ itemIndex++;
+ }
+ }
}
/* When !continuescan, there can't be any more matches, so stop */
if (!continuescan)
if (!continuescan)
so->currPos.moreRight = false;
- Assert(itemIndex <= MaxIndexTuplesPerPage);
+ Assert(itemIndex <= MaxTIDsPerBTreePage);
so->currPos.firstItem = 0;
so->currPos.lastItem = itemIndex - 1;
so->currPos.itemIndex = 0;
else
{
/* load items[] in descending order */
- itemIndex = MaxIndexTuplesPerPage;
+ itemIndex = MaxTIDsPerBTreePage;
offnum = Min(offnum, maxoff);
&continuescan);
if (passes_quals && tuple_alive)
{
- /* tuple passes all scan key conditions, so remember it */
- itemIndex--;
- _bt_saveitem(so, itemIndex, offnum, itup);
+ /* tuple passes all scan key conditions */
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Remember it */
+ itemIndex--;
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ }
+ else
+ {
+ int tupleOffset;
+
+ /*
+ * Set up state to return posting list, and remember first
+ * TID.
+ *
+ * Note that we deliberately save/return items from
+ * posting lists in ascending heap TID order for backwards
+ * scans. This allows _bt_killitems() to make a
+ * consistent assumption about the order of items
+ * associated with the same posting list tuple.
+ */
+ itemIndex--;
+ tupleOffset =
+ _bt_setuppostingitems(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, 0),
+ itup);
+ /* Remember additional TIDs */
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ itemIndex--;
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ tupleOffset);
+ }
+ }
}
if (!continuescan)
{
Assert(itemIndex >= 0);
so->currPos.firstItem = itemIndex;
- so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
- so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
+ so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
+ so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
}
return (so->currPos.firstItem <= so->currPos.lastItem);
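
The switch from MaxIndexTuplesPerPage to MaxTIDsPerBTreePage above reflects that itemIndex now counts saved TIDs rather than saved tuples, so every TID of every posting list on the page may need its own items[] slot. For reference, the cap is simply how many bare item pointers fit in a page's data area; the definition below is believed to match the patch's nbtree.h, and the worked number assumes the default 8kB BLCKSZ:

/* Sketch of the sizing assumption behind the new assertions */
#define MaxTIDsPerBTreePage \
	(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
		   sizeof(ItemPointerData))

/* With 8192-byte pages: (8192 - 24 - 16) / 6 = 1358 items[] entries */
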
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+ Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
+
currItem->heapTid = itup->t_tid;
currItem->indexOffset = offnum;
if (so->currTuples)
}
}
+/*
+ * Set up state to save TIDs/items from a single posting list tuple.
+ *
+ * Saves an index item into so->currPos.items[itemIndex] for the TID that is
+ * returned to the scan first. Second and subsequent TIDs from the posting
+ * list should be saved by calling _bt_savepostingitem().
+ *
+ * Returns the offset into tuple storage space at which the base tuple is
+ * stored, if tuple storage is needed at all.
+ */
+static int
+_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ Assert(BTreeTupleIsPosting(itup));
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
+ {
+ /* Save base IndexTuple (truncate posting list) */
+ IndexTuple base;
+ Size itupsz = BTreeTupleGetPostingOffset(itup);
+
+ itupsz = MAXALIGN(itupsz);
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
+ memcpy(base, itup, itupsz);
+ /* Defensively reduce work area index tuple header size */
+ base->t_info &= ~INDEX_SIZE_MASK;
+ base->t_info |= itupsz;
+ so->currPos.nextTupleOffset += itupsz;
+
+ return currItem->tupleOffset;
+ }
+
+ return 0;
+}
+
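
To make the truncation in _bt_setuppostingitems() concrete, a worked example with invented sizes: if the posting list tuple is 120 bytes long overall and BTreeTupleGetPostingOffset() reports that its posting list starts at byte 40, then itupsz = MAXALIGN(40) = 40, only those first 40 bytes are copied into so->currTuples, and the copied header's size bits are rewritten so that IndexTupleSize(base) afterwards reports 40 rather than 120. The posting list TIDs themselves are not needed in the work area, since each items[] entry already carries its own heapTid.
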
+/*
+ * Save an index item into so->currPos.items[itemIndex] for the current
+ * posting list tuple.
+ *
+ * Assumes that _bt_setuppostingitems() has already been called for the
+ * current posting list tuple. The caller passes its return value here as
+ * tupleOffset.
+ */
+static inline void
+_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+
+ /*
+ * Have index-only scans return the same base IndexTuple for every TID
+ * that originates from the same posting list
+ */
+ if (so->currTuples)
+ currItem->tupleOffset = tupleOffset;
+}
+
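
A rough sketch of how tupleOffset is consumed later, loosely modeled on the way nbtsearch.c hands tuples back for index-only scans (not an exact excerpt from the patch): because every item saved from one posting list stores the same tupleOffset, they all resolve to the single truncated base tuple copied by _bt_setuppostingitems().

	/* Sketch: resolving a saved item back to its (shared) base IndexTuple */
	BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];

	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
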
/*
* _bt_steppage() -- Step to next page containing valid data for scan
*
BlockNumber btps_blkno; /* block # to write this page at */
IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
OffsetNumber btps_lastoff; /* last item offset loaded */
+ Size btps_lastextra; /* last item's extra posting list space */
uint32 btps_level; /* tree level (0 = leaf) */
Size btps_full; /* "full" if less than this much free space */
struct BTPageState *btps_next; /* link to parent level, if any */
static void _bt_sortaddtup(Page page, Size itemsize,
IndexTuple itup, OffsetNumber itup_off);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
- IndexTuple itup);
+ IndexTuple itup, Size truncextra);
+static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
+ BTPageState *state,
+ BTDedupState dstate);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
BTSpool *btspool, BTSpool *btspool2);
wstate.heap = btspool->heap;
wstate.index = btspool->index;
wstate.inskey = _bt_mkscankey(wstate.index, NULL);
+ /* _bt_mkscankey() won't set allequalimage without metapage */
+ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
/*
* We need to log index creation in WAL iff WAL archiving/streaming is
state->btps_lowkey = NULL;
/* initialize lastoff so first item goes into P_FIRSTKEY */
state->btps_lastoff = P_HIKEY;
+ state->btps_lastextra = 0;
state->btps_level = level;
/* set "full" threshold based on level. See notes at head of file. */
if (level > 0)
}
/*----------
- * Add an item to a disk page from the sort output.
+ * Add an item to a disk page from the sort output (or add a posting list
+ * item formed from the sort output).
*
* We must be careful to observe the page layout conventions of nbtsearch.c:
* - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
* the truncated high key at offset 1.
*
* 'last' pointer indicates the last offset added to the page.
+ *
+ * 'truncextra' is the size of the posting list in itup, if any. This
+ * information is stashed for the next call here, when we may benefit
+ * from considering the impact of truncating away the posting list on
+ * the page before deciding to finish the page off. Posting lists are
+ * often relatively large, so it is worth going to the trouble of
+ * accounting for the saving from truncating away the posting list of
+ * the tuple that becomes the high key (that may be the only way to
+ * get close to target free space on the page). Note that this is
+ * only used for the soft fillfactor-wise limit, not the critical hard
+ * limit.
*----------
*/
static void
-_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
+_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
+ Size truncextra)
{
Page npage;
BlockNumber nblkno;
OffsetNumber last_off;
+ Size last_truncextra;
Size pgspc;
Size itupsz;
bool isleaf;
npage = state->btps_page;
nblkno = state->btps_blkno;
last_off = state->btps_lastoff;
+ last_truncextra = state->btps_lastextra;
+ state->btps_lastextra = truncextra;
pgspc = PageGetFreeSpace(npage);
itupsz = IndexTupleSize(itup);
* page. Disregard fillfactor and insert on "full" current page if we
* don't have the minimum number of items yet. (Note that we deliberately
* assume that suffix truncation neither enlarges nor shrinks new high key
- * when applying soft limit.)
+ * when applying soft limit, except when last tuple has a posting list.)
*/
if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
- (pgspc < state->btps_full && last_off > P_FIRSTKEY))
+ (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
{
/*
* Finish off the page and write it out.
* We don't try to bias our choice of split point to make it more
* likely that _bt_truncate() can truncate away more attributes,
* whereas the split point used within _bt_split() is chosen much
- * more delicately. Suffix truncation is mostly useful because it
- * improves space utilization for workloads with random
- * insertions. It doesn't seem worthwhile to add logic for
- * choosing a split point here for a benefit that is bound to be
- * much smaller.
+ * more delicately. Even so, the lastleft and firstright
+ * tuples passed to _bt_truncate() here are at least not fully
+ * equal to each other when deduplication is used, unless there is
+ * a large group of duplicates (also, unique index builds usually
+ * have few or no spool2 duplicates). When the split point is
+ * between two unequal tuples, _bt_truncate() will avoid including
+ * a heap TID in the new high key, which is the most important
+ * benefit of suffix truncation.
*
* Overwrite the old item with new truncated high key directly.
* oitup is already located at the physical beginning of tuple
Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
!P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
- _bt_buildadd(wstate, state->btps_next, state->btps_lowkey);
+ _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
pfree(state->btps_lowkey);
/*
state->btps_lastoff = last_off;
}
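
A worked example of the soft-limit accounting in _bt_buildadd() (all numbers invented for illustration): suppose a leaf page has pgspc = 400 bytes of free space, btps_full = 820 bytes, and the previously added tuple carried a posting list, so last_truncextra = 500.

	pgspc + last_truncextra = 400 + 500 = 900, which is >= btps_full (820)

The soft limit therefore does not trigger and the page stays open: if that previous tuple ends up becoming the high key, _bt_truncate() is expected to strip its posting list and recover roughly last_truncextra bytes. With last_truncextra = 0, the same page (400 < 820) would have been finished off. The hard space check in the first half of the test is unaffected either way.
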
+/*
+ * Finalize the pending posting list tuple, and add it to the index. The
+ * final tuple is based on the saved base tuple and the saved list of heap
+ * TIDs.
+ *
+ * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
+ * using _bt_buildadd().
+ */
+static void
+_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
+ BTDedupState dstate)
+{
+ Assert(dstate->nitems > 0);
+
+ if (dstate->nitems == 1)
+ _bt_buildadd(wstate, state, dstate->base, 0);
+ else
+ {
+ IndexTuple postingtuple;
+ Size truncextra;
+
+ /* form a tuple with a posting list */
+ postingtuple = _bt_form_posting(dstate->base,
+ &