Snapshot caching, introduced in
623a9ba79b, did not increment
xactCompletionCount during subtransaction abort. That could lead to an older
snapshot being reused. That is, at least as far as I can see, not a
correctness issue (for MVCC snapshots there's no difference between "in
progress" and "aborted"). The only difference between the old and new
snapshots would be a newer ->xmax.
While HeapTupleSatisfiesMVCC makes the same visibility determination, reusing
the old snapshot leads HeapTupleSatisfiesMVCC to not set
HEAP_XMIN_INVALID. Which subsequently causes the kill_prior_tuple optimization
to not kick in (via HeapTupleIsSurelyDead() returning false). The performance
effects of doing the same index-lookups over and over again is how the issue
was discovered...
Fix the issue by incrementing xactCompletionCount in
XidCacheRemoveRunningXids. It already acquires ProcArrayLock exclusively,
making that an easy proposition.
Add a test to ensure that kill_prior_tuple prevents index growth when it
involves aborted subtransaction of the current transaction.
Author: Andres Freund
Discussion: https://postgr.es/m/
20210406043521.lopeo7bbigad3n6t@alap3.anarazel.de
Discussion: https://postgr.es/m/
20210317055718.v6qs3ltzrformqoa%40alap3.anarazel.de
*/
MaintainLatestCompletedXidRecovery(running->latestCompletedXid);
+ /*
+ * NB: No need to increment ShmemVariableCache->xactCompletionCount here,
+ * nobody can see it yet.
+ */
+
LWLockRelease(ProcArrayLock);
/* ShmemVariableCache->nextXid must be beyond any observed xid. */
/* Also advance global latestCompletedXid while holding the lock */
MaintainLatestCompletedXid(latestXid);
+ /* ... and xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
LWLockRelease(ProcArrayLock);
}
--- /dev/null
+--
+-- Verify that index scans encountering dead rows produced by an
+-- aborted subtransaction of the current transaction can utilize the
+-- kill_prio_tuple optimization
+--
+-- NB: The table size is currently *not* expected to stay the same, we
+-- don't have logic to trigger opportunistic pruning in cases like
+-- this.
+BEGIN;
+SET LOCAL enable_seqscan = false;
+SET LOCAL enable_indexonlyscan = false;
+SET LOCAL enable_bitmapscan = false;
+-- Can't easily use a unique index, since dead tuples can be found
+-- independent of the kill_prior_tuples optimization.
+CREATE TABLE clean_aborted_self(key int, data text);
+CREATE INDEX clean_aborted_self_key ON clean_aborted_self(key);
+INSERT INTO clean_aborted_self (key, data) VALUES (-1, 'just to allocate metapage');
+-- save index size from before the changes, for comparison
+SELECT pg_relation_size('clean_aborted_self_key') AS clean_aborted_self_key_before \gset
+DO $$
+BEGIN
+ -- iterate often enough to see index growth even on larger-than-default page sizes
+ FOR i IN 1..100 LOOP
+ BEGIN
+ -- perform index scan over all the inserted keys to get them to be seen as dead
+ IF EXISTS(SELECT * FROM clean_aborted_self WHERE key > 0 AND key < 100) THEN
+ RAISE data_corrupted USING MESSAGE = 'these rows should not exist';
+ END IF;
+ INSERT INTO clean_aborted_self SELECT g.i, 'rolling back in a sec' FROM generate_series(1, 100) g(i);
+ -- just some error that's not normally thrown
+ RAISE reading_sql_data_not_permitted USING MESSAGE = 'round and round again';
+ EXCEPTION WHEN reading_sql_data_not_permitted THEN END;
+ END LOOP;
+END;$$;
+-- show sizes only if they differ
+SELECT :clean_aborted_self_key_before AS size_before, pg_relation_size('clean_aborted_self_key') size_after
+WHERE :clean_aborted_self_key_before != pg_relation_size('clean_aborted_self_key');
+ size_before | size_after
+-------------+------------
+(0 rows)
+
+ROLLBACK;
# geometry depends on point, lseg, box, path, polygon and circle
# horology depends on interval, timetz, timestamp, timestamptz
# ----------
-test: geometry horology regex type_sanity opr_sanity misc_sanity comments expressions unicode xid
+test: geometry horology regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc
# ----------
# These four each depend on the previous one
test: int8
test: oid
test: xid
+test: mvcc
test: float4
test: float8
test: bit
--- /dev/null
+--
+-- Verify that index scans encountering dead rows produced by an
+-- aborted subtransaction of the current transaction can utilize the
+-- kill_prio_tuple optimization
+--
+-- NB: The table size is currently *not* expected to stay the same, we
+-- don't have logic to trigger opportunistic pruning in cases like
+-- this.
+BEGIN;
+
+SET LOCAL enable_seqscan = false;
+SET LOCAL enable_indexonlyscan = false;
+SET LOCAL enable_bitmapscan = false;
+
+-- Can't easily use a unique index, since dead tuples can be found
+-- independent of the kill_prior_tuples optimization.
+CREATE TABLE clean_aborted_self(key int, data text);
+CREATE INDEX clean_aborted_self_key ON clean_aborted_self(key);
+INSERT INTO clean_aborted_self (key, data) VALUES (-1, 'just to allocate metapage');
+
+-- save index size from before the changes, for comparison
+SELECT pg_relation_size('clean_aborted_self_key') AS clean_aborted_self_key_before \gset
+
+DO $$
+BEGIN
+ -- iterate often enough to see index growth even on larger-than-default page sizes
+ FOR i IN 1..100 LOOP
+ BEGIN
+ -- perform index scan over all the inserted keys to get them to be seen as dead
+ IF EXISTS(SELECT * FROM clean_aborted_self WHERE key > 0 AND key < 100) THEN
+ RAISE data_corrupted USING MESSAGE = 'these rows should not exist';
+ END IF;
+ INSERT INTO clean_aborted_self SELECT g.i, 'rolling back in a sec' FROM generate_series(1, 100) g(i);
+ -- just some error that's not normally thrown
+ RAISE reading_sql_data_not_permitted USING MESSAGE = 'round and round again';
+ EXCEPTION WHEN reading_sql_data_not_permitted THEN END;
+ END LOOP;
+END;$$;
+
+-- show sizes only if they differ
+SELECT :clean_aborted_self_key_before AS size_before, pg_relation_size('clean_aborted_self_key') size_after
+WHERE :clean_aborted_self_key_before != pg_relation_size('clean_aborted_self_key');
+
+ROLLBACK;