Optimize pg_visibility with read streams.
authorNoah Misch <noah@leadboat.com>
Tue, 10 Sep 2024 22:21:33 +0000 (15:21 -0700)
committerNoah Misch <noah@leadboat.com>
Tue, 10 Sep 2024 22:21:33 +0000 (15:21 -0700)
We've measured 5% performance improvement, and this arranges to benefit
automatically from future optimizations to the read_stream subsystem.
The area lacked test coverage, so close that gap.

Nazir Bilal Yavuz

Discussion: https://postgr.es/m/CAN55FZ1_Ru3XpMgTwsU67FTH2fs_FrRROmb7x6zs+F44QBEiww@mail.gmail.com
Discussion: https://postgr.es/m/CAEudQAozv3wTY5TV2t29JcwPydbmKbiWQkZD42S2OgzdixPMDQ@mail.gmail.com

contrib/pg_visibility/meson.build
contrib/pg_visibility/pg_visibility.c
contrib/pg_visibility/t/002_corrupt_vm.pl [new file with mode: 0644]

index f3c1263313a250abf08144d540b58ff7b49c75fc..609fc5f9f3edf86ec9ada5549f3536513bcee60c 100644 (file)
@@ -36,6 +36,7 @@ tests += {
   'tap': {
     'tests': [
       't/001_concurrent_transaction.pl',
+      't/002_corrupt_vm.pl',
     ],
   },
 }
index 773ba92e454fd288fccc025cb333f27ffc86a376..724122b1bc584d157a48cfef3c5508f7e19c7e64 100644 (file)
@@ -21,6 +21,7 @@
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 #include "storage/smgr.h"
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
@@ -41,6 +42,17 @@ typedef struct corrupt_items
        ItemPointer tids;
 } corrupt_items;
 
+/* for collect_corrupt_items_read_stream_next_block */
+struct collect_corrupt_items_read_stream_private
+{
+       bool            all_frozen;
+       bool            all_visible;
+       BlockNumber current_blocknum;
+       BlockNumber last_exclusive;
+       Relation        rel;
+       Buffer          vmbuffer;
+};
+
 PG_FUNCTION_INFO_V1(pg_visibility_map);
 PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
 PG_FUNCTION_INFO_V1(pg_visibility);
@@ -478,6 +490,8 @@ collect_visibility_data(Oid relid, bool include_pd)
        BlockNumber blkno;
        Buffer          vmbuffer = InvalidBuffer;
        BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+       BlockRangeReadStreamPrivate p;
+       ReadStream *stream = NULL;
 
        rel = relation_open(relid, AccessShareLock);
 
@@ -489,6 +503,20 @@ collect_visibility_data(Oid relid, bool include_pd)
        info->next = 0;
        info->count = nblocks;
 
+       /* Create a stream if reading main fork. */
+       if (include_pd)
+       {
+               p.current_blocknum = 0;
+               p.last_exclusive = nblocks;
+               stream = read_stream_begin_relation(READ_STREAM_FULL,
+                                                                                       bstrategy,
+                                                                                       rel,
+                                                                                       MAIN_FORKNUM,
+                                                                                       block_range_read_stream_cb,
+                                                                                       &p,
+                                                                                       0);
+       }
+
        for (blkno = 0; blkno < nblocks; ++blkno)
        {
                int32           mapbits;
@@ -513,8 +541,7 @@ collect_visibility_data(Oid relid, bool include_pd)
                        Buffer          buffer;
                        Page            page;
 
-                       buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-                                                                               bstrategy);
+                       buffer = read_stream_next_buffer(stream, NULL);
                        LockBuffer(buffer, BUFFER_LOCK_SHARE);
 
                        page = BufferGetPage(buffer);
@@ -525,6 +552,12 @@ collect_visibility_data(Oid relid, bool include_pd)
                }
        }
 
+       if (include_pd)
+       {
+               Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+               read_stream_end(stream);
+       }
+
        /* Clean up. */
        if (vmbuffer != InvalidBuffer)
                ReleaseBuffer(vmbuffer);
@@ -610,6 +643,38 @@ GetStrictOldestNonRemovableTransactionId(Relation rel)
        }
 }
 
+/*
+ * Callback function to get next block for read stream object used in
+ * collect_corrupt_items() function.
+ */
+static BlockNumber
+collect_corrupt_items_read_stream_next_block(ReadStream *stream,
+                                                                                        void *callback_private_data,
+                                                                                        void *per_buffer_data)
+{
+       struct collect_corrupt_items_read_stream_private *p = callback_private_data;
+
+       for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
+       {
+               bool            check_frozen = false;
+               bool            check_visible = false;
+
+               /* Make sure we are interruptible. */
+               CHECK_FOR_INTERRUPTS();
+
+               if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
+                       check_frozen = true;
+               if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
+                       check_visible = true;
+               if (!check_visible && !check_frozen)
+                       continue;
+
+               return p->current_blocknum++;
+       }
+
+       return InvalidBlockNumber;
+}
+
 /*
  * Returns a list of items whose visibility map information does not match
  * the status of the tuples on the page.
@@ -628,12 +693,13 @@ static corrupt_items *
 collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 {
        Relation        rel;
-       BlockNumber nblocks;
        corrupt_items *items;
-       BlockNumber blkno;
        Buffer          vmbuffer = InvalidBuffer;
        BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
        TransactionId OldestXmin = InvalidTransactionId;
+       struct collect_corrupt_items_read_stream_private p;
+       ReadStream *stream;
+       Buffer          buffer;
 
        rel = relation_open(relid, AccessShareLock);
 
@@ -643,8 +709,6 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
        if (all_visible)
                OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
 
-       nblocks = RelationGetNumberOfBlocks(rel);
-
        /*
         * Guess an initial array size. We don't expect many corrupted tuples, so
         * start with a small array.  This function uses the "next" field to track
@@ -658,34 +722,38 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
        items->count = 64;
        items->tids = palloc(items->count * sizeof(ItemPointerData));
 
+       p.current_blocknum = 0;
+       p.last_exclusive = RelationGetNumberOfBlocks(rel);
+       p.rel = rel;
+       p.vmbuffer = InvalidBuffer;
+       p.all_frozen = all_frozen;
+       p.all_visible = all_visible;
+       stream = read_stream_begin_relation(READ_STREAM_FULL,
+                                                                               bstrategy,
+                                                                               rel,
+                                                                               MAIN_FORKNUM,
+                                                                               collect_corrupt_items_read_stream_next_block,
+                                                                               &p,
+                                                                               0);
+
        /* Loop over every block in the relation. */
-       for (blkno = 0; blkno < nblocks; ++blkno)
+       while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
        {
-               bool            check_frozen = false;
-               bool            check_visible = false;
-               Buffer          buffer;
+               bool            check_frozen = all_frozen;
+               bool            check_visible = all_visible;
                Page            page;
                OffsetNumber offnum,
                                        maxoff;
+               BlockNumber blkno;
 
                /* Make sure we are interruptible. */
                CHECK_FOR_INTERRUPTS();
 
-               /* Use the visibility map to decide whether to check this page. */
-               if (all_frozen && VM_ALL_FROZEN(rel, blkno, &vmbuffer))
-                       check_frozen = true;
-               if (all_visible && VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-                       check_visible = true;
-               if (!check_visible && !check_frozen)
-                       continue;
-
-               /* Read and lock the page. */
-               buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-                                                                       bstrategy);
                LockBuffer(buffer, BUFFER_LOCK_SHARE);
 
                page = BufferGetPage(buffer);
                maxoff = PageGetMaxOffsetNumber(page);
+               blkno = BufferGetBlockNumber(buffer);
 
                /*
                 * The visibility map bits might have changed while we were acquiring
@@ -778,10 +846,13 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 
                UnlockReleaseBuffer(buffer);
        }
+       read_stream_end(stream);
 
        /* Clean up. */
        if (vmbuffer != InvalidBuffer)
                ReleaseBuffer(vmbuffer);
+       if (p.vmbuffer != InvalidBuffer)
+               ReleaseBuffer(p.vmbuffer);
        relation_close(rel, AccessShareLock);
 
        /*
diff --git a/contrib/pg_visibility/t/002_corrupt_vm.pl b/contrib/pg_visibility/t/002_corrupt_vm.pl
new file mode 100644 (file)
index 0000000..a30d7c7
--- /dev/null
@@ -0,0 +1,109 @@
+# Copyright (c) 2021-2024, PostgreSQL Global Development Group
+
+# Check that pg_check_visible() and pg_check_frozen() report correct TIDs for
+# corruption.
+use strict;
+use warnings FATAL => 'all';
+use File::Copy;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+# Anything holding a snapshot, including auto-analyze of pg_proc, could stop
+# VACUUM from updating the visibility map.
+$node->append_conf('postgresql.conf', 'autovacuum=off');
+$node->start;
+
+my $blck_size = $node->safe_psql("postgres", "SHOW block_size;");
+
+# Create a sample table with at least 10 pages and then run VACUUM. 10 is
+# selected manually as it is big enough to select 5 random tuples from the
+# relation.
+$node->safe_psql(
+       'postgres', qq(
+               CREATE EXTENSION pg_visibility;
+               CREATE TABLE corruption_test
+                       WITH (autovacuum_enabled = false) AS
+                       SELECT
+                               i,
+                               repeat('a', 10) AS data
+                       FROM
+                               generate_series(1, $blck_size) i;
+               VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) corruption_test;
+));
+
+# VACUUM is run, it is safe to get the number of pages.
+my $npages = $node->safe_psql(
+       "postgres",
+       "SELECT relpages FROM pg_class
+               WHERE relname = 'corruption_test';"
+);
+ok($npages >= 10, 'table has at least 10 pages');
+
+my $file = $node->safe_psql("postgres",
+       "SELECT pg_relation_filepath('corruption_test');");
+
+# Delete the first block to make sure that it will be skipped as it is
+# not visible nor frozen.
+$node->safe_psql(
+       "postgres",
+       "DELETE FROM corruption_test
+               WHERE (ctid::text::point)[0] = 0;"
+);
+
+# Copy visibility map.
+$node->stop;
+my $vm_file = $node->data_dir . '/' . $file . '_vm';
+copy("$vm_file", "${vm_file}_temp");
+$node->start;
+
+# Select 5 random tuples that are starting from the second block of the
+# relation. The first block is skipped because it is deleted above.
+my $tuples = $node->safe_psql(
+       "postgres",
+       "SELECT ctid FROM (
+               SELECT ctid FROM corruption_test
+                       WHERE (ctid::text::point)[0] != 0
+                       ORDER BY random() LIMIT 5)
+               ORDER BY ctid ASC;"
+);
+
+# Do the changes below to use tuples in the query.
+# "\n" -> ","
+# "(" -> "'("
+# ")" -> ")'"
+(my $tuples_query = $tuples) =~ s/\n/,/g;
+$tuples_query =~ s/\(/\'\(/g;
+$tuples_query =~ s/\)/\)\'/g;
+
+$node->safe_psql(
+       "postgres",
+       "DELETE FROM corruption_test
+               WHERE ctid in ($tuples_query);"
+);
+
+# Overwrite visibility map with the old one.
+$node->stop;
+move("${vm_file}_temp", "$vm_file");
+$node->start;
+
+my $result = $node->safe_psql(
+       "postgres",
+       "SELECT DISTINCT t_ctid
+               FROM pg_check_visible('corruption_test')
+               ORDER BY t_ctid ASC;"
+);
+is($result, $tuples, 'pg_check_visible must report tuples as corrupted');
+
+$result = $node->safe_psql(
+       "postgres",
+       "SELECT DISTINCT t_ctid
+               FROM pg_check_frozen('corruption_test')
+               ORDER BY t_ctid ASC;"
+);
+is($result, $tuples, 'pg_check_frozen must report tuples as corrupted');
+
+$node->stop;
+done_testing();