In basebackup.c, refactor to create verify_page_checksum.
authorRobert Haas <rhaas@postgresql.org>
Tue, 3 Oct 2023 14:37:20 +0000 (10:37 -0400)
committerRobert Haas <rhaas@postgresql.org>
Tue, 3 Oct 2023 14:37:20 +0000 (10:37 -0400)
If checksum verification fails for a particular page, we reread the
page and try one more time. The code that does this somewhat complex
and difficult to follow. Move some of the logic into a new function
and rearrange the code a bit to try to make it clearer. This way,
we don't need the block_retry Boolean, a couple of other variables
move from sendFile() into the new function, and some code is now less
deeply indented.

Patch by me, reviewed by David Steele.

Discussion: http://postgr.es/m/CA+TgmoYt5jXH4U6cu1dm9Oe2FTn1aae6hBNhZzJJjyjbE_zYig@mail.gmail.com

src/backend/backup/basebackup.c

index 5d66014499af58125a872d50956a332a0eac64fb..56e020732b832a143f30c3a450b8979799f0d5dc 100644 (file)
@@ -83,6 +83,9 @@ static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeo
 static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
                     struct stat *statbuf, bool missing_ok, Oid dboid,
                     backup_manifest_info *manifest, const char *spcoid);
+static bool verify_page_checksum(Page page, XLogRecPtr start_lsn,
+                                BlockNumber blkno,
+                                uint16 *expected_checksum);
 static void sendFileWithContent(bbsink *sink, const char *filename,
                                const char *content,
                                backup_manifest_info *manifest);
@@ -1485,14 +1488,11 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
 {
    int         fd;
    BlockNumber blkno = 0;
-   bool        block_retry = false;
-   uint16      checksum;
    int         checksum_failures = 0;
    off_t       cnt;
    int         i;
    pgoff_t     len = 0;
    char       *page;
-   PageHeader  phdr;
    int         segmentno = 0;
    char       *segmentpath;
    bool        verify_checksum = false;
@@ -1582,94 +1582,78 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
        {
            for (i = 0; i < cnt / BLCKSZ; i++)
            {
+               int         reread_cnt;
+               uint16      expected_checksum;
+
                page = sink->bbs_buffer + BLCKSZ * i;
 
+               /* If the page is OK, go on to the next one. */
+               if (verify_page_checksum(page, sink->bbs_state->startptr,
+                                        blkno + i + segmentno * RELSEG_SIZE,
+                                        &expected_checksum))
+                   continue;
+
                /*
-                * Only check pages which have not been modified since the
-                * start of the base backup. Otherwise, they might have been
-                * written only halfway and the checksum would not be valid.
-                * However, replaying WAL would reinstate the correct page in
-                * this case. We also skip completely new pages, since they
-                * don't have a checksum yet.
+                * Retry the block on the first failure.  It's possible that
+                * we read the first 4K page of the block just before postgres
+                * updated the entire block so it ends up looking torn to us.
+                * If, before we retry the read, the concurrent write of the
+                * block finishes, the page LSN will be updated and we'll
+                * realize that we should ignore this block.
+                *
+                * There's no guarantee that this will actually happen,
+                * though: the torn write could take an arbitrarily long time
+                * to complete. Retrying multiple times wouldn't fix this
+                * problem, either, though it would reduce the chances of it
+                * happening in practice. The only real fix here seems to be
+                * to have some kind of interlock that allows us to wait until
+                * we can be certain that no write to the block is in
+                * progress. Since we don't have any such thing right now, we
+                * just do this and hope for the best.
                 */
-               if (!PageIsNew(page) && PageGetLSN(page) < sink->bbs_state->startptr)
+               reread_cnt =
+                   basebackup_read_file(fd,
+                                        sink->bbs_buffer + BLCKSZ * i,
+                                        BLCKSZ, len + BLCKSZ * i,
+                                        readfilename,
+                                        false);
+               if (reread_cnt == 0)
                {
-                   checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
-                   phdr = (PageHeader) page;
-                   if (phdr->pd_checksum != checksum)
-                   {
-                       /*
-                        * Retry the block on the first failure.  It's
-                        * possible that we read the first 4K page of the
-                        * block just before postgres updated the entire block
-                        * so it ends up looking torn to us. If, before we
-                        * retry the read, the concurrent write of the block
-                        * finishes, the page LSN will be updated and we'll
-                        * realize that we should ignore this block.
-                        *
-                        * There's no guarantee that this will actually
-                        * happen, though: the torn write could take an
-                        * arbitrarily long time to complete. Retrying
-                        * multiple times wouldn't fix this problem, either,
-                        * though it would reduce the chances of it happening
-                        * in practice. The only real fix here seems to be to
-                        * have some kind of interlock that allows us to wait
-                        * until we can be certain that no write to the block
-                        * is in progress. Since we don't have any such thing
-                        * right now, we just do this and hope for the best.
-                        */
-                       if (block_retry == false)
-                       {
-                           int         reread_cnt;
-
-                           /* Reread the failed block */
-                           reread_cnt =
-                               basebackup_read_file(fd,
-                                                    sink->bbs_buffer + BLCKSZ * i,
-                                                    BLCKSZ, len + BLCKSZ * i,
-                                                    readfilename,
-                                                    false);
-                           if (reread_cnt == 0)
-                           {
-                               /*
-                                * If we hit end-of-file, a concurrent
-                                * truncation must have occurred, so break out
-                                * of this loop just as if the initial fread()
-                                * returned 0. We'll drop through to the same
-                                * code that handles that case. (We must fix
-                                * up cnt first, though.)
-                                */
-                               cnt = BLCKSZ * i;
-                               break;
-                           }
-
-                           /* Set flag so we know a retry was attempted */
-                           block_retry = true;
-
-                           /* Reset loop to validate the block again */
-                           i--;
-                           continue;
-                       }
-
-                       checksum_failures++;
-
-                       if (checksum_failures <= 5)
-                           ereport(WARNING,
-                                   (errmsg("checksum verification failed in "
-                                           "file \"%s\", block %u: calculated "
-                                           "%X but expected %X",
-                                           readfilename, blkno, checksum,
-                                           phdr->pd_checksum)));
-                       if (checksum_failures == 5)
-                           ereport(WARNING,
-                                   (errmsg("further checksum verification "
-                                           "failures in file \"%s\" will not "
-                                           "be reported", readfilename)));
-                   }
+                   /*
+                    * If we hit end-of-file, a concurrent truncation must
+                    * have occurred, so break out of this loop just as if the
+                    * initial fread() returned 0. We'll drop through to the
+                    * same code that handles that case. (We must fix up cnt
+                    * first, though.)
+                    */
+                   cnt = BLCKSZ * i;
+                   break;
                }
-               block_retry = false;
-               blkno++;
+
+               /* If the page now looks OK, go on to the next one. */
+               if (verify_page_checksum(page, sink->bbs_state->startptr,
+                                        blkno + i + segmentno * RELSEG_SIZE,
+                                        &expected_checksum))
+                   continue;
+
+               /* Handle checksum failure. */
+               checksum_failures++;
+               if (checksum_failures <= 5)
+                   ereport(WARNING,
+                           (errmsg("checksum verification failed in "
+                                   "file \"%s\", block %u: calculated "
+                                   "%X but expected %X",
+                                   readfilename, blkno + i, expected_checksum,
+                                   ((PageHeader) page)->pd_checksum)));
+               if (checksum_failures == 5)
+                   ereport(WARNING,
+                           (errmsg("further checksum verification "
+                                   "failures in file \"%s\" will not "
+                                   "be reported", readfilename)));
            }
+
+           /* Update block number for next pass through the outer loop. */
+           blkno += i;
        }
 
        /*
@@ -1734,6 +1718,42 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
    return true;
 }
 
+/*
+ * Try to verify the checksum for the provided page, if it seems appropriate
+ * to do so.
+ *
+ * Returns true if verification succeeds or if we decide not to check it,
+ * and false if verification fails. When return false, it also sets
+ * *expected_checksum to the computed value.
+ */
+static bool
+verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno,
+                    uint16 *expected_checksum)
+{
+   PageHeader  phdr;
+   uint16      checksum;
+
+   /*
+    * Only check pages which have not been modified since the start of the
+    * base backup. Otherwise, they might have been written only halfway and
+    * the checksum would not be valid.  However, replaying WAL would
+    * reinstate the correct page in this case. We also skip completely new
+    * pages, since they don't have a checksum yet.
+    */
+   if (PageIsNew(page) || PageGetLSN(page) >= start_lsn)
+       return true;
+
+   /* Perform the actual checksum calculation. */
+   checksum = pg_checksum_page(page, blkno);
+
+   /* See whether it matches the value from the page. */
+   phdr = (PageHeader) page;
+   if (phdr->pd_checksum == checksum)
+       return true;
+   *expected_checksum = checksum;
+   return false;
+}
+
 static int64
 _tarWriteHeader(bbsink *sink, const char *filename, const char *linktarget,
                struct stat *statbuf, bool sizeonly)