In basebackup.c, refactor to create read_file_data_into_buffer.
author     Robert Haas <rhaas@postgresql.org>
           Tue, 3 Oct 2023 15:00:40 +0000 (11:00 -0400)
committer  Robert Haas <rhaas@postgresql.org>
           Tue, 3 Oct 2023 15:00:40 +0000 (11:00 -0400)
This further reduces the length and complexity of sendFile(),
hopefully making it easier to understand and modify. In addition
to moving some logic into a new function, I took this opportunity
to make a few slight adjustments to sendFile() itself, including
renaming the 'len' variable to 'bytes_done', since we use it to
track the number of bytes handled so far, not the total length
of the file.

Patch by me, reviewed by David Steele.

Discussion: http://postgr.es/m/CA+TgmoYt5jXH4U6cu1dm9Oe2FTn1aae6hBNhZzJJjyjbE_zYig@mail.gmail.com
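
For readers skimming the diff, the net effect on sendFile()'s main read
loop is roughly the following. This is a condensed sketch assembled from
the hunks below, not a verbatim excerpt; declarations, the WARNING text,
and the manifest bookkeeping details are elided:

    while (bytes_done < statbuf->st_size)
    {
        size_t      remaining = statbuf->st_size - bytes_done;

        /* Read more data; per-block checksum retries happen in the helper. */
        cnt = read_file_data_into_buffer(sink, readfilename, fd, bytes_done,
                                         remaining,
                                         blkno + segmentno * RELSEG_SIZE,
                                         verify_checksum,
                                         &checksum_failures);

        /* A partial block means checksums can no longer be verified. */
        if (verify_checksum && (cnt % BLCKSZ != 0))
            verify_checksum = false;    /* a WARNING is also issued */

        /* End-of-file implies a concurrent truncation; WAL replay fixes it. */
        if (cnt == 0)
            break;

        /* Advance block number and byte count for the next iteration. */
        blkno += cnt / BLCKSZ;
        bytes_done += cnt;

        /* Archive the data and feed it to the backup checksum. */
        bbsink_archive_contents(sink, cnt);
        if (pg_checksum_update(&checksum_ctx,
                               (uint8 *) sink->bbs_buffer, cnt) < 0)
            elog(ERROR, "could not update checksum of base backup");
    }

All of the per-block verify-and-reread logic that previously lived inline
in this loop now sits in read_file_data_into_buffer(), whose definition
appears at the end of the diff.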

src/backend/backup/basebackup.c

index 56e020732b832a143f30c3a450b8979799f0d5dc..7d025bcf3822d5752f1171bcce511efefdf84bc9 100644
@@ -83,6 +83,12 @@ static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeo
 static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
                                         struct stat *statbuf, bool missing_ok, Oid dboid,
                                         backup_manifest_info *manifest, const char *spcoid);
+static off_t read_file_data_into_buffer(bbsink *sink,
+                                                                               const char *readfilename, int fd,
+                                                                               off_t offset, size_t length,
+                                                                               BlockNumber blkno,
+                                                                               bool verify_checksum,
+                                                                               int *checksum_failures);
 static bool verify_page_checksum(Page page, XLogRecPtr start_lsn,
                                                                 BlockNumber blkno,
                                                                 uint16 *expected_checksum);
@@ -1490,9 +1496,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
        BlockNumber blkno = 0;
        int                     checksum_failures = 0;
        off_t           cnt;
-       int                     i;
-       pgoff_t         len = 0;
-       char       *page;
+       pgoff_t         bytes_done = 0;
        int                     segmentno = 0;
        char       *segmentpath;
        bool            verify_checksum = false;
@@ -1514,6 +1518,12 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
 
        _tarWriteHeader(sink, tarfilename, NULL, statbuf, false);
 
+       /*
+        * Checksums are verified in multiples of BLCKSZ, so the buffer length
+        * should be a multiple of the block size as well.
+        */
+       Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
+
        if (!noverify_checksums && DataChecksumsEnabled())
        {
                char       *filename;
@@ -1551,23 +1561,21 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
         * for a base backup we can ignore such extended data. It will be restored
         * from WAL.
         */
-       while (len < statbuf->st_size)
+       while (bytes_done < statbuf->st_size)
        {
-               size_t          remaining = statbuf->st_size - len;
+               size_t          remaining = statbuf->st_size - bytes_done;
 
                /* Try to read some more data. */
-               cnt = basebackup_read_file(fd, sink->bbs_buffer,
-                                                                  Min(sink->bbs_buffer_length, remaining),
-                                                                  len, readfilename, true);
+               cnt = read_file_data_into_buffer(sink, readfilename, fd, bytes_done,
+                                                                                remaining,
+                                                                                blkno + segmentno * RELSEG_SIZE,
+                                                                                verify_checksum,
+                                                                                &checksum_failures);
 
                /*
-                * The checksums are verified at block level, so we iterate over the
-                * buffer in chunks of BLCKSZ, after making sure that
-                * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
-                * BLCKSZ bytes.
+                * If the amount of data we were able to read was not a multiple of
+                * BLCKSZ, we cannot verify checksums, which are block-level.
                 */
-               Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
-
                if (verify_checksum && (cnt % BLCKSZ != 0))
                {
                        ereport(WARNING,
@@ -1578,84 +1586,6 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
                        verify_checksum = false;
                }
 
-               if (verify_checksum)
-               {
-                       for (i = 0; i < cnt / BLCKSZ; i++)
-                       {
-                               int                     reread_cnt;
-                               uint16          expected_checksum;
-
-                               page = sink->bbs_buffer + BLCKSZ * i;
-
-                               /* If the page is OK, go on to the next one. */
-                               if (verify_page_checksum(page, sink->bbs_state->startptr,
-                                                                                blkno + i + segmentno * RELSEG_SIZE,
-                                                                                &expected_checksum))
-                                       continue;
-
-                               /*
-                                * Retry the block on the first failure.  It's possible that
-                                * we read the first 4K page of the block just before postgres
-                                * updated the entire block so it ends up looking torn to us.
-                                * If, before we retry the read, the concurrent write of the
-                                * block finishes, the page LSN will be updated and we'll
-                                * realize that we should ignore this block.
-                                *
-                                * There's no guarantee that this will actually happen,
-                                * though: the torn write could take an arbitrarily long time
-                                * to complete. Retrying multiple times wouldn't fix this
-                                * problem, either, though it would reduce the chances of it
-                                * happening in practice. The only real fix here seems to be
-                                * to have some kind of interlock that allows us to wait until
-                                * we can be certain that no write to the block is in
-                                * progress. Since we don't have any such thing right now, we
-                                * just do this and hope for the best.
-                                */
-                               reread_cnt =
-                                       basebackup_read_file(fd,
-                                                                                sink->bbs_buffer + BLCKSZ * i,
-                                                                                BLCKSZ, len + BLCKSZ * i,
-                                                                                readfilename,
-                                                                                false);
-                               if (reread_cnt == 0)
-                               {
-                                       /*
-                                        * If we hit end-of-file, a concurrent truncation must
-                                        * have occurred, so break out of this loop just as if the
-                                        * initial fread() returned 0. We'll drop through to the
-                                        * same code that handles that case. (We must fix up cnt
-                                        * first, though.)
-                                        */
-                                       cnt = BLCKSZ * i;
-                                       break;
-                               }
-
-                               /* If the page now looks OK, go on to the next one. */
-                               if (verify_page_checksum(page, sink->bbs_state->startptr,
-                                                                                blkno + i + segmentno * RELSEG_SIZE,
-                                                                                &expected_checksum))
-                                       continue;
-
-                               /* Handle checksum failure. */
-                               checksum_failures++;
-                               if (checksum_failures <= 5)
-                                       ereport(WARNING,
-                                                       (errmsg("checksum verification failed in "
-                                                                       "file \"%s\", block %u: calculated "
-                                                                       "%X but expected %X",
-                                                                       readfilename, blkno + i, expected_checksum,
-                                                                       ((PageHeader) page)->pd_checksum)));
-                               if (checksum_failures == 5)
-                                       ereport(WARNING,
-                                                       (errmsg("further checksum verification "
-                                                                       "failures in file \"%s\" will not "
-                                                                       "be reported", readfilename)));
-                       }
-
-                       /* Update block number for next pass through the outer loop. */
-                       blkno += i;
-               }
-
                /*
                 * If we hit end-of-file, a concurrent truncation must have occurred.
                 * That's not an error condition, because WAL replay will fix things
@@ -1664,6 +1594,10 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
                if (cnt == 0)
                        break;
 
+               /* Update block number and # of bytes done for next loop iteration. */
+               blkno += cnt / BLCKSZ;
+               bytes_done += cnt;
+
                /* Archive the data we just read. */
                bbsink_archive_contents(sink, cnt);
 
@@ -1671,14 +1605,12 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
                if (pg_checksum_update(&checksum_ctx,
                                                           (uint8 *) sink->bbs_buffer, cnt) < 0)
                        elog(ERROR, "could not update checksum of base backup");
-
-               len += cnt;
        }
 
        /* If the file was truncated while we were sending it, pad it with zeros */
-       while (len < statbuf->st_size)
+       while (bytes_done < statbuf->st_size)
        {
-               size_t          remaining = statbuf->st_size - len;
+               size_t          remaining = statbuf->st_size - bytes_done;
                size_t          nbytes = Min(sink->bbs_buffer_length, remaining);
 
                MemSet(sink->bbs_buffer, 0, nbytes);
@@ -1687,7 +1619,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
                                                           nbytes) < 0)
                        elog(ERROR, "could not update checksum of base backup");
                bbsink_archive_contents(sink, nbytes);
-               len += nbytes;
+               bytes_done += nbytes;
        }
 
        /*
@@ -1695,7 +1627,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
         * of data is probably not worth throttling, and is not checksummed
         * because it's not actually part of the file.)
         */
-       _tarWritePadding(sink, len);
+       _tarWritePadding(sink, bytes_done);
 
        CloseTransientFile(fd);
 
@@ -1718,6 +1650,109 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
        return true;
 }
 
+/*
+ * Read some more data from the file into the bbsink's buffer, verifying
+ * checksums as required.
+ *
+ * 'offset' is the file offset from which we should begin to read, and
+ * 'length' is the amount of data that should be read. The actual amount
+ * of data read will be less than the requested amount if the bbsink's
+ * buffer isn't big enough to hold it all, or if the underlying file has
+ * been truncated. The return value is the number of bytes actually read.
+ *
+ * 'blkno' is the block number of the first page in the bbsink's buffer
+ * relative to the start of the relation.
+ *
+ * 'verify_checksum' indicates whether we should try to verify checksums
+ * for the blocks we read. If we do this, we'll update *checksum_failures
+ * and issue warnings as appropriate.
+ */
+static off_t
+read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd,
+                                                  off_t offset, size_t length, BlockNumber blkno,
+                                                  bool verify_checksum, int *checksum_failures)
+{
+       off_t           cnt;
+       int                     i;
+       char       *page;
+
+       /* Try to read some more data. */
+       cnt = basebackup_read_file(fd, sink->bbs_buffer,
+                                                          Min(sink->bbs_buffer_length, length),
+                                                          offset, readfilename, true);
+
+       /* Can't verify checksums if read length is not a multiple of BLCKSZ. */
+       if (!verify_checksum || (cnt % BLCKSZ) != 0)
+               return cnt;
+
+       /* Verify checksum for each block. */
+       for (i = 0; i < cnt / BLCKSZ; i++)
+       {
+               int                     reread_cnt;
+               uint16          expected_checksum;
+
+               page = sink->bbs_buffer + BLCKSZ * i;
+
+               /* If the page is OK, go on to the next one. */
+               if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
+                                                                &expected_checksum))
+                       continue;
+
+               /*
+                * Retry the block on the first failure.  It's possible that we read
+                * the first 4K page of the block just before postgres updated the
+                * entire block so it ends up looking torn to us. If, before we retry
+                * the read, the concurrent write of the block finishes, the page LSN
+                * will be updated and we'll realize that we should ignore this block.
+                *
+                * There's no guarantee that this will actually happen, though: the
+                * torn write could take an arbitrarily long time to complete.
+                * Retrying multiple times wouldn't fix this problem, either, though
+                * it would reduce the chances of it happening in practice. The only
+                * real fix here seems to be to have some kind of interlock that
+                * allows us to wait until we can be certain that no write to the
+                * block is in progress. Since we don't have any such thing right now,
+                * we just do this and hope for the best.
+                */
+               reread_cnt =
+                       basebackup_read_file(fd, sink->bbs_buffer + BLCKSZ * i,
+                                                                BLCKSZ, offset + BLCKSZ * i,
+                                                                readfilename, false);
+               if (reread_cnt == 0)
+               {
+                       /*
+                        * If we hit end-of-file, a concurrent truncation must have
+                        * occurred, so reduce cnt to reflect only the blocks already
+                        * processed and break out of this loop.
+                        */
+                       cnt = BLCKSZ * i;
+                       break;
+               }
+
+               /* If the page now looks OK, go on to the next one. */
+               if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
+                                                                &expected_checksum))
+                       continue;
+
+               /* Handle checksum failure. */
+               (*checksum_failures)++;
+               if (*checksum_failures <= 5)
+                       ereport(WARNING,
+                                       (errmsg("checksum verification failed in "
+                                                       "file \"%s\", block %u: calculated "
+                                                       "%X but expected %X",
+                                                       readfilename, blkno + i, expected_checksum,
+                                                       ((PageHeader) page)->pd_checksum)));
+               if (*checksum_failures == 5)
+                       ereport(WARNING,
+                                       (errmsg("further checksum verification "
+                                                       "failures in file \"%s\" will not "
+                                                       "be reported", readfilename)));
+       }
+
+       return cnt;
+}
+
 /*
  * Try to verify the checksum for the provided page, if it seems appropriate
  * to do so.