Allow using copy_file_range in write_reconstructed_file
author: Tomas Vondra <tomas.vondra@postgresql.org>
Fri, 5 Apr 2024 17:01:16 +0000 (19:01 +0200)
committer: Tomas Vondra <tomas.vondra@postgresql.org>
Fri, 5 Apr 2024 17:19:36 +0000 (19:19 +0200)
This commit allows using copy_file_range() for efficient combining of
data from multiple files, instead of simply reading/writing the blocks.
Depending on the filesystem and other factors (size of the increment,
distribution of modified blocks etc.) this may be faster than the
block-by-block copy, but more importantly it enables various features
provided by CoW filesystems.

If a checksum needs to be calculated for the file, the same strategy as
when copying whole files is used - copy_file_range is used to copy the
blocks, but the file is also read for the checksum calculation.

While the checksum calculation is rarely needed when cloning whole
files, when reconstructing the files from multiple backups it needs to
happen almost always (the only exception is when the user specified
--no-manifest).

Author: Tomas Vondra
Reviewed-by: Thomas Munro, Jakub Wartak, Robert Haas
Discussion: https://postgr.es/m/3024283a-7491-4240-80d0-421575f6bb23%40enterprisedb.com

src/bin/pg_combinebackup/reconstruct.c

index b083c5ce15bbf53ee5ab4f2fe1b6f8e711c9bd37..b059be8d0a60150b666c2209cf3ba251e2afcad1 100644 (file)
@@ -58,9 +58,14 @@ static void write_reconstructed_file(char *input_filename,
                                     rfile **sourcemap,
                                     off_t *offsetmap,
                                     pg_checksum_context *checksum_ctx,
+                                    CopyMethod copy_method,
                                     bool debug,
                                     bool dry_run);
 static void read_bytes(rfile *rf, void *buffer, unsigned length);
+static void write_block(int wfd, char *output_filename,
+                       uint8 *buffer,
+                       pg_checksum_context *checksum_ctx);
+static void read_block(rfile *s, off_t off, uint8 *buffer);
 
 /*
  * Reconstruct a full file from an incremental file and a chain of prior
@@ -325,7 +330,8 @@ reconstruct_from_incremental_file(char *input_filename,
    {
        write_reconstructed_file(input_filename, output_filename,
                                 block_length, sourcemap, offsetmap,
-                                &checksum_ctx, debug, dry_run);
+                                &checksum_ctx, copy_method,
+                                debug, dry_run);
        debug_reconstruction(n_prior_backups + 1, source, dry_run);
    }
 
@@ -535,6 +541,7 @@ write_reconstructed_file(char *input_filename,
                         rfile **sourcemap,
                         off_t *offsetmap,
                         pg_checksum_context *checksum_ctx,
+                        CopyMethod copy_method,
                         bool debug,
                         bool dry_run)
 {
@@ -622,7 +629,6 @@ write_reconstructed_file(char *input_filename,
    {
        uint8       buffer[BLCKSZ];
        rfile      *s = sourcemap[i];
-       int         wb;
 
        /* Update accounting information. */
        if (s == NULL)
@@ -646,38 +652,61 @@ write_reconstructed_file(char *input_filename,
             * uninitialized block, so just zero-fill it.
             */
            memset(buffer, 0, BLCKSZ);
-       }
-       else
-       {
-           int         rb;
 
-           /* Read the block from the correct source, except if dry-run. */
-           rb = pg_pread(s->fd, buffer, BLCKSZ, offsetmap[i]);
-           if (rb != BLCKSZ)
-           {
-               if (rb < 0)
-                   pg_fatal("could not read file \"%s\": %m", s->filename);
-               else
-                   pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %llu",
-                            s->filename, rb, BLCKSZ,
-                            (unsigned long long) offsetmap[i]);
-           }
+           /* Write out the block, update the checksum if needed. */
+           write_block(wfd, output_filename, buffer, checksum_ctx);
+
+           /* Nothing else to do for zero-filled blocks. */
+           continue;
        }
 
-       /* Write out the block. */
-       if ((wb = write(wfd, buffer, BLCKSZ)) != BLCKSZ)
+       /* Copy the block using the appropriate copy method. */
+       if (copy_method != COPY_METHOD_COPY_FILE_RANGE)
        {
-           if (wb < 0)
-               pg_fatal("could not write file \"%s\": %m", output_filename);
-           else
-               pg_fatal("could not write file \"%s\": wrote only %d of %d bytes",
-                        output_filename, wb, BLCKSZ);
+           /*
+            * Read the block from the correct source file, and then write it
+            * out, possibly with a checksum update.
+            */
+           read_block(s, offsetmap[i], buffer);
+           write_block(wfd, output_filename, buffer, checksum_ctx);
        }
+       else                    /* use copy_file_range */
+       {
+           /* copy_file_range modifies the offset, so use a local copy */
+           off_t       off = offsetmap[i];
+           size_t      nwritten = 0;
+
+           /*
+            * Retry until we've written all the bytes (the offset is updated
+            * by copy_file_range, and so is the wfd file offset).
+            */
+           do
+           {
+               int         wb;
+
+               wb = copy_file_range(s->fd, &off, wfd, NULL, BLCKSZ - nwritten, 0);
+
+               if (wb < 0)
+                   pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
+                            input_filename, output_filename);
+
+               nwritten += wb;
+
+           } while (BLCKSZ > nwritten);
+
+           /*
+            * When checksum calculation not needed, we're done, otherwise
+            * read the block and pass it to the checksum calculation.
+            */
+           if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
+               continue;
+
+           read_block(s, offsetmap[i], buffer);
 
-       /* Update the checksum computation. */
-       if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0)
-           pg_fatal("could not update checksum of file \"%s\"",
-                    output_filename);
+           if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0)
+               pg_fatal("could not update checksum of file \"%s\"",
+                        output_filename);
+       }
    }
 
    /* Debugging output. */
@@ -693,3 +722,52 @@ write_reconstructed_file(char *input_filename,
    if (wfd >= 0 && close(wfd) != 0)
        pg_fatal("could not close \"%s\": %m", output_filename);
 }
+
+/*
+ * Write one block (BLCKSZ bytes from buffer) to the file descriptor fd
+ * with write(), then pass the same bytes to pg_checksum_update().
+ *
+ * A failed or short write, or a failed checksum update, is reported via
+ * pg_fatal(). output_filename is used only in those error messages.
+ */
+static void
+write_block(int fd, char *output_filename,
+           uint8 *buffer, pg_checksum_context *checksum_ctx)
+{
+   int         wb;
+
+   if ((wb = write(fd, buffer, BLCKSZ)) != BLCKSZ)
+   {
+       if (wb < 0)
+           pg_fatal("could not write file \"%s\": %m", output_filename);
+       else
+           pg_fatal("could not write file \"%s\": wrote only %d of %d bytes",
+                    output_filename, wb, BLCKSZ);
+   }
+
+   /* Fold the block just written into the running checksum. */
+   if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0)
+       pg_fatal("could not update checksum of file \"%s\"",
+                output_filename);
+}
+
+/*
+ * Read BLCKSZ bytes from s->fd at offset off into buffer; pg_fatal() on failure.
+ */
+static void
+read_block(rfile *s, off_t off, uint8 *buffer)
+{
+   int         rb;
+
+   /* Positioned read; anything other than a full block is fatal. */
+   rb = pg_pread(s->fd, buffer, BLCKSZ, off);
+   if (rb != BLCKSZ)
+   {
+       if (rb < 0)
+           pg_fatal("could not read file \"%s\": %m", s->filename);
+       else
+           pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %llu",
+                    s->filename, rb, BLCKSZ,
+                    (unsigned long long) off);
+   }
+}