Add smgrzeroextend(), FileZero(), FileFallocate()

author Andres Freund <andres@anarazel.de>

Wed, 5 Apr 2023 17:06:39 +0000 (10:06 -0700)

committer Andres Freund <andres@anarazel.de>

Wed, 5 Apr 2023 17:06:39 +0000 (10:06 -0700)
author Andres Freund <andres@anarazel.de>
Wed, 5 Apr 2023 17:06:39 +0000 (10:06 -0700)
committer Andres Freund <andres@anarazel.de>
Wed, 5 Apr 2023 17:06:39 +0000 (10:06 -0700)
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c

index 2ac365e97ccb10827d5b5467e061fb96e57ca377..a280a1e7be37cd80d93f823cda6f39e2ec9c4f9f 100644 (file)
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2206,6 +2206,94 @@ FileSync(File file, uint32 wait_event_info)
     return returnCode;
  }
  
+/*
+ * Zero a region of the file.
+ *
+ * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
+ * appropriate error.
+ */
+int
+FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+   int         returnCode;
+   ssize_t     written;
+
+   Assert(FileIsValid(file));
+
+   DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+              file, VfdCache[file].fileName,
+              (int64) offset, (int64) amount));
+
+   returnCode = FileAccess(file);
+   if (returnCode < 0)
+       return returnCode;
+
+   pgstat_report_wait_start(wait_event_info);
+   written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
+   pgstat_report_wait_end();
+
+   if (written < 0)
+       return -1;
+   else if (written != amount)
+   {
+       /* if errno is unset, assume problem is no disk space */
+       if (errno == 0)
+           errno = ENOSPC;
+       return -1;
+   }
+
+   return 0;
+}
+
+/*
+ * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
+ * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
+ * use FileZero() instead.
+ *
+ * Note that at least glibc implements posix_fallocate() in userspace if not
+ * implemented by the filesystem. That's not the case for all environments
+ * though.
+ *
+ * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
+ * appropriate error.
+ */
+int
+FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+#ifdef HAVE_POSIX_FALLOCATE
+   int         returnCode;
+
+   Assert(FileIsValid(file));
+
+   DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+              file, VfdCache[file].fileName,
+              (int64) offset, (int64) amount));
+
+   returnCode = FileAccess(file);
+   if (returnCode < 0)
+       return -1;
+
+   pgstat_report_wait_start(wait_event_info);
+   returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
+   pgstat_report_wait_end();
+
+   if (returnCode == 0)
+       return 0;
+
+   /* for compatibility with %m printing etc */
+   errno = returnCode;
+
+   /*
+    * Return in cases of a "real" failure; if fallocate is not supported,
+    * fall through to the FileZero()-backed implementation.
+    */
+   if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
+       return -1;
+#endif
+
+   return FileZero(file, offset, amount, wait_event_info);
+}
+
  off_t
  FileSize(File file)
  {
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c

index 352958e1feb6359ae6b63ef4528e503781ec7a7e..1c2d1405f8641b718eb127fd8016e3d377da9318 100644 (file)
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -500,6 +500,114 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
     Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
  }
  
+/*
+ * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
+ *
+ *     Similar to mdextend(), except the relation can be extended by multiple
+ *     blocks at once and the added blocks will be filled with zeroes.
+ */
+void
+mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+            BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+   MdfdVec    *v;
+   BlockNumber curblocknum = blocknum;
+   int         remblocks = nblocks;
+
+   Assert(nblocks > 0);
+
+   /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+   Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+   /*
+    * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+    * more --- we mustn't create a block whose number actually is
+    * InvalidBlockNumber or larger.
+    */
+   if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
+       ereport(ERROR,
+               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                errmsg("cannot extend file \"%s\" beyond %u blocks",
+                       relpath(reln->smgr_rlocator, forknum),
+                       InvalidBlockNumber)));
+
+   while (remblocks > 0)
+   {
+       BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
+       off_t       seekpos = (off_t) BLCKSZ * segstartblock;
+       int         numblocks;
+
+       if (segstartblock + remblocks > RELSEG_SIZE)
+           numblocks = RELSEG_SIZE - segstartblock;
+       else
+           numblocks = remblocks;
+
+       v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
+
+       Assert(segstartblock < RELSEG_SIZE);
+       Assert(segstartblock + numblocks <= RELSEG_SIZE);
+
+       /*
+        * If available and useful, use posix_fallocate() (via FileFallocate())
+        * to extend the relation. That's often more efficient than using
+        * write(), as it commonly won't cause the kernel to allocate page
+        * cache space for the extended pages.
+        *
+        * However, we don't use FileFallocate() for small extensions, as it
+        * defeats delayed allocation on some filesystems. Not clear where
+        * that decision should be made though? For now just use a cutoff of
+        * 8, anything between 4 and 8 worked OK in some local testing.
+        */
+       if (numblocks > 8)
+       {
+           int         ret;
+
+           ret = FileFallocate(v->mdfd_vfd,
+                               seekpos, (off_t) BLCKSZ * numblocks,
+                               WAIT_EVENT_DATA_FILE_EXTEND);
+           if (ret != 0)
+           {
+               ereport(ERROR,
+                       errcode_for_file_access(),
+                       errmsg("could not extend file \"%s\" with FileFallocate(): %m",
+                              FilePathName(v->mdfd_vfd)),
+                       errhint("Check free disk space."));
+           }
+       }
+       else
+       {
+           int         ret;
+
+           /*
+            * Even if we don't want to use fallocate, we can still extend a
+            * bit more efficiently than writing each 8kB block individually.
+            * pg_pwrite_zeros() (via FileZero()) uses
+            * pg_pwritev_with_retry() to avoid multiple writes or needing a
+            * zeroed buffer for the whole length of the extension.
+            */
+           ret = FileZero(v->mdfd_vfd,
+                          seekpos, (off_t) BLCKSZ * numblocks,
+                          WAIT_EVENT_DATA_FILE_EXTEND);
+           if (ret < 0)
+               ereport(ERROR,
+                       errcode_for_file_access(),
+                       errmsg("could not extend file \"%s\": %m",
+                              FilePathName(v->mdfd_vfd)),
+                       errhint("Check free disk space."));
+       }
+
+       if (!skipFsync && !SmgrIsTemp(reln))
+           register_dirty_segment(reln, forknum, v);
+
+       Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+
+       remblocks -= numblocks;
+       curblocknum += numblocks;
+   }
+}
+
  /*
   * mdopenfork() -- Open one fork of the specified relation.
   *
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c

index dc466e541454bb94d28e4939c8c2aa23795db532..c37c246b77fd83e906f669f08422cb97b2bd87f5 100644 (file)
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -50,6 +50,8 @@ typedef struct f_smgr
                                 bool isRedo);
     void        (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
                                 BlockNumber blocknum, const void *buffer, bool skipFsync);
+   void        (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
+                                   BlockNumber blocknum, int nblocks, bool skipFsync);
     bool        (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
                                   BlockNumber blocknum);
     void        (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
@@ -75,6 +77,7 @@ static const f_smgr smgrsw[] = {
         .smgr_exists = mdexists,
         .smgr_unlink = mdunlink,
         .smgr_extend = mdextend,
+       .smgr_zeroextend = mdzeroextend,
         .smgr_prefetch = mdprefetch,
         .smgr_read = mdread,
         .smgr_write = mdwrite,
@@ -507,6 +510,31 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
         reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
  }
  
+/*
+ * smgrzeroextend() -- Add new zeroed out blocks to a file.
+ *
+ *     Similar to smgrextend(), except the relation can be extended by
+ *     multiple blocks at once and the added blocks will be filled with
+ *     zeroes.
+ */
+void
+smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+              int nblocks, bool skipFsync)
+{
+   smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
+                                            nblocks, skipFsync);
+
+   /*
+    * Normally we expect this to increase the fork size by nblocks, but if
+    * the cached value isn't as expected, just invalidate it so the next call
+    * asks the kernel.
+    */
+   if (reln->smgr_cached_nblocks[forknum] == blocknum)
+       reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
+   else
+       reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+}
+
  /*
   * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
   *
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h

index f85de97d0839378e149f1df7bd22ab4992ca154a..daceafd473287ad6fa343f0ec75b3f5c31b4ff51 100644 (file)
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -106,6 +106,9 @@ extern int  FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event
  extern int FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info);
  extern int FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info);
  extern int FileSync(File file, uint32 wait_event_info);
+extern int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info);
+extern int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info);
+
  extern off_t FileSize(File file);
  extern int FileTruncate(File file, off_t offset, uint32 wait_event_info);
  extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info);
diff --git a/src/include/storage/md.h b/src/include/storage/md.h

index 8f32af9ef3d60d52e1d8ac31b63c4b71a831bc37..941879ee6a8fda631cea27f07f8a40601fce501f 100644 (file)
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -28,6 +28,8 @@ extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
  extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
  extern void mdextend(SMgrRelation reln, ForkNumber forknum,
                      BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+                        BlockNumber blocknum, int nblocks, bool skipFsync);
  extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
                        BlockNumber blocknum);
  extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h

index 0935144f4259736174e49ea728b7722761145fce..a9a179aabacc488b7bd5d1e9de88c05df3891c3a 100644 (file)
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -92,6 +92,8 @@ extern void smgrdosyncall(SMgrRelation *rels, int nrels);
  extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
  extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
                        BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
+                          BlockNumber blocknum, int nblocks, bool skipFsync);
  extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
                          BlockNumber blocknum);
  extern void smgrread(SMgrRelation reln, ForkNumber forknum,
author	Andres Freund <andres@anarazel.de>
	Wed, 5 Apr 2023 17:06:39 +0000 (10:06 -0700)
committer	Andres Freund <andres@anarazel.de>
	Wed, 5 Apr 2023 17:06:39 +0000 (10:06 -0700)
src/backend/storage/file/fd.c		patch \| blob \| blame \| history
src/backend/storage/smgr/md.c		patch \| blob \| blame \| history
src/backend/storage/smgr/smgr.c		patch \| blob \| blame \| history
src/include/storage/fd.h		patch \| blob \| blame \| history
src/include/storage/md.h		patch \| blob \| blame \| history
src/include/storage/smgr.h		patch \| blob \| blame \| history