Preallocate some DSM space at startup.
author Thomas Munro <tmunro@postgresql.org>
Fri, 31 Jul 2020 05:27:09 +0000 (17:27 +1200)
committer Thomas Munro <tmunro@postgresql.org>
Fri, 31 Jul 2020 05:49:58 +0000 (17:49 +1200)
Create an optional region in the main shared memory segment that can be
used to acquire and release "fast" DSM segments, and can benefit from
huge pages allocated at cluster startup time, if configured.  Fall back
to the existing mechanisms when that space is full.  The size is
controlled by a new GUC min_dynamic_shared_memory, defaulting to 0.

Main region DSM segments initially contain whatever garbage the memory
held last time they were used, rather than zeroes.  That change revealed
that DSA areas failed to initialize themselves correctly in memory that
wasn't zeroed first, so fix that problem.

Discussion: https://postgr.es/m/CA%2BhUKGLAE2QBv-WgGp%2BD9P_J-%3Dyne3zof9nfMaqq1h3EGHFXYQ%40mail.gmail.com
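Illustration (not part of the commit): callers keep using the existing DSM API unchanged; whether a segment is carved out of the preallocated region or created through dynamic_shared_memory_type is invisible to them.  A minimal sketch of backend code, error handling omitted:

    #include "postgres.h"
    #include "storage/dsm.h"

    /* Hypothetical backend helper, shown only to illustrate the API. */
    static void
    scratch_space_demo(void)
    {
        dsm_segment *seg;

        /*
         * Served from the preallocated main region when it has room,
         * otherwise falls back to dynamic_shared_memory_type as before.
         */
        seg = dsm_create(65536, 0);

        /* Main-region segments are not zero-initialized, so clear if needed. */
        memset(dsm_segment_address(seg), 0, 65536);

        /* A worker would call dsm_attach(dsm_segment_handle(seg)). */

        /*
         * Detaching the last reference returns the pages to the region's
         * free page manager (or destroys the impl segment).
         */
        dsm_detach(seg);
    }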

doc/src/sgml/config.sgml
src/backend/storage/ipc/dsm.c
src/backend/storage/ipc/dsm_impl.c
src/backend/storage/ipc/ipci.c
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/backend/utils/mmgr/dsa.c
src/include/storage/dsm.h
src/include/storage/dsm_impl.h

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 427947cf4962efbfc3452a0759203f99ab408037..994155ca00e226fe23f7f48add8690bd2e93014e 100644
@@ -1906,6 +1906,30 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-min-dynamic-shared-memory" xreflabel="min_dynamic_shared_memory">
+      <term><varname>min_dynamic_shared_memory</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>min_dynamic_shared_memory</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of memory that should be allocated at server
+        startup time for use by parallel queries.  When this memory region is
+        insufficient or exhausted by concurrent queries, new parallel queries
+        try to allocate extra shared memory temporarily from the operating
+        system using the method configured with
+        <varname>dynamic_shared_memory_type</varname>, which may be slower due
+        to memory management overheads.  Memory that is allocated at startup
+        time with <varname>min_dynamic_shared_memory</varname> is affected by
+        the <varname>huge_pages</varname> setting on operating systems where
+        that is supported, and may be more likely to benefit from larger pages
+        on operating systems where that is managed automatically.
+        The default value is <literal>0</literal> (none).
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
      </sect2>
 
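Illustration (values are arbitrary examples, not recommendations): enabling the new parameter in postgresql.conf.  It is PGC_POSTMASTER, so a restart is required, and the reserved region then participates in the huge_pages setting:

    min_dynamic_shared_memory = 1GB    # reserved inside the main shmem segment at startup
    huge_pages = try                   # the reserved region can then use huge pages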
diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c
index ef64d083570a4449c171b44ddebf2b9edc9c1e97..dffbd8e82a2a21a4f008888a23da4bd1eff2b7ed 100644
 
 #include "lib/ilist.h"
 #include "miscadmin.h"
+#include "port/pg_bitutils.h"
 #include "storage/dsm.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
+#include "utils/freepage.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/resowner_private.h"
@@ -76,6 +78,8 @@ typedef struct dsm_control_item
 {
    dsm_handle  handle;
    uint32      refcnt;         /* 2+ = active, 1 = moribund, 0 = gone */
+   size_t      first_page;
+   size_t      npages;
    void       *impl_private_pm_handle; /* only needed on Windows */
    bool        pinned;
 } dsm_control_item;
@@ -95,10 +99,15 @@ static dsm_segment *dsm_create_descriptor(void);
 static bool dsm_control_segment_sane(dsm_control_header *control,
                                     Size mapped_size);
 static uint64 dsm_control_bytes_needed(uint32 nitems);
+static inline dsm_handle make_main_region_dsm_handle(int slot);
+static inline bool is_main_region_dsm_handle(dsm_handle handle);
 
 /* Has this backend initialized the dynamic shared memory system yet? */
 static bool dsm_init_done = false;
 
+/* Preallocated DSM space in the main shared memory region. */
+static void *dsm_main_space_begin = NULL;
+
 /*
  * List of dynamic shared memory segments used by this backend.
  *
@@ -171,7 +180,7 @@ dsm_postmaster_startup(PGShmemHeader *shim)
    {
        Assert(dsm_control_address == NULL);
        Assert(dsm_control_mapped_size == 0);
-       dsm_control_handle = random();
+       dsm_control_handle = random() << 1; /* Even numbers only */
        if (dsm_control_handle == DSM_HANDLE_INVALID)
            continue;
        if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
@@ -247,8 +256,12 @@ dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
        if (refcnt == 0)
            continue;
 
-       /* Log debugging information. */
+       /* If it was using the main shmem area, there is nothing to do. */
        handle = old_control->item[i].handle;
+       if (is_main_region_dsm_handle(handle))
+           continue;
+
+       /* Log debugging information. */
        elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
             handle, refcnt);
 
@@ -348,8 +361,11 @@ dsm_postmaster_shutdown(int code, Datum arg)
        if (dsm_control->item[i].refcnt == 0)
            continue;
 
-       /* Log debugging information. */
        handle = dsm_control->item[i].handle;
+       if (is_main_region_dsm_handle(handle))
+           continue;
+
+       /* Log debugging information. */
        elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
             handle);
 
@@ -418,6 +434,45 @@ dsm_set_control_handle(dsm_handle h)
 }
 #endif
 
+/*
+ * Reserve some space in the main shared memory segment for DSM segments.
+ */
+size_t
+dsm_estimate_size(void)
+{
+   return 1024 * 1024 * (size_t) min_dynamic_shared_memory;
+}
+
+/*
+ * Initialize space in the main shared memory segment for DSM segments.
+ */
+void
+dsm_shmem_init(void)
+{
+   size_t      size = dsm_estimate_size();
+   bool        found;
+
+   if (size == 0)
+       return;
+
+   dsm_main_space_begin = ShmemInitStruct("Preallocated DSM", size, &found);
+   if (!found)
+   {
+       FreePageManager *fpm = (FreePageManager *) dsm_main_space_begin;
+       size_t      first_page = 0;
+       size_t      pages;
+
+       /* Reserve space for the FreePageManager. */
+       while (first_page * FPM_PAGE_SIZE < sizeof(FreePageManager))
+           ++first_page;
+
+       /* Initialize it and give it all the rest of the space. */
+       FreePageManagerInitialize(fpm, dsm_main_space_begin);
+       pages = (size / FPM_PAGE_SIZE) - first_page;
+       FreePageManagerPut(fpm, first_page, pages);
+   }
+}
+
 /*
  * Create a new dynamic shared memory segment.
  *
@@ -434,6 +489,10 @@ dsm_create(Size size, int flags)
    dsm_segment *seg;
    uint32      i;
    uint32      nitems;
+   size_t      npages = 0;
+   size_t      first_page = 0;
+   FreePageManager *dsm_main_space_fpm = dsm_main_space_begin;
+   bool        using_main_dsm_region = false;
 
    /* Unsafe in postmaster (and pointless in a stand-alone backend). */
    Assert(IsUnderPostmaster);
@@ -444,20 +503,48 @@ dsm_create(Size size, int flags)
    /* Create a new segment descriptor. */
    seg = dsm_create_descriptor();
 
-   /* Loop until we find an unused segment identifier. */
-   for (;;)
+   /*
+    * Lock the control segment while we try to allocate from the main shared
+    * memory area, if configured.
+    */
+   if (dsm_main_space_fpm)
    {
-       Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
-       seg->handle = random();
-       if (seg->handle == DSM_HANDLE_INVALID)  /* Reserve sentinel */
-           continue;
-       if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
-                       &seg->mapped_address, &seg->mapped_size, ERROR))
-           break;
+       npages = size / FPM_PAGE_SIZE;
+       if (size % FPM_PAGE_SIZE > 0)
+           ++npages;
+
+       LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+       if (FreePageManagerGet(dsm_main_space_fpm, npages, &first_page))
+       {
+           /* We can carve out a piece of the main shared memory segment. */
+           seg->mapped_address = (char *) dsm_main_space_begin +
+               first_page * FPM_PAGE_SIZE;
+           seg->mapped_size = npages * FPM_PAGE_SIZE;
+           using_main_dsm_region = true;
+           /* We'll choose a handle below. */
+       }
    }
 
-   /* Lock the control segment so we can register the new segment. */
-   LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+   if (!using_main_dsm_region)
+   {
+       /*
+        * We need to create a new memory segment.  Loop until we find an
+        * unused segment identifier.
+        */
+       if (dsm_main_space_fpm)
+           LWLockRelease(DynamicSharedMemoryControlLock);
+       for (;;)
+       {
+           Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
+           seg->handle = random() << 1;    /* Even numbers only */
+           if (seg->handle == DSM_HANDLE_INVALID)  /* Reserve sentinel */
+               continue;
+           if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
+                           &seg->mapped_address, &seg->mapped_size, ERROR))
+               break;
+       }
+       LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+   }
 
    /* Search the control segment for an unused slot. */
    nitems = dsm_control->nitems;
@@ -465,6 +552,14 @@ dsm_create(Size size, int flags)
    {
        if (dsm_control->item[i].refcnt == 0)
        {
+           if (using_main_dsm_region)
+           {
+               seg->handle = make_main_region_dsm_handle(i);
+               dsm_control->item[i].first_page = first_page;
+               dsm_control->item[i].npages = npages;
+           }
+           else
+               Assert(!is_main_region_dsm_handle(seg->handle));
            dsm_control->item[i].handle = seg->handle;
            /* refcnt of 1 triggers destruction, so start at 2 */
            dsm_control->item[i].refcnt = 2;
@@ -479,9 +574,12 @@ dsm_create(Size size, int flags)
    /* Verify that we can support an additional mapping. */
    if (nitems >= dsm_control->maxitems)
    {
+       if (using_main_dsm_region)
+           FreePageManagerPut(dsm_main_space_fpm, first_page, npages);
        LWLockRelease(DynamicSharedMemoryControlLock);
-       dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
-                   &seg->mapped_address, &seg->mapped_size, WARNING);
+       if (!using_main_dsm_region)
+           dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+                       &seg->mapped_address, &seg->mapped_size, WARNING);
        if (seg->resowner != NULL)
            ResourceOwnerForgetDSM(seg->resowner, seg);
        dlist_delete(&seg->node);
@@ -495,6 +593,12 @@ dsm_create(Size size, int flags)
    }
 
    /* Enter the handle into a new array slot. */
+   if (using_main_dsm_region)
+   {
+       seg->handle = make_main_region_dsm_handle(nitems);
+       dsm_control->item[i].first_page = first_page;
+       dsm_control->item[i].npages = npages;
+   }
    dsm_control->item[nitems].handle = seg->handle;
    /* refcnt of 1 triggers destruction, so start at 2 */
    dsm_control->item[nitems].refcnt = 2;
@@ -580,6 +684,12 @@ dsm_attach(dsm_handle h)
        /* Otherwise we've found a match. */
        dsm_control->item[i].refcnt++;
        seg->control_slot = i;
+       if (is_main_region_dsm_handle(seg->handle))
+       {
+           seg->mapped_address = (char *) dsm_main_space_begin +
+               dsm_control->item[i].first_page * FPM_PAGE_SIZE;
+           seg->mapped_size = dsm_control->item[i].npages * FPM_PAGE_SIZE;
+       }
        break;
    }
    LWLockRelease(DynamicSharedMemoryControlLock);
@@ -597,8 +707,9 @@ dsm_attach(dsm_handle h)
    }
 
    /* Here's where we actually try to map the segment. */
-   dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
-               &seg->mapped_address, &seg->mapped_size, ERROR);
+   if (!is_main_region_dsm_handle(seg->handle))
+       dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
+                   &seg->mapped_address, &seg->mapped_size, ERROR);
 
    return seg;
 }
@@ -688,8 +799,9 @@ dsm_detach(dsm_segment *seg)
     */
    if (seg->mapped_address != NULL)
    {
-       dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
-                   &seg->mapped_address, &seg->mapped_size, WARNING);
+       if (!is_main_region_dsm_handle(seg->handle))
+           dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
+                       &seg->mapped_address, &seg->mapped_size, WARNING);
        seg->impl_private = NULL;
        seg->mapped_address = NULL;
        seg->mapped_size = 0;
@@ -729,10 +841,15 @@ dsm_detach(dsm_segment *seg)
             * other reason, the postmaster may not have any better luck than
             * we did.  There's not much we can do about that, though.
             */
-           if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+           if (is_main_region_dsm_handle(seg->handle) ||
+               dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
                            &seg->mapped_address, &seg->mapped_size, WARNING))
            {
                LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+               if (is_main_region_dsm_handle(seg->handle))
+                   FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+                                      dsm_control->item[control_slot].first_page,
+                                      dsm_control->item[control_slot].npages);
                Assert(dsm_control->item[control_slot].handle == seg->handle);
                Assert(dsm_control->item[control_slot].refcnt == 1);
                dsm_control->item[control_slot].refcnt = 0;
@@ -894,10 +1011,15 @@ dsm_unpin_segment(dsm_handle handle)
         * pass the mapped size, mapped address, and private data as NULL
         * here.
         */
-       if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+       if (is_main_region_dsm_handle(handle) ||
+           dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
                        &junk_mapped_address, &junk_mapped_size, WARNING))
        {
            LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+           if (is_main_region_dsm_handle(handle))
+               FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+                                  dsm_control->item[control_slot].first_page,
+                                  dsm_control->item[control_slot].npages);
            Assert(dsm_control->item[control_slot].handle == handle);
            Assert(dsm_control->item[control_slot].refcnt == 1);
            dsm_control->item[control_slot].refcnt = 0;
@@ -1094,3 +1216,28 @@ dsm_control_bytes_needed(uint32 nitems)
    return offsetof(dsm_control_header, item)
        + sizeof(dsm_control_item) * (uint64) nitems;
 }
+
+static inline dsm_handle
+make_main_region_dsm_handle(int slot)
+{
+   dsm_handle  handle;
+
+   /*
+    * We need to create a handle that doesn't collide with any existing extra
+    * segment created by dsm_impl_op(), so we'll make it odd.  It also
+    * mustn't collide with any other main area pseudo-segment, so we'll
+    * include the slot number in some of the bits.  We also want to make an
+    * effort to avoid newly created and recently destroyed handles from being
+    * confused, so we'll make the rest of the bits random.
+    */
+   handle = 1;
+   handle |= slot << 1;
+   handle |= random() << (pg_leftmost_one_pos32(dsm_control->maxitems) + 1);
+   return handle;
+}
+
+static inline bool
+is_main_region_dsm_handle(dsm_handle handle)
+{
+   return handle & 1;
+}
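Illustration (standalone C, arbitrary example numbers): the bit layout produced by make_main_region_dsm_handle() above — bit 0 marks a main-region pseudo-segment (so the handle is odd and cannot collide with the even handles used for dsm_impl_op() segments), the slot index occupies the next bits, and random() fills the bits above the slot range.  The small loop stands in for pg_leftmost_one_pos32():

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint32_t    maxitems = 64;      /* example control-segment capacity */
        uint32_t    slot = 5;           /* example slot index */
        uint32_t    rnd = 0x2c9f;       /* stand-in for random() bits */
        uint32_t    handle;
        int         pos = 0;

        for (uint32_t x = maxitems; x > 1; x >>= 1)
            pos++;                      /* == pg_leftmost_one_pos32(maxitems) */

        handle = 1;                     /* odd => main-region pseudo-segment */
        handle |= slot << 1;            /* slot number in the low bits */
        handle |= rnd << (pos + 1);     /* random bits above the slot range */

        printf("handle = %#x, main region = %s\n",
               (unsigned) handle, (handle & 1) ? "yes" : "no");
        return 0;
    }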
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 1972aecbedc1aab7a235558a81e1a16308d01da7..d4306418dcb2442617f3b198fd664fff6d89d0db 100644
@@ -113,6 +113,9 @@ const struct config_enum_entry dynamic_shared_memory_options[] = {
 /* Implementation selector. */
 int            dynamic_shared_memory_type;
 
+/* Amount of space reserved for DSM segments in the main area. */
+int            min_dynamic_shared_memory;
+
 /* Size of buffer to be used for zero-filling. */
 #define ZBUFFER_SIZE               8192
 
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index e850ebd131e3f81f2422aaf4bc7ee1deb060766d..96c2aaabbd65cec1d7c9fac5a6a16e2396692394 100644
@@ -120,6 +120,7 @@ CreateSharedMemoryAndSemaphores(void)
        size = add_size(size, SpinlockSemaSize());
        size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
                                                 sizeof(ShmemIndexEnt)));
+       size = add_size(size, dsm_estimate_size());
        size = add_size(size, BufferShmemSize());
        size = add_size(size, LockShmemSize());
        size = add_size(size, PredicateLockShmemSize());
@@ -209,6 +210,8 @@ CreateSharedMemoryAndSemaphores(void)
     */
    InitShmemIndex();
 
+   dsm_shmem_init();
+
    /*
     * Set up xlog, clog, and buffers
     */
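Worked example (made-up setting): with min_dynamic_shared_memory = 512, dsm_estimate_size() contributes 512 MB to the main shared memory request computed here, and dsm_shmem_init() later carves that region out of it:

    /* dsm_estimate_size() with min_dynamic_shared_memory = 512 (MB):
     *     1024 * 1024 * (size_t) 512 = 536870912 bytes (512 MB),
     * which add_size() folds, with overflow checking, into the total above. */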
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index c20885e97b2035d684b4f9794115058f7f521d97..6c6bb220149326d0b16caf6ce3a1a53bf60584ab 100644
@@ -2231,6 +2231,17 @@ static struct config_int ConfigureNamesInt[] =
        NULL, NULL, NULL
    },
 
+   {
+       {"min_dynamic_shared_memory", PGC_POSTMASTER, RESOURCES_MEM,
+           gettext_noop("Amount of dynamic shared memory reserved at startup."),
+           NULL,
+           GUC_UNIT_MB
+       },
+       &min_dynamic_shared_memory,
+       0, 0, Min(INT_MAX, SIZE_MAX / 1024 / 1024),
+       NULL, NULL, NULL
+   },
+
    /*
     * We sometimes multiply the number of shared buffers by two without
     * checking for overflow, so we mustn't allow more than INT_MAX / 2.
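A note on the new GUC's upper bound (my reading, not stated in the commit text): dsm_estimate_size() multiplies the setting by 1 MB into a size_t, so the maximum is capped to keep that product from overflowing on 32-bit builds:

    /* 32-bit: SIZE_MAX = 4294967295, SIZE_MAX / 1024 / 1024 = 4095,
     * so the limit is Min(INT_MAX, 4095) = 4095 MB, and
     * 4095 * 1024 * 1024 = 4293918720 still fits in size_t.
     * 64-bit: SIZE_MAX / 1024 / 1024 is enormous, so INT_MAX (in MB) governs. */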
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index aa30291ea3964a060f8a40617899aac4a0204d64..b0715ae1881803dda187ae804d07b6a6f6addbde 100644
                    #   windows
                    #   mmap
                    # (change requires restart)
+#min_dynamic_shared_memory = 0MB   # (change requires restart)
 
 # - Disk -
 
diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c
index b7ad8e62ef3f416b4690be3132e79e7e2f8435fc..6e5e412429789b3f4a05a802e6c6186e1d19371e 100644
@@ -1223,6 +1223,7 @@ create_internal(void *place, size_t size,
     * space.
     */
    control = (dsa_area_control *) place;
+   memset(place, 0, sizeof(*control));
    control->segment_header.magic =
        DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
    control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
@@ -1233,14 +1234,10 @@ create_internal(void *place, size_t size,
    control->handle = control_handle;
    control->max_total_segment_size = (size_t) -1;
    control->total_segment_size = size;
-   memset(&control->segment_handles[0], 0,
-          sizeof(dsm_handle) * DSA_MAX_SEGMENTS);
    control->segment_handles[0] = control_handle;
    for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
        control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
-   control->high_segment_index = 0;
    control->refcnt = 1;
-   control->freed_segment_counter = 0;
    control->lwlock_tranche_id = tranche_id;
 
    /*
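Illustration (hypothetical names, standalone C): the pattern behind this dsa.c change — a control structure placed in recycled, non-zeroed shared memory is cleared explicitly up front, after which only the fields with non-zero initial values need to be assigned:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    typedef struct demo_control
    {
        uint32_t    magic;
        size_t      total_size;
        int         refcnt;
        /* ... many more fields whose correct initial value is 0 ... */
    } demo_control;

    static void
    demo_control_init(void *place, size_t size)
    {
        demo_control *control = (demo_control *) place;

        /* The memory may hold whatever a previous mapping left behind. */
        memset(control, 0, sizeof(*control));

        control->magic = 0x0d5a0d5a;    /* hypothetical magic constant */
        control->total_size = size;
        control->refcnt = 1;
        /* fields that must start at 0 are already correct */
    }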
diff --git a/src/include/storage/dsm.h b/src/include/storage/dsm.h
index 408c0543a6354e22c97cfbc997d61da1a01cd471..0455576f4af4c858efe11b6fad2625978a574969 100644
@@ -29,6 +29,9 @@ extern void dsm_postmaster_startup(struct PGShmemHeader *);
 extern void dsm_backend_shutdown(void);
 extern void dsm_detach_all(void);
 
+extern size_t dsm_estimate_size(void);
+extern void dsm_shmem_init(void);
+
 #ifdef EXEC_BACKEND
 extern void dsm_set_control_handle(dsm_handle h);
 #endif
diff --git a/src/include/storage/dsm_impl.h b/src/include/storage/dsm_impl.h
index 562cb781a812c7465a53c33a0052d6953db54259..f6841e2534f96ed65e30f80e7f01e70e0fdfbe51 100644
@@ -40,6 +40,7 @@
 
 /* GUC. */
 extern int dynamic_shared_memory_type;
+extern int min_dynamic_shared_memory;
 
 /*
  * Directory for on-disk state.