Allow using huge TLB pages on Linux (MAP_HUGETLB)
author    Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 29 Jan 2014 11:44:45 +0000 (13:44 +0200)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 29 Jan 2014 12:08:30 +0000 (14:08 +0200)
This patch adds an option, huge_tlb_pages, which allows requesting the
shared memory segment to be allocated using huge pages, by using the
MAP_HUGETLB flag in mmap(). This can improve performance.

The default is 'try', which means that we will attempt using huge pages,
and fall back to non-huge pages if it doesn't work. Currently, only Linux
has MAP_HUGETLB. On other platforms, the default 'try' behaves the same as
'off'.
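
To illustrate the 'try' behaviour, here is a rough standalone sketch (not
the patch's code; it approximates PG_MMAP_FLAGS with MAP_SHARED |
MAP_ANONYMOUS and assumes a 2MB huge page size):

    /* Attempt a huge-page mapping first, then fall back to normal pages. */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    int
    main(void)
    {
        size_t  size = 4 * 1024 * 1024;     /* multiple of the huge page size */
        void   *ptr;

        ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (ptr == MAP_FAILED)              /* e.g. no huge pages reserved */
            ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        if (ptr == MAP_FAILED)
        {
            perror("mmap");
            return 1;
        }
        printf("mapped %zu bytes at %p\n", size, ptr);
        return 0;
    }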

In passing, don't try to round the mmap() size to a multiple of
pagesize. mmap() doesn't require that, and there's no particular reason for
PostgreSQL to do that either. When using MAP_HUGETLB, however, round the
request size up to the nearest 2MB boundary. This is to work around a bug in
some Linux kernel versions, but also to avoid wasting memory, because the
kernel will round the size up anyway.
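
As a rough worked example of the 2MB rounding (illustration only, not code
from the patch, though the hugepagesize/allocsize names match it):

    #include <stdio.h>

    int
    main(void)
    {
        size_t  hugepagesize = 2 * 1024 * 1024;
        size_t  allocsize = hugepagesize + 1;   /* a 2MB + 1 byte request */

        if (allocsize % hugepagesize != 0)
            allocsize += hugepagesize - (allocsize % hugepagesize);
        printf("%zu\n", allocsize);             /* 4194304 bytes, i.e. 4MB */
        return 0;
    }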

Many people were involved in writing this patch, including Christian Kruse,
Richard Poole, and Abhijit Menon-Sen. Reviewed by Peter Geoghegan, Andres
Freund, and me.

doc/src/sgml/config.sgml
src/backend/port/sysv_shmem.c
src/backend/port/win32_shmem.c
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/include/storage/pg_shmem.h

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 14ed6c7a53bab78481e1e7f9f0899f1dcfe98766..e7c255987da74c9e4b012d23dfd532411e0104fc 100644
@@ -1107,6 +1107,43 @@ include 'filename'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
+      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+      <indexterm>
+       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Enables/disables the use of huge TLB pages. Valid values are
+        <literal>try</literal> (the default), <literal>on</literal>,
+        and <literal>off</literal>.
+       </para>
+
+       <para>
+        At present, this feature is supported only on Linux. On other systems,
+        <literal>try</literal> behaves the same as <literal>off</literal>.
+       </para>
+
+       <para>
+        The use of huge TLB pages results in smaller page tables and
+        less CPU time spent on memory management, increasing performance. For
+        more details, see
+        <ulink url="https://wiki.debian.org/Hugepages">the Debian wiki</ulink>.
+        Remember that you will need at least shared_buffers / huge page size +
+        1 huge TLB pages. So for example, for a system with 6GB shared buffers
+        and a huge page size of 2MB, you will need at least 3156 huge pages.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
+        the server will try to use huge pages, but fall back to using
+        normal allocation if that fails. With <literal>on</literal>, failure
+        to use huge pages will prevent the server from starting up. With
+        <literal>off</literal>, huge pages will not be used.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)</term>
       <indexterm>
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 0d01617e2f5eccb3476d8aba2f7e38f6a4af65d6..f7596bf6e0b8d5a66df2ad01559c162b550fd6af 100644
@@ -32,6 +32,7 @@
 #include "portability/mem.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
+#include "utils/guc.h"
 
 
 typedef key_t IpcMemoryKey;            /* shared memory key passed to shmget(2) */
@@ -41,7 +42,7 @@ typedef int IpcMemoryId;              /* shared memory ID returned by shmget(2) */
 unsigned long UsedShmemSegID = 0;
 void      *UsedShmemSegAddr = NULL;
 static Size AnonymousShmemSize;
-static void *AnonymousShmem;
+static void *AnonymousShmem = NULL;
 
 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
@@ -317,6 +318,80 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
        return true;
 }
 
+/*
+ * Creates an anonymous mmap()ed shared memory segment.
+ *
+ * Pass the requested size in *size.  This function will modify *size to the
+ * actual size of the allocation, if it ends up allocating a segment that is
+ * larger than requested.
+ */
+#ifndef EXEC_BACKEND
+static void *
+CreateAnonymousSegment(Size *size)
+{
+       Size            allocsize;
+       void       *ptr = MAP_FAILED;
+
+#ifndef MAP_HUGETLB
+       if (huge_tlb_pages == HUGE_TLB_ON)
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("huge TLB pages not supported on this platform")));
+#else
+       if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+       {
+               /*
+                * Round up the request size to a suitable large value.
+                *
+                * Some Linux kernel versions are known to have a bug, which causes
+                * mmap() with MAP_HUGETLB to fail if the request size is not a
+                * multiple of any supported huge page size. To work around that, we
+                * round up the request size to the nearest 2MB. 2MB is the most
+                * common huge page size on affected systems.
+                *
+                * Aside from that bug, even with a kernel that does the allocation
+                * correctly, rounding it up ourselves avoids wasting memory. Without
+                * it, if we for example make an allocation of 2MB + 1 bytes, the
+                * kernel might decide to use two 2MB huge pages for that, and waste 2
+                * MB - 1 of memory. When we do the rounding ourselves, we can use
+                * that space for allocations.
+                */
+               int                     hugepagesize = 2 * 1024 * 1024;
+
+               allocsize = *size;
+               if (allocsize % hugepagesize != 0)
+                       allocsize += hugepagesize - (allocsize % hugepagesize);
+
+               ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
+                                  PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
+               if (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED)
+                       elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m");
+       }
+#endif
+
+       if (huge_tlb_pages == HUGE_TLB_OFF ||
+               (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED))
+       {
+               allocsize = *size;
+               ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0);
+       }
+
+       if (ptr == MAP_FAILED)
+               ereport(FATAL,
+                               (errmsg("could not map anonymous shared memory: %m"),
+                                (errno == ENOMEM) ?
+                                errhint("This error usually means that PostgreSQL's request "
+                                       "for a shared memory segment exceeded available memory, "
+                                         "swap space or huge pages. To reduce the request size "
+                                                "(currently  %zu bytes), reduce PostgreSQL's shared "
+                                          "memory usage, perhaps by reducing shared_buffers or "
+                                                "max_connections.",
+                                                *size) : 0));
+
+       *size = allocsize;
+       return ptr;
+}
+#endif
 
 /*
  * PGSharedMemoryCreate
@@ -344,7 +419,14 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
        PGShmemHeader *hdr;
        IpcMemoryId shmid;
        struct stat statbuf;
-       Size            sysvsize = size;
+       Size            sysvsize;
+
+#if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB)
+       if (huge_tlb_pages == HUGE_TLB_ON)
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("huge TLB pages not supported on this platform")));
+#endif
 
        /* Room for a header? */
        Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
@@ -359,6 +441,12 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
         * to run many copies of PostgreSQL without needing to adjust system
         * settings.
         *
+        * We assume that no one will attempt to run PostgreSQL 9.3 or later on
+        * systems that are ancient enough that anonymous shared memory is not
+        * supported, such as pre-2.4 versions of Linux.  If that turns out to be
+        * false, we might need to add a run-time test here and do this only if
+        * the running kernel supports it.
+        *
         * However, we disable this logic in the EXEC_BACKEND case, and fall back
         * to the old method of allocating the entire segment using System V
         * shared memory, because there's no way to attach an mmap'd segment to a
@@ -366,44 +454,13 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
         * developer use, this shouldn't be a big problem.
         */
 #ifndef EXEC_BACKEND
-       {
-               long            pagesize = sysconf(_SC_PAGE_SIZE);
-
-               /*
-                * Ensure request size is a multiple of pagesize.
-                *
-                * pagesize will, for practical purposes, always be a power of two.
-                * But just in case it isn't, we do it this way instead of using
-                * TYPEALIGN().
-                */
-               if (pagesize > 0 && size % pagesize != 0)
-                       size += pagesize - (size % pagesize);
+       AnonymousShmem = CreateAnonymousSegment(&size);
+       AnonymousShmemSize = size;
 
-               /*
-                * We assume that no one will attempt to run PostgreSQL 9.3 or later
-                * on systems that are ancient enough that anonymous shared memory is
-                * not supported, such as pre-2.4 versions of Linux.  If that turns
-                * out to be false, we might need to add a run-time test here and do
-                * this only if the running kernel supports it.
-                */
-               AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
-                                                         -1, 0);
-               if (AnonymousShmem == MAP_FAILED)
-                       ereport(FATAL,
-                                       (errmsg("could not map anonymous shared memory: %m"),
-                                        (errno == ENOMEM) ?
-                               errhint("This error usually means that PostgreSQL's request "
-                                        "for a shared memory segment exceeded available memory "
-                                         "or swap space. To reduce the request size (currently "
-                                         "%zu bytes), reduce PostgreSQL's shared memory usage, "
-                                               "perhaps by reducing shared_buffers or "
-                                               "max_connections.",
-                                               size) : 0));
-               AnonymousShmemSize = size;
-
-               /* Now we need only allocate a minimal-sized SysV shmem block. */
-               sysvsize = sizeof(PGShmemHeader);
-       }
+       /* Now we need only allocate a minimal-sized SysV shmem block. */
+       sysvsize = sizeof(PGShmemHeader);
+#else
+       sysvsize = size;
 #endif
 
        /* Make sure PGSharedMemoryAttach doesn't fail without need */
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 80f198277a3c89a5810773f91d6668087c516739..9b0cceb5309a1ac299c7329fafe4eb3dc42b1117 100644
@@ -128,6 +128,11 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
        DWORD           size_high;
        DWORD           size_low;
 
+       if (huge_tlb_pages == HUGE_TLB_ON)
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("huge TLB pages not supported on this platform")));
+
        /* Room for a header? */
        Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 2cc8f90e6d4b3562171198607abcd3738b12b121..a9b9794965b6b9961a8b831e204bdf5372026b6a 100644
@@ -64,6 +64,7 @@
 #include "storage/dsm_impl.h"
 #include "storage/standby.h"
 #include "storage/fd.h"
+#include "storage/pg_shmem.h"
 #include "storage/proc.h"
 #include "storage/predicate.h"
 #include "tcop/tcopprot.h"
@@ -387,6 +388,23 @@ static const struct config_enum_entry synchronous_commit_options[] = {
        {NULL, 0, false}
 };
 
+/*
+ * Although only "on", "off", "try" are documented, we accept all the likely
+ * variants of "on" and "off".
+ */
+static const struct config_enum_entry huge_tlb_options[] = {
+       {"off", HUGE_TLB_OFF, false},
+       {"on", HUGE_TLB_ON, false},
+       {"try", HUGE_TLB_TRY, false},
+       {"true", HUGE_TLB_ON, true},
+       {"false", HUGE_TLB_OFF, true},
+       {"yes", HUGE_TLB_ON, true},
+       {"no", HUGE_TLB_OFF, true},
+       {"1", HUGE_TLB_ON, true},
+       {"0", HUGE_TLB_OFF, true},
+       {NULL, 0, false}
+};
+
 /*
  * Options for enum values stored in other modules
  */
@@ -447,6 +465,12 @@ int                        tcp_keepalives_idle;
 int                    tcp_keepalives_interval;
 int                    tcp_keepalives_count;
 
+/*
+ * This really belongs in pg_shmem.c, but is defined here so that it doesn't
+ * need to be duplicated in all the different implementations of pg_shmem.c.
+ */
+int                    huge_tlb_pages;
+
 /*
  * These variables are all dummies that don't do anything, except in some
  * cases provide the value for SHOW to display.  The real state is elsewhere
@@ -3430,6 +3454,15 @@ static struct config_enum ConfigureNamesEnum[] =
                NULL, NULL, NULL
        },
 
+       {
+               {"huge_tlb_pages", PGC_POSTMASTER, RESOURCES_MEM,
+                       gettext_noop("Use of huge TLB pages on Linux"),
+                       NULL
+               },
+               &huge_tlb_pages,
+               HUGE_TLB_TRY, huge_tlb_options,
+               NULL, NULL, NULL
+       },
 
        /* End-of-list marker */
        {
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 7ad6b7cb4578c458cbbf429a7c2942300f416da2..c8673b382da2ae557e9c2287ea38f6b02f63d8b8 100644
 
 #shared_buffers = 32MB                 # min 128kB
                                        # (change requires restart)
+#huge_tlb_pages = try                  # on, off, or try
+                                       # (change requires restart)
 #temp_buffers = 8MB                    # min 800kB
 #max_prepared_transactions = 0         # zero disables the feature
                                        # (change requires restart)
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 22ef901e8912cc4f59fdf728ce4b49cd8defbde5..df094e801d939b4fb45c19f697ba96177847e07e 100644
@@ -38,6 +38,16 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */
 #endif
 } PGShmemHeader;
 
+/* GUC variable */
+extern int huge_tlb_pages;
+
+/* Possible values for huge_tlb_pages */
+typedef enum
+{
+       HUGE_TLB_OFF,
+       HUGE_TLB_ON,
+       HUGE_TLB_TRY
+} HugeTlbType;
 
 #ifdef EXEC_BACKEND
 #ifndef WIN32