Skip to content

Commit 661c0bf

Browse files
erthalionCommitfest Bot
authored andcommitted
Address space reservation for shared memory
Currently the shared memory layout is designed to pack everything tightly together, leaving no space between mappings for resizing. Here is what it looks like for one mapping in /proc/$PID/maps, where /dev/zero represents the anonymous shared memory we talk about: 00400000-00490000 /path/bin/postgres ... 012d9000-0133e000 [heap] 7f443a800000-7f470a800000 /dev/zero (deleted) 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive 7f4718400000-7f4718401000 /usr/lib64/libstdc++.so.6.0.34 ... Make the layout more dynamic by splitting every shared memory segment into two parts: * An anonymous file, which actually contains the shared memory content. Such an anonymous file is created via memfd_create; it lives in memory, behaves like a regular file and is semantically equivalent to anonymous memory allocated via mmap with MAP_ANONYMOUS. * A reservation mapping, whose size is much larger than the required shared segment size. This mapping is created with flags PROT_NONE (which makes sure the reserved space is not used), and MAP_NORESERVE (to not count the reserved space against memory limits). The anonymous file is mapped into this reservation mapping. The resulting layout looks like this: 00400000-00490000 /path/bin/postgres ... 3f526000-3f590000 rw-p [heap] 7fbd827fe000-7fbd8bdde000 rw-s /memfd:main (deleted) -- anon file 7fbd8bdde000-7fbe82800000 ---s /memfd:main (deleted) -- reservation 7fbe82800000-7fbe90670000 r--p /usr/lib/locale/locale-archive 7fbe90800000-7fbe90941000 r-xp /usr/lib64/libstdc++.so.6.0.34 To resize a shared memory segment in this layout it's possible to use ftruncate on the anonymous file, adjusting access permissions on the reserved space as needed. This approach also does not impact the actual memory usage as reported by the kernel. Here is the output of /proc/$PID/status for the master version with shared_buffers = 128 MB: // Peak virtual memory size, which is described as total pages // mapped in mm_struct. 
It corresponds to the mapped reserved space // and is the only number that grows with it. VmPeak: 2043192 kB // Size of memory portions. It contains RssAnon + RssFile + RssShmem VmRSS: 22908 kB // Size of resident anonymous memory RssAnon: 768 kB // Size of resident file mappings RssFile: 10364 kB // Size of resident shmem memory (includes SysV shm, mapping of tmpfs and // shared anonymous mappings) RssShmem: 11776 kB Here is the same for the patch when reserving 20GB of space: VmPeak: 21255824 kB VmRSS: 25020 kB RssAnon: 768 kB RssFile: 10812 kB RssShmem: 13440 kB Cgroup v2 doesn't have any problems with that either. To verify, a new cgroup was created with the memory limit 256 MB, then PostgreSQL was launched within this cgroup with shared_buffers = 128 MB: $ cd /sys/fs/cgroup $ mkdir postgres $ cd postgres $ echo 268435456 > memory.max $ echo $MASTER_PID_SHELL > cgroup.procs # postgres from the master branch has been successfully launched # from that shell $ cat memory.current 17465344 (~16.6 MB) # stop postgres $ echo $PATCH_PID_SHELL > cgroup.procs # postgres from the patch has been successfully launched from that shell $ cat memory.current 20770816 (~19.8 MB) To control the amount of space reserved, a new GUC max_available_memory is introduced. Ideally it should be based on the maximum available memory, hence the name. There are also a few unrelated advantages of using anon files: * We've got a file descriptor, which could be used for regular file operations (modification, truncation, you name it). * The file could be given a name, which improves readability when it comes to process maps. * By default, Linux will not add file-backed shared mappings into a core dump, making it more convenient to work with them in PostgreSQL: no more huge dumps to process. The downside is that memfd_create is Linux specific.
1 parent 4903c01 commit 661c0bf

File tree

9 files changed

+262
-60
lines changed

9 files changed

+262
-60
lines changed

src/backend/port/sysv_shmem.c

Lines changed: 235 additions & 55 deletions
Large diffs are not rendered by default.

src/backend/port/win32_shmem.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,7 @@ pgwin32_ReserveSharedMemoryRegion(HANDLE hChild)
627627
* use GetLargePageMinimum() instead.
628628
*/
629629
void
630-
GetHugePageSize(Size *hugepagesize, int *mmap_flags)
630+
GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags)
631631
{
632632
if (hugepagesize)
633633
*hugepagesize = 0;

src/backend/storage/ipc/ipci.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,9 @@ CreateSharedMemoryAndSemaphores(void)
206206

207207
Assert(!IsUnderPostmaster);
208208

209+
/* Decide if we use huge pages or regular size pages */
210+
PrepareHugePages();
211+
209212
for(int segment = 0; segment < ANON_MAPPINGS; segment++)
210213
{
211214
/* Compute the size of the shared-memory block */
@@ -377,7 +380,7 @@ InitializeShmemGUCs(void)
377380
/*
378381
* Calculate the number of huge pages required.
379382
*/
380-
GetHugePageSize(&hp_size, NULL);
383+
GetHugePageSize(&hp_size, NULL, NULL);
381384
if (hp_size != 0)
382385
{
383386
Size hp_required;

src/backend/storage/ipc/shmem.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -817,7 +817,7 @@ pg_get_shmem_pagesize(void)
817817
Assert(huge_pages_status != HUGE_PAGES_UNKNOWN);
818818

819819
if (huge_pages_status == HUGE_PAGES_ON)
820-
GetHugePageSize(&os_page_size, NULL);
820+
GetHugePageSize(&os_page_size, NULL, NULL);
821821

822822
return os_page_size;
823823
}

src/backend/utils/init/globals.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ int max_parallel_maintenance_workers = 2;
140140
* register background workers.
141141
*/
142142
int NBuffers = 16384;
143+
int MaxAvailableMemory = 524288;
143144
int MaxConnections = 100;
144145
int max_worker_processes = 8;
145146
int max_parallel_workers = 8;

src/backend/utils/misc/guc_tables.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2376,6 +2376,20 @@ struct config_int ConfigureNamesInt[] =
23762376
NULL, NULL, NULL
23772377
},
23782378

2379+
{
2380+
{"max_available_memory", PGC_SIGHUP, RESOURCES_MEM,
2381+
gettext_noop("Sets the upper limit for the shared_buffers value."),
2382+
gettext_noop("Shared memory could be resized at runtime, this "
2383+
"parameters sets the upper limit for it, beyond which "
2384+
"resizing would not be supported. Normally this value "
2385+
"would be the same as the total available memory."),
2386+
GUC_UNIT_BLOCKS
2387+
},
2388+
&MaxAvailableMemory,
2389+
524288, 16, INT_MAX / 2,
2390+
NULL, NULL, NULL
2391+
},
2392+
23792393
{
23802394
{"vacuum_buffer_usage_limit", PGC_USERSET, RESOURCES_MEM,
23812395
gettext_noop("Sets the buffer pool size for VACUUM, ANALYZE, and autovacuum."),

src/include/miscadmin.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ extern PGDLLIMPORT char *DataDir;
173173
extern PGDLLIMPORT int data_directory_mode;
174174

175175
extern PGDLLIMPORT int NBuffers;
176+
extern PGDLLIMPORT int MaxAvailableMemory;
176177
extern PGDLLIMPORT int MaxBackends;
177178
extern PGDLLIMPORT int MaxConnections;
178179
extern PGDLLIMPORT int max_worker_processes;

src/include/portability/mem.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
#define MAP_NOSYNC 0
3939
#endif
4040

41-
#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE)
41+
#define PG_MMAP_FLAGS (MAP_SHARED|MAP_HASSEMAPHORE)
4242

4343
/* Some really old systems don't define MAP_FAILED. */
4444
#ifndef MAP_FAILED

src/include/storage/pg_shmem.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ extern PGDLLIMPORT int shared_memory_type;
6161
extern PGDLLIMPORT int huge_pages;
6262
extern PGDLLIMPORT int huge_page_size;
6363
extern PGDLLIMPORT int huge_pages_status;
64+
extern PGDLLIMPORT int MaxAvailableMemory;
6465

6566
/* Possible values for huge_pages and huge_pages_status */
6667
typedef enum
@@ -104,7 +105,9 @@ extern PGShmemHeader *PGSharedMemoryCreate(Size size,
104105
PGShmemHeader **shim);
105106
extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
106107
extern void PGSharedMemoryDetach(void);
107-
extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags);
108+
extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags,
109+
int *memfd_flags);
110+
void PrepareHugePages(void);
108111

109112
/* The main segment, contains everything except buffer blocks and related data. */
110113
#define MAIN_SHMEM_SEGMENT 0

0 commit comments

Comments
 (0)