Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dsa.c
4 : * Dynamic shared memory areas.
5 : *
6 : * This module provides dynamic shared memory areas which are built on top of
7 : * DSM segments. While dsm.c allows segments of shared memory to be
8 : * created and shared between backends, it isn't designed to deal with small
9 : * objects. A DSA area is a shared memory heap usually backed by one or more
10 : * DSM segments; memory is allocated with dsa_allocate() and freed with dsa_free().
11 : * Alternatively, it can be created in pre-existing shared memory, including a
12 : * DSM segment, and will then create extra DSM segments as required. Unlike the
13 : * regular system heap, it deals in pseudo-pointers which must be converted to
14 : * backend-local pointers before they are dereferenced. These pseudo-pointers
15 : * can however be shared with other backends, and can be used to construct
16 : * shared data structures.
17 : *
18 : * Each DSA area manages a set of DSM segments, adding new segments as
19 : * required and detaching them when they are no longer needed. Each segment
20 : * contains a number of 4KB pages, a free page manager for tracking
21 : * consecutive runs of free pages, and a page map for tracking the source of
22 : * objects allocated on each page. Allocation requests above 8KB are handled
23 : * by choosing a segment and finding consecutive free pages in its free page
24 : * manager. Allocation requests for smaller sizes are handled using pools of
25 : * objects of a selection of sizes. Each pool consists of a number of 16-page
26 : * (64KB) superblocks allocated in the same way as large objects. Allocation
27 : * of large objects and new superblocks is serialized by a single LWLock, but
28 : * allocation of small objects from pre-existing superblocks uses one LWLock
29 : * per pool. Currently there is one pool, and therefore one lock, per size
30 : * class. Per-core pools to increase concurrency and strategies for reducing
31 : * the resulting fragmentation are areas for future research. Each superblock
32 : * is managed with a 'span', which tracks the superblock's freelist. Free
33 : * requests are handled by looking in the page map to find which span an
34 : * address was allocated from, so that small objects can be returned to the
35 : * appropriate free list, and large object pages can be returned directly to
36 : * the free page map. When allocating, simple heuristics for selecting
37 : * segments and superblocks try to encourage occupied memory to be
38 : * concentrated, increasing the likelihood that whole superblocks can become
39 : * empty and be returned to the free page manager, and whole segments can
40 : * become empty and be returned to the operating system.
41 : *
42 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
43 : * Portions Copyright (c) 1994, Regents of the University of California
44 : *
45 : * IDENTIFICATION
46 : * src/backend/utils/mmgr/dsa.c
47 : *
48 : *-------------------------------------------------------------------------
49 : */
50 :
51 : #include "postgres.h"
52 :
53 : #include "port/atomics.h"
54 : #include "port/pg_bitutils.h"
55 : #include "storage/dsm.h"
56 : #include "storage/lwlock.h"
57 : #include "utils/dsa.h"
58 : #include "utils/freepage.h"
59 : #include "utils/memutils.h"
60 : #include "utils/resowner.h"
61 :
62 : /*
63 : * How many segments to create before we double the segment size. If this is
64 : * low, then there is likely to be a lot of wasted space in the largest
65 : * segment. If it is high, then we risk running out of segment slots (see
66 : * dsm.c's limits on total number of segments), or limiting the total size
67 : * an area can manage when using small pointers.
68 : */
69 : #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
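
/*
 * A hypothetical illustration, not part of dsa.c: under this policy, with
 * DSA_NUM_SEGMENTS_AT_EACH_SIZE == 2 and an initial segment size of 1MB,
 * newly created segments would be sized 1MB, 1MB, 2MB, 2MB, 4MB, 4MB, ...
 * until the area's maximum segment size is reached.
 */
static size_t
example_nth_segment_size(size_t init_segment_size, int nth_segment)
{
    /* The size doubles after every DSA_NUM_SEGMENTS_AT_EACH_SIZE segments. */
    return init_segment_size << (nth_segment / DSA_NUM_SEGMENTS_AT_EACH_SIZE);
}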
70 :
71 : /*
72 : * The maximum number of DSM segments that an area can own, determined by
73 : * the number of dsa_pointer bits remaining after the offset (but capped at 1024).
74 : */
75 : #define DSA_MAX_SEGMENTS \
76 : Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
77 :
78 : /* The bitmask for extracting the offset from a dsa_pointer. */
79 : #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
80 :
81 : /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
82 : #define DSA_PAGES_PER_SUPERBLOCK 16
83 :
84 : /*
85 : * A magic number used as a sanity check for following DSM segments belonging
86 : * to a DSA area (this number will be XORed with the area handle and
87 : * the segment index).
88 : */
89 : #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
90 :
91 : /* Build a dsa_pointer given a segment number and offset. */
92 : #define DSA_MAKE_POINTER(segment_number, offset) \
93 : (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
94 :
95 : /* Extract the segment number from a dsa_pointer. */
96 : #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
97 :
98 : /* Extract the offset from a dsa_pointer. */
99 : #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
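
/*
 * A hypothetical round-trip check, not part of dsa.c: packing a segment
 * number and offset with DSA_MAKE_POINTER and then extracting them recovers
 * the original values (provided the offset fits in DSA_OFFSET_WIDTH bits).
 */
static void
example_dsa_pointer_roundtrip(void)
{
    dsa_pointer dp = DSA_MAKE_POINTER(3, 0x1234);

    Assert(DSA_EXTRACT_SEGMENT_NUMBER(dp) == 3);
    Assert(DSA_EXTRACT_OFFSET(dp) == 0x1234);
}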
100 :
101 : /* The type used for segment indexes (zero based). */
102 : typedef size_t dsa_segment_index;
103 :
104 : /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
105 : #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
106 :
107 : /*
108 : * How many bins of segments do we have? The bins are used to categorize
109 : * segments by their largest contiguous run of free pages.
110 : */
111 : #define DSA_NUM_SEGMENT_BINS 16
112 :
113 : /*
114 : * What is the lowest bin that holds segments that *might* have n contiguous
115 : * free pages? There is no point in looking in segments in lower bins; they
116 : * definitely can't service a request for n free pages.
117 : */
118 : static inline size_t
119 42680 : contiguous_pages_to_segment_bin(size_t n)
120 : {
121 : size_t bin;
122 :
123 42680 : if (n == 0)
124 1436 : bin = 0;
125 : else
126 41244 : bin = pg_leftmost_one_pos_size_t(n) + 1;
127 :
128 42680 : return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
129 : }
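
/*
 * A few worked values for the binning above (illustration only):
 *
 *   contiguous_pages_to_segment_bin(0)  == 0
 *   contiguous_pages_to_segment_bin(1)  == 1   (leftmost one bit at pos 0)
 *   contiguous_pages_to_segment_bin(3)  == 2   (leftmost one bit at pos 1)
 *   contiguous_pages_to_segment_bin(15) == 4   (leftmost one bit at pos 3)
 *   contiguous_pages_to_segment_bin(1 << 20) == 15  (clamped to the top bin)
 *
 * So bin i (for i > 0) holds segments whose largest free run is in the range
 * [2^(i-1), 2^i - 1] pages, and the top bin is open-ended.
 */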
130 :
131 : /* Macros for access to locks. */
132 : #define DSA_AREA_LOCK(area) (&area->control->lock)
133 : #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
134 :
135 : /*
136 : * The header for an individual segment. This lives at the start of each DSM
137 : * segment owned by a DSA area, including the first segment (where it appears
138 : * as part of the dsa_area_control struct).
139 : */
140 : typedef struct
141 : {
142 : /* Sanity check magic value. */
143 : uint32 magic;
144 : /* Total number of pages in this segment (excluding metadata area). */
145 : size_t usable_pages;
146 : /* Total size of this segment in bytes. */
147 : size_t size;
148 :
149 : /*
150 : * Index of the segment that precedes this one in the same segment bin, or
151 : * DSA_SEGMENT_INDEX_NONE if this is the first one.
152 : */
153 : dsa_segment_index prev;
154 :
155 : /*
156 : * Index of the segment that follows this one in the same segment bin, or
157 : * DSA_SEGMENT_INDEX_NONE if this is the last one.
158 : */
159 : dsa_segment_index next;
160 : /* The index of the bin that contains this segment. */
161 : size_t bin;
162 :
163 : /*
164 : * A flag raised to indicate that this segment is being returned to the
165 : * operating system and has been unpinned.
166 : */
167 : bool freed;
168 : } dsa_segment_header;
169 :
170 : /*
171 : * Metadata for one superblock.
172 : *
173 : * For most blocks, span objects are stored out-of-line; that is, the span
174 : * object is not stored within the block itself. But, as an exception, for a
175 : * "span of spans", the span object is stored "inline". The allocation is
176 : * always exactly one page, and the dsa_area_span object is located at
177 : * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
178 : * and the remaining fields are used just as they would be in an ordinary
179 : * block. We can't allocate spans out of ordinary superblocks because
180 : * creating an ordinary superblock requires us to be able to allocate a span
181 : * *first*. Doing it this way avoids that circularity.
182 : */
183 : typedef struct
184 : {
185 : dsa_pointer pool; /* Containing pool. */
186 : dsa_pointer prevspan; /* Previous span. */
187 : dsa_pointer nextspan; /* Next span. */
188 : dsa_pointer start; /* Starting address. */
189 : size_t npages; /* Length of span in pages. */
190 : uint16 size_class; /* Size class. */
191 : uint16 ninitialized; /* Maximum number of objects ever allocated. */
192 : uint16 nallocatable; /* Number of objects currently allocatable. */
193 : uint16 firstfree; /* First object on free list. */
194 : uint16 nmax; /* Maximum number of objects ever possible. */
195 : uint16 fclass; /* Current fullness class. */
196 : } dsa_area_span;
197 :
198 : /*
199 : * Given a pointer to an object in a span, access the index of the next free
200 : * object in the same span (i.e. in the span's freelist) as an L-value.
201 : */
202 : #define NextFreeObjectIndex(object) (* (uint16 *) (object))
203 :
204 : /*
205 : * Small allocations are handled by dividing a single block of memory into
206 : * many small objects of equal size. The possible allocation sizes are
207 : * defined by the following array. Larger size classes are spaced more widely
208 : * than smaller size classes. We fudge the spacing for size classes >1kB to
209 : * avoid space wastage: based on the knowledge that we plan to allocate 64kB
210 : * blocks, we bump the maximum object size up to the largest multiple of
211 : * 8 bytes that still lets us fit the same number of objects into one block.
212 : *
213 : * NB: Because of this fudging, if we were ever to use differently-sized blocks
214 : * for small allocations, these size classes would need to be reworked to be
215 : * optimal for the new size.
216 : *
217 : * NB: The optimal spacing for size classes, as well as the size of the blocks
218 : * out of which small objects are allocated, is not a question that has one
219 : * right answer. Some allocators (such as tcmalloc) use more closely-spaced
220 : * size classes than we do here, while others (like aset.c) use more
221 : * widely-spaced classes. Spacing the classes more closely avoids wasting
222 : * memory within individual chunks, but also means a larger number of
223 : * potentially-unfilled blocks.
224 : */
225 : static const uint16 dsa_size_classes[] = {
226 : sizeof(dsa_area_span), 0, /* special size classes */
227 : 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
228 : 80, 96, 112, 128, /* 4 classes separated by 16 bytes */
229 : 160, 192, 224, 256, /* 4 classes separated by 32 bytes */
230 : 320, 384, 448, 512, /* 4 classes separated by 64 bytes */
231 : 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
232 : 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
233 : 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
234 : 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
235 : };
236 : #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
237 :
238 : /* Special size classes. */
239 : #define DSA_SCLASS_BLOCK_OF_SPANS 0
240 : #define DSA_SCLASS_SPAN_LARGE 1
241 :
242 : /*
243 : * The following lookup table is used to map the size of small objects
244 : * (less than 1kB) onto the corresponding size class. To use this table,
245 : * round the size of the object up to the next multiple of 8 bytes, and then
246 : * index into this array.
247 : */
248 : static const uint8 dsa_size_class_map[] = {
249 : 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
250 : 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
251 : 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
252 : 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
253 : 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
254 : 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
255 : 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
256 : 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
257 : };
258 : #define DSA_SIZE_CLASS_MAP_QUANTUM 8
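
/*
 * A hypothetical sketch, not part of dsa.c, mirroring the lookup that
 * dsa_allocate_extended() performs for small sizes. For example, a 52-byte
 * request rounds up to index (56 / 8) - 1 = 6, where the table holds size
 * class 8, and dsa_size_classes[8] == 56 is the smallest class that fits.
 */
static uint16
example_size_to_class(size_t size)
{
    int mapidx;

    Assert(size > 0 &&
           size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM);

    /* Round up to the next multiple of 8 bytes, then index the table. */
    mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
              DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
    return dsa_size_class_map[mapidx];
}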
259 :
260 : /*
261 : * Superblocks are binned by how full they are. Generally, each fullness
262 : * class corresponds to one quartile, but the block being used for
263 : * allocations is always at the head of the list for fullness class 1,
264 : * regardless of how full it really is.
265 : */
266 : #define DSA_FULLNESS_CLASSES 4
267 :
268 : /*
269 : * A dsa_area_pool represents a set of objects of a given size class.
270 : *
271 : * Perhaps there should be multiple pools for the same size class for
272 : * contention avoidance, but for now there is just one!
273 : */
274 : typedef struct
275 : {
276 : /* A lock protecting access to this pool. */
277 : LWLock lock;
278 : /* A set of linked lists of spans, arranged by fullness. */
279 : dsa_pointer spans[DSA_FULLNESS_CLASSES];
280 : /* Should we pad this out to a cacheline boundary? */
281 : } dsa_area_pool;
282 :
283 : /*
284 : * The control block for an area. This lives in shared memory, at the start of
285 : * the first DSM segment controlled by this area.
286 : */
287 : typedef struct
288 : {
289 : /* The segment header for the first segment. */
290 : dsa_segment_header segment_header;
291 : /* The handle for this area. */
292 : dsa_handle handle;
293 : /* The handles of the segments owned by this area. */
294 : dsm_handle segment_handles[DSA_MAX_SEGMENTS];
295 : /* Lists of segments, binned by maximum contiguous run of free pages. */
296 : dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
297 : /* The object pools for each size class. */
298 : dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
299 : /* initial allocation segment size */
300 : size_t init_segment_size;
301 : /* maximum allocation segment size */
302 : size_t max_segment_size;
303 : /* The total size of all active segments. */
304 : size_t total_segment_size;
305 : /* The maximum total size of backing storage we are allowed. */
306 : size_t max_total_segment_size;
307 : /* Highest used segment index in the history of this area. */
308 : dsa_segment_index high_segment_index;
309 : /* The reference count for this area. */
310 : int refcnt;
311 : /* A flag indicating that this area has been pinned. */
312 : bool pinned;
313 : /* The number of times that segments have been freed. */
314 : size_t freed_segment_counter;
315 : /* The LWLock tranche ID. */
316 : int lwlock_tranche_id;
317 : /* The general lock (protects everything except object pools). */
318 : LWLock lock;
319 : } dsa_area_control;
320 :
321 : /* Given a pointer to a pool, find a dsa_pointer. */
322 : #define DsaAreaPoolToDsaPointer(area, p) \
323 : DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
324 :
325 : /*
326 : * A dsa_segment_map is stored within the backend-private memory of each
327 : * individual backend. It holds the base address of the segment within that
328 : * backend, plus the addresses of key objects within the segment. Those
329 : * could instead be derived from the base address but it's handy to have them
330 : * around.
331 : */
332 : typedef struct
333 : {
334 : dsm_segment *segment; /* DSM segment */
335 : char *mapped_address; /* Address at which segment is mapped */
336 : dsa_segment_header *header; /* Header (same as mapped_address) */
337 : FreePageManager *fpm; /* Free page manager within segment. */
338 : dsa_pointer *pagemap; /* Page map within segment. */
339 : } dsa_segment_map;
340 :
341 : /*
342 : * Per-backend state for a storage area. Backends obtain one of these by
343 : * creating an area or attaching to an existing one using a handle. Each
344 : * process that needs to use an area uses its own object to track where the
345 : * segments are mapped.
346 : */
347 : struct dsa_area
348 : {
349 : /* Pointer to the control object in shared memory. */
350 : dsa_area_control *control;
351 :
352 : /*
353 : * All the mappings are owned by this. The dsa_area itself is not
354 : * directly tracked by the ResourceOwner, but the effect is the same. NULL
355 : * if the attachment has session lifespan, i.e if dsa_pin_mapping() has
356 : * been called.
357 : */
358 : ResourceOwner resowner;
359 :
360 : /*
361 : * This backend's array of segment maps, ordered by segment index
362 : * corresponding to control->segment_handles. Some of the area's segments
363 : * may not be mapped in this backend yet, and some slots may have been
364 : * freed and need to be detached; these operations happen on demand.
365 : */
366 : dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
367 :
368 : /* The highest segment index this backend has ever mapped. */
369 : dsa_segment_index high_segment_index;
370 :
371 : /* The last observed freed_segment_counter. */
372 : size_t freed_segment_counter;
373 : };
374 :
375 : #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
376 : #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
377 :
378 : /* Given a pointer to a segment_map, obtain a segment index number. */
379 : #define get_segment_index(area, segment_map_ptr) \
380 : (segment_map_ptr - &area->segment_maps[0])
381 :
382 : static void init_span(dsa_area *area, dsa_pointer span_pointer,
383 : dsa_area_pool *pool, dsa_pointer start, size_t npages,
384 : uint16 size_class);
385 : static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
386 : int fromclass, int toclass);
387 : static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
388 : static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
389 : int size_class);
390 : static dsa_segment_map *get_segment_by_index(dsa_area *area,
391 : dsa_segment_index index);
392 : static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
393 : static void unlink_span(dsa_area *area, dsa_area_span *span);
394 : static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
395 : dsa_pointer span_pointer, int fclass);
396 : static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
397 : static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
398 : static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
399 : static dsa_area *create_internal(void *place, size_t size,
400 : int tranche_id,
401 : dsm_handle control_handle,
402 : dsm_segment *control_segment,
403 : size_t init_segment_size,
404 : size_t max_segment_size);
405 : static dsa_area *attach_internal(void *place, dsm_segment *segment,
406 : dsa_handle handle);
407 : static void check_for_freed_segments(dsa_area *area);
408 : static void check_for_freed_segments_locked(dsa_area *area);
409 : static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map);
410 :
411 : /*
412 : * Create a new shared area in a new DSM segment. Further DSM segments will
413 : * be allocated as required to extend the available space.
414 : *
415 : * We can't allocate a LWLock tranche_id within this function, because tranche
416 : * IDs are a scarce resource; there are only 64k available, using low numbers
417 : * when possible matters, and we have no provision for recycling them. So,
418 : * we require the caller to provide one.
419 : */
420 : dsa_area *
421 178 : dsa_create_ext(int tranche_id, size_t init_segment_size, size_t max_segment_size)
422 : {
423 : dsm_segment *segment;
424 : dsa_area *area;
425 :
426 : /*
427 : * Create the DSM segment that will hold the shared control object and the
428 : * first segment of usable space.
429 : */
430 178 : segment = dsm_create(init_segment_size, 0);
431 :
432 : /*
433 : * All segments backing this area are pinned, so that DSA can explicitly
434 : * control their lifetime (otherwise a newly created segment belonging to
435 : * this area might be freed when the only backend that happens to have it
436 : * mapped in ends, corrupting the area).
437 : */
438 178 : dsm_pin_segment(segment);
439 :
440 : /* Create a new DSA area with the control object in this segment. */
441 178 : area = create_internal(dsm_segment_address(segment),
442 : init_segment_size,
443 : tranche_id,
444 : dsm_segment_handle(segment), segment,
445 : init_segment_size, max_segment_size);
446 :
447 : /* Clean up when the control segment detaches. */
448 178 : on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
449 178 : PointerGetDatum(dsm_segment_address(segment)));
450 :
451 178 : return area;
452 : }
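
/*
 * A hypothetical caller-side sketch, not part of dsa.c, of the typical
 * create/allocate/share flow. The tranche_id is assumed to have been
 * obtained by the caller, e.g. from LWLockNewTrancheId().
 */
static dsa_handle
example_create_and_allocate(int tranche_id)
{
    dsa_area   *area;
    dsa_pointer dp;
    char       *p;

    area = dsa_create(tranche_id);  /* wraps dsa_create_ext() */
    dp = dsa_allocate(area, 100);   /* wraps dsa_allocate_extended() */
    p = dsa_get_address(area, dp);  /* convert to a backend-local pointer */
    memset(p, 0, 100);
    dsa_free(area, dp);

    /* Other backends can pass this handle to dsa_attach(). */
    return dsa_get_handle(area);
}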
453 :
454 : /*
455 : * Create a new shared area in an existing shared memory space, which may be
456 : * either DSM or Postmaster-initialized memory. DSM segments will be
457 : * allocated as required to extend the available space, though that can be
458 : * prevented with dsa_set_size_limit(area, size) using the same size provided
459 : * to dsa_create_in_place.
460 : *
461 : * Areas created in-place must eventually be released by the backend that
462 : * created them and all backends that attach to them. This can be done
463 : * explicitly with dsa_release_in_place, or, in the special case that 'place'
464 : * happens to be in a pre-existing DSM segment, by passing in a pointer to the
465 : * segment so that a detach hook can be registered with the containing DSM
466 : * segment.
467 : *
468 : * See dsa_create() for a note about the tranche arguments.
469 : */
470 : dsa_area *
471 2990 : dsa_create_in_place_ext(void *place, size_t size,
472 : int tranche_id, dsm_segment *segment,
473 : size_t init_segment_size, size_t max_segment_size)
474 : {
475 : dsa_area *area;
476 :
477 2990 : area = create_internal(place, size, tranche_id,
478 : DSM_HANDLE_INVALID, NULL,
479 : init_segment_size, max_segment_size);
480 :
481 : /*
482 : * Clean up when the control segment detaches, if a containing DSM segment
483 : * was provided.
484 : */
485 2990 : if (segment != NULL)
486 856 : on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
487 : PointerGetDatum(place));
488 :
489 2990 : return area;
490 : }
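
/*
 * A hypothetical sketch, not part of dsa.c: create an area inside memory the
 * caller already owns and cap it at that size, as described above.
 */
static dsa_area *
example_create_in_place(void *place, size_t size, int tranche_id)
{
    dsa_area   *area;

    Assert(size >= dsa_minimum_size());
    area = dsa_create_in_place(place, size, tranche_id, NULL);

    /* Prevent any extra DSM segments from ever being created. */
    dsa_set_size_limit(area, size);

    return area;
}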
491 :
492 : /*
493 : * Obtain a handle that can be passed to other processes so that they can
494 : * attach to the given area. Cannot be called for areas created with
495 : * dsa_create_in_place.
496 : */
497 : dsa_handle
498 172 : dsa_get_handle(dsa_area *area)
499 : {
500 : Assert(area->control->handle != DSA_HANDLE_INVALID);
501 172 : return area->control->handle;
502 : }
503 :
504 : /*
505 : * Attach to an area given a handle generated (possibly in another process) by
506 : * dsa_get_handle. The area must have been created with dsa_create (not
507 : * dsa_create_in_place).
508 : */
509 : dsa_area *
510 332 : dsa_attach(dsa_handle handle)
511 : {
512 : dsm_segment *segment;
513 : dsa_area *area;
514 :
515 : /*
516 : * An area handle is really a DSM segment handle for the first segment, so
517 : * we go ahead and attach to that.
518 : */
519 332 : segment = dsm_attach(handle);
520 332 : if (segment == NULL)
521 0 : ereport(ERROR,
522 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
523 : errmsg("could not attach to dynamic shared area")));
524 :
525 332 : area = attach_internal(dsm_segment_address(segment), segment, handle);
526 :
527 : /* Clean up when the control segment detaches. */
528 332 : on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
529 332 : PointerGetDatum(dsm_segment_address(segment)));
530 :
531 332 : return area;
532 : }
533 :
534 : /*
535 : * Returns whether the area with the given handle was already attached by the
536 : * current process. The area must have been created with dsa_create (not
537 : * dsa_create_in_place).
538 : */
539 : bool
540 4 : dsa_is_attached(dsa_handle handle)
541 : {
542 : /*
543 : * An area handle is really a DSM segment handle for the first segment, so
544 : * we can just search for that.
545 : */
546 4 : return dsm_find_mapping(handle) != NULL;
547 : }
548 :
549 : /*
550 : * Attach to an area that was created with dsa_create_in_place. The caller
551 : * must somehow know the location in memory that was used when the area was
552 : * created, though it may be mapped at a different virtual address in this
553 : * process.
554 : *
555 : * See dsa_create_in_place for a note about releasing in-place areas, and the
556 : * optional 'segment' argument which can be provided to allow automatic
557 : * release if the containing memory happens to be a DSM segment.
558 : */
559 : dsa_area *
560 47698 : dsa_attach_in_place(void *place, dsm_segment *segment)
561 : {
562 : dsa_area *area;
563 :
564 47698 : area = attach_internal(place, NULL, DSA_HANDLE_INVALID);
565 :
566 : /*
567 : * Clean up when the control segment detaches, if a containing DSM segment
568 : * was provided.
569 : */
570 47698 : if (segment != NULL)
571 5258 : on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
572 : PointerGetDatum(place));
573 :
574 47698 : return area;
575 : }
576 :
577 : /*
578 : * Release a DSA area that was produced by dsa_create_in_place or
579 : * dsa_attach_in_place. The 'segment' argument is ignored but provides an
580 : * interface suitable for on_dsm_detach, for the convenience of users who want
581 : * to create a DSA segment inside an existing DSM segment and have it
582 : * automatically released when the containing DSM segment is detached.
583 : * 'place' should be the address of the place where the area was created.
584 : *
585 : * This callback is automatically registered for the DSM segment containing
586 : * the control object of in-place areas when a segment is provided to
587 : * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
588 : * with dsa_create.
589 : */
590 : void
591 6624 : dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
592 : {
593 6624 : dsa_release_in_place(DatumGetPointer(place));
594 6624 : }
595 :
596 : /*
597 : * Release a DSA area that was produced by dsa_create_in_place or
598 : * dsa_attach_in_place. The 'code' argument is ignored but provides an
599 : * interface suitable for on_shmem_exit or before_shmem_exit, for the
600 : * convenience of users who want to create a DSA segment inside shared memory
601 : * other than a DSM segment and have it automatically release at backend exit.
602 : * 'place' should be the address of the place where the area was created.
603 : */
604 : void
605 0 : dsa_on_shmem_exit_release_in_place(int code, Datum place)
606 : {
607 0 : dsa_release_in_place(DatumGetPointer(place));
608 0 : }
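
/*
 * Hypothetical usage of the callback above, not part of dsa.c: 'place' is
 * assumed to be the address where an in-place area was created, inside
 * shared memory that outlives the current backend.
 */
static void
example_register_release_at_exit(void *place)
{
    on_shmem_exit(dsa_on_shmem_exit_release_in_place,
                  PointerGetDatum(place));
}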
609 :
610 : /*
611 : * Release a DSA area that was produced by dsa_create_in_place or
612 : * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
613 : * callbacks so that this is managed automatically, because failure to release
614 : * an area created in-place leaks its segments permanently.
615 : *
616 : * This is also called automatically for areas produced by dsa_create or
617 : * dsa_attach as an implementation detail.
618 : */
619 : void
620 49064 : dsa_release_in_place(void *place)
621 : {
622 49064 : dsa_area_control *control = (dsa_area_control *) place;
623 : int i;
624 :
625 49064 : LWLockAcquire(&control->lock, LW_EXCLUSIVE);
626 : Assert(control->segment_header.magic ==
627 : (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
628 : Assert(control->refcnt > 0);
629 49064 : if (--control->refcnt == 0)
630 : {
631 2070 : for (i = 0; i <= control->high_segment_index; ++i)
632 : {
633 : dsm_handle handle;
634 :
635 1164 : handle = control->segment_handles[i];
636 1164 : if (handle != DSM_HANDLE_INVALID)
637 308 : dsm_unpin_segment(handle);
638 : }
639 : }
640 49064 : LWLockRelease(&control->lock);
641 49064 : }
642 :
643 : /*
644 : * Keep a DSA area attached until end of session or explicit detach.
645 : *
646 : * By default, areas are owned by the current resource owner, which means they
647 : * are detached automatically when that scope ends.
648 : */
649 : void
650 45726 : dsa_pin_mapping(dsa_area *area)
651 : {
652 : int i;
653 :
654 45726 : if (area->resowner != NULL)
655 : {
656 3158 : area->resowner = NULL;
657 :
658 6342 : for (i = 0; i <= area->high_segment_index; ++i)
659 3184 : if (area->segment_maps[i].segment != NULL)
660 314 : dsm_pin_mapping(area->segment_maps[i].segment);
661 : }
662 45726 : }
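
/*
 * A hypothetical sketch, not part of dsa.c: attach to an existing area for
 * session lifetime. After dsa_pin_mapping(), the mapping is no longer owned
 * by the current resource owner and survives until backend exit or explicit
 * detach, so the dsa_area can be cached in backend-local state.
 */
static dsa_area *
example_attach_for_session(dsa_handle handle)
{
    dsa_area   *area = dsa_attach(handle);

    dsa_pin_mapping(area);
    return area;
}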
663 :
664 : /*
665 : * Allocate memory in this storage area. The return value is a dsa_pointer
666 : * that can be passed to other processes, and converted to a local pointer
667 : * with dsa_get_address. 'flags' is a bitmap which should be constructed
668 : * from the following values:
669 : *
670 : * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
671 : * will result in an ERROR.
672 : *
673 : * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
674 : * no memory is available or a size limit established by dsa_set_size_limit
675 : * would be exceeded. Otherwise, such allocations will result in an ERROR.
676 : *
677 : * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
678 : * contents of newly-allocated memory are indeterminate.
679 : *
680 : * These flags correspond to similarly named flags used by
681 : * MemoryContextAllocExtended(). See also the macros dsa_allocate and
682 : * dsa_allocate0 which expand to a call to this function with commonly used
683 : * flags.
684 : */
685 : dsa_pointer
686 1285740 : dsa_allocate_extended(dsa_area *area, size_t size, int flags)
687 : {
688 : uint16 size_class;
689 : dsa_pointer start_pointer;
690 : dsa_segment_map *segment_map;
691 : dsa_pointer result;
692 :
693 : Assert(size > 0);
694 :
695 : /* Sanity check on huge individual allocation size. */
696 1285740 : if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
697 1285740 : ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
698 0 : elog(ERROR, "invalid DSA memory alloc request size %zu", size);
699 :
700 : /*
701 : * If bigger than the largest size class, just grab a run of pages from
702 : * the free page manager, instead of allocating an object from a pool.
703 : * There will still be a span, but it's a special class of span that
704 : * manages this whole allocation and simply gives all pages back to the
705 : * free page manager when dsa_free is called.
706 : */
707 1285740 : if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
708 : {
709 5500 : size_t npages = fpm_size_to_pages(size);
710 : size_t first_page;
711 : dsa_pointer span_pointer;
712 5500 : dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
713 :
714 : /* Obtain a span object. */
715 5500 : span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
716 5500 : if (!DsaPointerIsValid(span_pointer))
717 : {
718 : /* Raise error unless asked not to. */
719 0 : if ((flags & DSA_ALLOC_NO_OOM) == 0)
720 0 : ereport(ERROR,
721 : (errcode(ERRCODE_OUT_OF_MEMORY),
722 : errmsg("out of memory"),
723 : errdetail("Failed on DSA request of size %zu.",
724 : size)));
725 0 : return InvalidDsaPointer;
726 : }
727 :
728 5500 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
729 :
730 : /* Find a segment from which to allocate. */
731 5500 : segment_map = get_best_segment(area, npages);
732 5500 : if (segment_map == NULL)
733 46 : segment_map = make_new_segment(area, npages);
734 5500 : if (segment_map == NULL)
735 : {
736 : /* Can't make any more segments: game over. */
737 0 : LWLockRelease(DSA_AREA_LOCK(area));
738 0 : dsa_free(area, span_pointer);
739 :
740 : /* Raise error unless asked not to. */
741 0 : if ((flags & DSA_ALLOC_NO_OOM) == 0)
742 0 : ereport(ERROR,
743 : (errcode(ERRCODE_OUT_OF_MEMORY),
744 : errmsg("out of memory"),
745 : errdetail("Failed on DSA request of size %zu.",
746 : size)));
747 0 : return InvalidDsaPointer;
748 : }
749 :
750 : /*
751 : * Ask the free page manager for a run of pages. This should always
752 : * succeed, since both get_best_segment and make_new_segment should
753 : * only return a non-NULL pointer if it actually contains enough
754 : * contiguous freespace. If it does fail, something in our backend
755 : * private state is out of whack, so use FATAL to kill the process.
756 : */
757 5500 : if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
758 0 : elog(FATAL,
759 : "dsa_allocate could not find %zu free pages", npages);
760 5500 : LWLockRelease(DSA_AREA_LOCK(area));
761 :
762 5500 : start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
763 : first_page * FPM_PAGE_SIZE);
764 :
765 : /* Initialize span and pagemap. */
766 5500 : LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
767 : LW_EXCLUSIVE);
768 5500 : init_span(area, span_pointer, pool, start_pointer, npages,
769 : DSA_SCLASS_SPAN_LARGE);
770 5500 : segment_map->pagemap[first_page] = span_pointer;
771 5500 : LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
772 :
773 : /* Zero-initialize the memory if requested. */
774 5500 : if ((flags & DSA_ALLOC_ZERO) != 0)
775 1480 : memset(dsa_get_address(area, start_pointer), 0, size);
776 :
777 5500 : return start_pointer;
778 : }
779 :
780 : /* Map allocation to a size class. */
781 1280240 : if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
782 : {
783 : int mapidx;
784 :
785 : /* For smaller sizes we have a lookup table... */
786 1235272 : mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
787 1235272 : DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
788 1235272 : size_class = dsa_size_class_map[mapidx];
789 : }
790 : else
791 : {
792 : uint16 min;
793 : uint16 max;
794 :
795 : /* ... and for the rest we search by binary chop. */
796 44968 : min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
797 44968 : max = lengthof(dsa_size_classes) - 1;
798 :
799 186016 : while (min < max)
800 : {
801 141048 : uint16 mid = (min + max) / 2;
802 141048 : uint16 class_size = dsa_size_classes[mid];
803 :
804 141048 : if (class_size < size)
805 84342 : min = mid + 1;
806 : else
807 56706 : max = mid;
808 : }
809 :
810 44968 : size_class = min;
811 : }
812 : Assert(size <= dsa_size_classes[size_class]);
813 : Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
814 :
815 : /* Attempt to allocate an object from the appropriate pool. */
816 1280240 : result = alloc_object(area, size_class);
817 :
818 : /* Check for failure to allocate. */
819 1280240 : if (!DsaPointerIsValid(result))
820 : {
821 : /* Raise error unless asked not to. */
822 0 : if ((flags & DSA_ALLOC_NO_OOM) == 0)
823 0 : ereport(ERROR,
824 : (errcode(ERRCODE_OUT_OF_MEMORY),
825 : errmsg("out of memory"),
826 : errdetail("Failed on DSA request of size %zu.", size)));
827 0 : return InvalidDsaPointer;
828 : }
829 :
830 : /* Zero-initialize the memory if requested. */
831 1280240 : if ((flags & DSA_ALLOC_ZERO) != 0)
832 631376 : memset(dsa_get_address(area, result), 0, size);
833 :
834 1280240 : return result;
835 : }
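
/*
 * A hypothetical sketch, not part of dsa.c: use DSA_ALLOC_NO_OOM to degrade
 * gracefully instead of raising an ERROR when memory runs out or the area's
 * size limit would be exceeded.
 */
static dsa_pointer
example_try_allocate(dsa_area *area, size_t size)
{
    dsa_pointer dp;

    dp = dsa_allocate_extended(area, size,
                               DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
    if (!DsaPointerIsValid(dp))
    {
        /* A caller-specific fallback would go here, e.g. evict and retry. */
        return InvalidDsaPointer;
    }
    return dp;
}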
836 :
837 : /*
838 : * Free memory obtained with dsa_allocate.
839 : */
840 : void
841 235822 : dsa_free(dsa_area *area, dsa_pointer dp)
842 : {
843 : dsa_segment_map *segment_map;
844 : int pageno;
845 : dsa_pointer span_pointer;
846 : dsa_area_span *span;
847 : char *superblock;
848 : char *object;
849 : size_t size;
850 : int size_class;
851 :
852 : /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
853 235822 : check_for_freed_segments(area);
854 :
855 : /* Locate the object, span and pool. */
856 235822 : segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
857 235822 : pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
858 235822 : span_pointer = segment_map->pagemap[pageno];
859 235822 : span = dsa_get_address(area, span_pointer);
860 235822 : superblock = dsa_get_address(area, span->start);
861 235822 : object = dsa_get_address(area, dp);
862 235822 : size_class = span->size_class;
863 235822 : size = dsa_size_classes[size_class];
864 :
865 : /*
866 : * Special case for large objects that live in a special span: we return
867 : * those pages directly to the free page manager and free the span.
868 : */
869 235822 : if (span->size_class == DSA_SCLASS_SPAN_LARGE)
870 : {
871 :
872 : #ifdef CLOBBER_FREED_MEMORY
873 : memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
874 : #endif
875 :
876 : /* Give pages back to free page manager. */
877 4226 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
878 4226 : FreePageManagerPut(segment_map->fpm,
879 4226 : DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
880 : span->npages);
881 :
882 : /* Move segment to appropriate bin if necessary. */
883 4226 : rebin_segment(area, segment_map);
884 4226 : LWLockRelease(DSA_AREA_LOCK(area));
885 :
886 : /* Unlink span. */
887 4226 : LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
888 : LW_EXCLUSIVE);
889 4226 : unlink_span(area, span);
890 4226 : LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
891 : /* Free the span object so it can be reused. */
892 4226 : dsa_free(area, span_pointer);
893 4226 : return;
894 : }
895 :
896 : #ifdef CLOBBER_FREED_MEMORY
897 : memset(object, 0x7f, size);
898 : #endif
899 :
900 231596 : LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
901 :
902 : /* Put the object on the span's freelist. */
903 : Assert(object >= superblock);
904 : Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
905 : Assert((object - superblock) % size == 0);
906 231596 : NextFreeObjectIndex(object) = span->firstfree;
907 231596 : span->firstfree = (object - superblock) / size;
908 231596 : ++span->nallocatable;
909 :
910 : /*
911 : * See if the span needs to be moved to a different fullness class, or be
912 : * freed so its pages can be given back to the segment.
913 : */
914 231596 : if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
915 : {
916 : /*
917 : * The block was completely full and is located in the
918 : * highest-numbered fullness class, which is never scanned for free
919 : * chunks. We must move it to the next-lower fullness class.
920 : */
921 346 : unlink_span(area, span);
922 346 : add_span_to_fullness_class(area, span, span_pointer,
923 : DSA_FULLNESS_CLASSES - 2);
924 :
925 : /*
926 : * If this is the only span, and there is no active span, then we
927 : * should probably move this span to fullness class 1. (Otherwise if
928 : * you allocate exactly all the objects in the only span, it moves to
929 : * class 3, then you free them all, it moves to 2, and then is given
930 : * back, leaving no active span).
931 : */
932 : }
933 231250 : else if (span->nallocatable == span->nmax &&
934 7664 : (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
935 : {
936 : /*
937 : * This entire block is free, and it's not the active block for this
938 : * size class. Return the memory to the free page manager. We don't
939 : * do this for the active block to prevent hysteresis: if we
940 : * repeatedly allocate and free the only chunk in the active block, it
941 : * will be very inefficient if we deallocate and reallocate the block
942 : * every time.
943 : */
944 18 : destroy_superblock(area, span_pointer);
945 : }
946 :
947 231596 : LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
948 : }
949 :
950 : /*
951 : * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
952 : * memory allocated by the given area (possibly in another process) that
953 : * hasn't yet been freed. This may cause a segment to be mapped into the
954 : * current process if required, and may cause freed segments to be unmapped.
955 : */
956 : void *
957 18121746 : dsa_get_address(dsa_area *area, dsa_pointer dp)
958 : {
959 : dsa_segment_index index;
960 : size_t offset;
961 :
962 : /* Convert InvalidDsaPointer to NULL. */
963 18121746 : if (!DsaPointerIsValid(dp))
964 2774496 : return NULL;
965 :
966 : /* Process any requests to detach from freed segments. */
967 15347250 : check_for_freed_segments(area);
968 :
969 : /* Break the dsa_pointer into its components. */
970 15347250 : index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
971 15347250 : offset = DSA_EXTRACT_OFFSET(dp);
972 : Assert(index < DSA_MAX_SEGMENTS);
973 :
974 : /* Check if we need to cause this segment to be mapped in. */
975 15347250 : if (unlikely(area->segment_maps[index].mapped_address == NULL))
976 : {
977 : /* Call for effect (we don't need the result). */
978 37250 : get_segment_by_index(area, index);
979 : }
980 :
981 15347250 : return area->segment_maps[index].mapped_address + offset;
982 : }
983 :
984 : /*
985 : * Pin this area, so that it will continue to exist even if all backends
986 : * detach from it. In that case, the area can still be reattached if a
987 : * handle has been recorded somewhere.
988 : */
989 : void
990 2262 : dsa_pin(dsa_area *area)
991 : {
992 2262 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
993 2262 : if (area->control->pinned)
994 : {
995 0 : LWLockRelease(DSA_AREA_LOCK(area));
996 0 : elog(ERROR, "dsa_area already pinned");
997 : }
998 2262 : area->control->pinned = true;
999 2262 : ++area->control->refcnt;
1000 2262 : LWLockRelease(DSA_AREA_LOCK(area));
1001 2262 : }
1002 :
1003 : /*
1004 : * Undo the effects of dsa_pin, so that the given area can be freed when no
1005 : * backends are attached to it. May be called only if dsa_pin has been
1006 : * called.
1007 : */
1008 : void
1009 0 : dsa_unpin(dsa_area *area)
1010 : {
1011 0 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1012 : Assert(area->control->refcnt > 1);
1013 0 : if (!area->control->pinned)
1014 : {
1015 0 : LWLockRelease(DSA_AREA_LOCK(area));
1016 0 : elog(ERROR, "dsa_area not pinned");
1017 : }
1018 0 : area->control->pinned = false;
1019 0 : --area->control->refcnt;
1020 0 : LWLockRelease(DSA_AREA_LOCK(area));
1021 0 : }
1022 :
1023 : /*
1024 : * Set the total size limit for this area. This limit is checked whenever new
1025 : * segments need to be allocated from the operating system. If the new size
1026 : * limit is already exceeded, this has no immediate effect.
1027 : *
1028 : * Note that the total virtual memory usage may be temporarily larger than
1029 : * this limit when segments have been freed, but not yet detached by all
1030 : * backends that have attached to them.
1031 : */
1032 : void
1033 4268 : dsa_set_size_limit(dsa_area *area, size_t limit)
1034 : {
1035 4268 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1036 4268 : area->control->max_total_segment_size = limit;
1037 4268 : LWLockRelease(DSA_AREA_LOCK(area));
1038 4268 : }
1039 :
1040 : /* Return the total size of all active segments */
1041 : size_t
1042 2102 : dsa_get_total_size(dsa_area *area)
1043 : {
1044 : size_t size;
1045 :
1046 2102 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1047 2102 : size = area->control->total_segment_size;
1048 2102 : LWLockRelease(DSA_AREA_LOCK(area));
1049 :
1050 2102 : return size;
1051 : }
1052 :
1053 : /*
1054 : * Aggressively free all spare memory in the hope of returning DSM segments to
1055 : * the operating system.
1056 : */
1057 : void
1058 0 : dsa_trim(dsa_area *area)
1059 : {
1060 : int size_class;
1061 :
1062 : /*
1063 : * Trim in reverse pool order so we get to the spans-of-spans last, just
1064 : * in case any become entirely free while processing all the other pools.
1065 : */
1066 0 : for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1067 : {
1068 0 : dsa_area_pool *pool = &area->control->pools[size_class];
1069 : dsa_pointer span_pointer;
1070 :
1071 0 : if (size_class == DSA_SCLASS_SPAN_LARGE)
1072 : {
1073 : /* Large object frees give back segments aggressively already. */
1074 0 : continue;
1075 : }
1076 :
1077 : /*
1078 : * Search fullness class 1 only. That is where we expect to find an
1079 : * entirely empty superblock (entirely empty superblocks in other
1080 : * fullness classes are returned to the free page map by dsa_free).
1081 : */
1082 0 : LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1083 0 : span_pointer = pool->spans[1];
1084 0 : while (DsaPointerIsValid(span_pointer))
1085 : {
1086 0 : dsa_area_span *span = dsa_get_address(area, span_pointer);
1087 0 : dsa_pointer next = span->nextspan;
1088 :
1089 0 : if (span->nallocatable == span->nmax)
1090 0 : destroy_superblock(area, span_pointer);
1091 :
1092 0 : span_pointer = next;
1093 : }
1094 0 : LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1095 : }
1096 0 : }
1097 :
1098 : /*
1099 : * Print out debugging information about the internal state of the shared
1100 : * memory area.
1101 : */
1102 : void
1103 0 : dsa_dump(dsa_area *area)
1104 : {
1105 : size_t i,
1106 : j;
1107 :
1108 : /*
1109 : * Note: This gives an inconsistent snapshot as it acquires and releases
1110 : * individual locks as it goes...
1111 : */
1112 :
1113 0 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1114 0 : check_for_freed_segments_locked(area);
1115 0 : fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1116 0 : fprintf(stderr, " max_total_segment_size: %zu\n",
1117 0 : area->control->max_total_segment_size);
1118 0 : fprintf(stderr, " total_segment_size: %zu\n",
1119 0 : area->control->total_segment_size);
1120 0 : fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1121 0 : fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1122 0 : fprintf(stderr, " segment bins:\n");
1123 0 : for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1124 : {
1125 0 : if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1126 : {
1127 : dsa_segment_index segment_index;
1128 :
1129 0 : if (i == 0)
1130 0 : fprintf(stderr,
1131 : " segment bin %zu (no contiguous free pages):\n", i);
1132 : else
1133 0 : fprintf(stderr,
1134 : " segment bin %zu (at least %d contiguous pages free):\n",
1135 0 : i, 1 << (i - 1));
1136 0 : segment_index = area->control->segment_bins[i];
1137 0 : while (segment_index != DSA_SEGMENT_INDEX_NONE)
1138 : {
1139 : dsa_segment_map *segment_map;
1140 :
1141 : segment_map =
1142 0 : get_segment_by_index(area, segment_index);
1143 :
1144 0 : fprintf(stderr,
1145 : " segment index %zu, usable_pages = %zu, "
1146 : "contiguous_pages = %zu, mapped at %p\n",
1147 : segment_index,
1148 0 : segment_map->header->usable_pages,
1149 0 : fpm_largest(segment_map->fpm),
1150 : segment_map->mapped_address);
1151 0 : segment_index = segment_map->header->next;
1152 : }
1153 : }
1154 : }
1155 0 : LWLockRelease(DSA_AREA_LOCK(area));
1156 :
1157 0 : fprintf(stderr, " pools:\n");
1158 0 : for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1159 : {
1160 0 : bool found = false;
1161 :
1162 0 : LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1163 0 : for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1164 0 : if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1165 0 : found = true;
1166 0 : if (found)
1167 : {
1168 0 : if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1169 0 : fprintf(stderr, " pool for blocks of span objects:\n");
1170 0 : else if (i == DSA_SCLASS_SPAN_LARGE)
1171 0 : fprintf(stderr, " pool for large object spans:\n");
1172 : else
1173 0 : fprintf(stderr,
1174 : " pool for size class %zu (object size %hu bytes):\n",
1175 0 : i, dsa_size_classes[i]);
1176 0 : for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1177 : {
1178 0 : if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1179 0 : fprintf(stderr, " fullness class %zu is empty\n", j);
1180 : else
1181 : {
1182 0 : dsa_pointer span_pointer = area->control->pools[i].spans[j];
1183 :
1184 0 : fprintf(stderr, " fullness class %zu:\n", j);
1185 0 : while (DsaPointerIsValid(span_pointer))
1186 : {
1187 : dsa_area_span *span;
1188 :
1189 0 : span = dsa_get_address(area, span_pointer);
1190 0 : fprintf(stderr,
1191 : " span descriptor at "
1192 : DSA_POINTER_FORMAT ", superblock at "
1193 : DSA_POINTER_FORMAT
1194 : ", pages = %zu, objects free = %hu/%hu\n",
1195 : span_pointer, span->start, span->npages,
1196 0 : span->nallocatable, span->nmax);
1197 0 : span_pointer = span->nextspan;
1198 : }
1199 : }
1200 : }
1201 : }
1202 0 : LWLockRelease(DSA_SCLASS_LOCK(area, i));
1203 : }
1204 0 : }
1205 :
1206 : /*
1207 : * Return the smallest size that you can successfully provide to
1208 : * dsa_create_in_place.
1209 : */
1210 : size_t
1211 3886 : dsa_minimum_size(void)
1212 : {
1213 : size_t size;
1214 3886 : int pages = 0;
1215 :
1216 3886 : size = MAXALIGN(sizeof(dsa_area_control)) +
1217 : MAXALIGN(sizeof(FreePageManager));
1218 :
1219 : /* Figure out how many pages we need, including the page map... */
1220 11658 : while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1221 : {
1222 7772 : ++pages;
1223 7772 : size += sizeof(dsa_pointer);
1224 : }
1225 :
1226 3886 : return pages * FPM_PAGE_SIZE;
1227 : }
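
/*
 * A worked illustration of the loop above, with hypothetical sizes: if the
 * MAXALIGN'd dsa_area_control plus FreePageManager totalled 20000 bytes,
 * with FPM_PAGE_SIZE of 4096 and an 8-byte dsa_pointer, then 4 pages
 * (16384 bytes) could not hold 20000 + 4*8 bytes of metadata, but 5 pages
 * (20480 bytes) can hold 20000 + 5*8 = 20040, so the result would be 20480.
 * The page map grows by one dsa_pointer for every page added, which is why
 * this is computed iteratively rather than by a single division.
 */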
1228 :
1229 : /*
1230 : * Workhorse function for dsa_create and dsa_create_in_place.
1231 : */
1232 : static dsa_area *
1233 3168 : create_internal(void *place, size_t size,
1234 : int tranche_id,
1235 : dsm_handle control_handle,
1236 : dsm_segment *control_segment,
1237 : size_t init_segment_size, size_t max_segment_size)
1238 : {
1239 : dsa_area_control *control;
1240 : dsa_area *area;
1241 : dsa_segment_map *segment_map;
1242 : size_t usable_pages;
1243 : size_t total_pages;
1244 : size_t metadata_bytes;
1245 : int i;
1246 :
1247 : /* Check the initial and maximum segment sizes */
1248 : Assert(init_segment_size >= DSA_MIN_SEGMENT_SIZE);
1249 : Assert(max_segment_size >= init_segment_size);
1250 : Assert(max_segment_size <= DSA_MAX_SEGMENT_SIZE);
1251 :
1252 : /* Sanity check on the space we have to work in. */
1253 3168 : if (size < dsa_minimum_size())
1254 0 : elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1255 : dsa_minimum_size(), size);
1256 :
1257 : /* Now figure out how much space is usable */
1258 3168 : total_pages = size / FPM_PAGE_SIZE;
1259 3168 : metadata_bytes =
1260 : MAXALIGN(sizeof(dsa_area_control)) +
1261 3168 : MAXALIGN(sizeof(FreePageManager)) +
1262 : total_pages * sizeof(dsa_pointer);
1263 : /* Add padding up to next page boundary. */
1264 3168 : if (metadata_bytes % FPM_PAGE_SIZE != 0)
1265 3168 : metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1266 : Assert(metadata_bytes <= size);
1267 3168 : usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1268 :
1269 : /*
1270 : * Initialize the dsa_area_control object located at the start of the
1271 : * space.
1272 : */
1273 3168 : control = (dsa_area_control *) place;
1274 3168 : memset(place, 0, sizeof(*control));
1275 3168 : control->segment_header.magic =
1276 3168 : DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1277 3168 : control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1278 3168 : control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1279 3168 : control->segment_header.usable_pages = usable_pages;
1280 3168 : control->segment_header.freed = false;
1281 3168 : control->segment_header.size = size;
1282 3168 : control->handle = control_handle;
1283 3168 : control->init_segment_size = init_segment_size;
1284 3168 : control->max_segment_size = max_segment_size;
1285 3168 : control->max_total_segment_size = (size_t) -1;
1286 3168 : control->total_segment_size = size;
1287 3168 : control->segment_handles[0] = control_handle;
1288 53856 : for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1289 50688 : control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1290 3168 : control->refcnt = 1;
1291 3168 : control->lwlock_tranche_id = tranche_id;
1292 :
1293 : /*
1294 : * Create the dsa_area object that this backend will use to access the
1295 : * area. Other backends will need to obtain their own dsa_area object by
1296 : * attaching.
1297 : */
1298 3168 : area = palloc(sizeof(dsa_area));
1299 3168 : area->control = control;
1300 3168 : area->resowner = CurrentResourceOwner;
1301 3168 : memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1302 3168 : area->high_segment_index = 0;
1303 3168 : area->freed_segment_counter = 0;
1304 3168 : LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1305 123552 : for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1306 120384 : LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1307 : control->lwlock_tranche_id);
1308 :
1309 : /* Set up the segment map for this process's mapping. */
1310 3168 : segment_map = &area->segment_maps[0];
1311 3168 : segment_map->segment = control_segment;
1312 3168 : segment_map->mapped_address = place;
1313 3168 : segment_map->header = (dsa_segment_header *) place;
1314 3168 : segment_map->fpm = (FreePageManager *)
1315 3168 : (segment_map->mapped_address +
1316 : MAXALIGN(sizeof(dsa_area_control)));
1317 3168 : segment_map->pagemap = (dsa_pointer *)
1318 3168 : (segment_map->mapped_address +
1319 3168 : MAXALIGN(sizeof(dsa_area_control)) +
1320 : MAXALIGN(sizeof(FreePageManager)));
1321 :
1322 : /* Set up the free page map. */
1323 3168 : FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1324 : /* There can be 0 usable pages if size is dsa_minimum_size(). */
1325 :
1326 3168 : if (usable_pages > 0)
1327 2450 : FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1328 : usable_pages);
1329 :
1330 : /* Put this segment into the appropriate bin. */
1331 3168 : control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1332 3168 : segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1333 :
1334 3168 : return area;
1335 : }
1336 :
1337 : /*
1338 : * Workhorse function for dsa_attach and dsa_attach_in_place.
1339 : */
1340 : static dsa_area *
1341 48030 : attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1342 : {
1343 : dsa_area_control *control;
1344 : dsa_area *area;
1345 : dsa_segment_map *segment_map;
1346 :
1347 48030 : control = (dsa_area_control *) place;
1348 : Assert(control->handle == handle);
1349 : Assert(control->segment_handles[0] == handle);
1350 : Assert(control->segment_header.magic ==
1351 : (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1352 :
1353 : /* Build the backend-local area object. */
1354 48030 : area = palloc(sizeof(dsa_area));
1355 48030 : area->control = control;
1356 48030 : area->resowner = CurrentResourceOwner;
1357 48030 : memset(&area->segment_maps[0], 0,
1358 : sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1359 48030 : area->high_segment_index = 0;
1360 :
1361 : /* Set up the segment map for this process's mapping. */
1362 48030 : segment_map = &area->segment_maps[0];
1363 48030 : segment_map->segment = segment; /* NULL for in-place */
1364 48030 : segment_map->mapped_address = place;
1365 48030 : segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1366 48030 : segment_map->fpm = (FreePageManager *)
1367 48030 : (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1368 48030 : segment_map->pagemap = (dsa_pointer *)
1369 48030 : (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1370 : MAXALIGN(sizeof(FreePageManager)));
1371 :
1372 : /* Bump the reference count. */
1373 48030 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1374 48030 : if (control->refcnt == 0)
1375 : {
1376 : /* We can't attach to a DSA area that has already been destroyed. */
1377 0 : ereport(ERROR,
1378 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1379 : errmsg("could not attach to dynamic shared area")));
1380 : }
1381 48030 : ++control->refcnt;
1382 48030 : area->freed_segment_counter = area->control->freed_segment_counter;
1383 48030 : LWLockRelease(DSA_AREA_LOCK(area));
1384 :
1385 48030 : return area;
1386 : }
1387 :
1388 : /*
1389 : * Add a new span to fullness class 1 of the indicated pool.
1390 : */
1391 : static void
1392 25906 : init_span(dsa_area *area,
1393 : dsa_pointer span_pointer,
1394 : dsa_area_pool *pool, dsa_pointer start, size_t npages,
1395 : uint16 size_class)
1396 : {
1397 25906 : dsa_area_span *span = dsa_get_address(area, span_pointer);
1398 25906 : size_t obsize = dsa_size_classes[size_class];
1399 :
1400 : /*
1401 : * The per-pool lock must be held because we manipulate the span list for
1402 : * this pool.
1403 : */
1404 : Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1405 :
1406 : /* Push this span onto the front of the span list for fullness class 1. */
1407 25906 : if (DsaPointerIsValid(pool->spans[1]))
1408 : {
1409 : dsa_area_span *head = (dsa_area_span *)
1410 3950 : dsa_get_address(area, pool->spans[1]);
1411 :
1412 3950 : head->prevspan = span_pointer;
1413 : }
1414 25906 : span->pool = DsaAreaPoolToDsaPointer(area, pool);
1415 25906 : span->nextspan = pool->spans[1];
1416 25906 : span->prevspan = InvalidDsaPointer;
1417 25906 : pool->spans[1] = span_pointer;
1418 :
1419 25906 : span->start = start;
1420 25906 : span->npages = npages;
1421 25906 : span->size_class = size_class;
1422 25906 : span->ninitialized = 0;
1423 25906 : if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1424 : {
1425 : /*
1426 : * A block-of-spans contains its own descriptor, so mark one object as
1427 : * initialized and reduce the count of allocatable objects by one.
1428 : * Doing this here has the side effect of also reducing nmax by one,
1429 : * which is important to make sure we free this object at the correct
1430 : * time.
1431 : */
1432 2616 : span->ninitialized = 1;
1433 2616 : span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1434 : }
1435 23290 : else if (size_class != DSA_SCLASS_SPAN_LARGE)
1436 17790 : span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1437 25906 : span->firstfree = DSA_SPAN_NOTHING_FREE;
1438 25906 : span->nmax = span->nallocatable;
1439 25906 : span->fclass = 1;
1440 25906 : }
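/*
 * Worked example of the capacity arithmetic above: with 4KB pages and
 * 16-page (64KB) superblocks, a 64-byte size class yields nallocatable =
 * 65536 / 64 = 1024 objects per span, while a block-of-spans (one 4KB
 * page of dsa_area_span objects) yields 4096 / sizeof(dsa_area_span) - 1,
 * the missing object being the span that describes the block itself.
 */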
1441 :
1442 : /*
1443 : * Transfer the first span in one fullness class to the head of another
1444 : * fullness class.
1445 : */
1446 : static bool
1447 43102 : transfer_first_span(dsa_area *area,
1448 : dsa_area_pool *pool, int fromclass, int toclass)
1449 : {
1450 : dsa_pointer span_pointer;
1451 : dsa_area_span *span;
1452 : dsa_area_span *nextspan;
1453 :
1454 : /* Can't do it if source list is empty. */
1455 43102 : span_pointer = pool->spans[fromclass];
1456 43102 : if (!DsaPointerIsValid(span_pointer))
1457 40816 : return false;
1458 :
1459 : /* Remove span from head of source list. */
1460 2286 : span = dsa_get_address(area, span_pointer);
1461 2286 : pool->spans[fromclass] = span->nextspan;
1462 2286 : if (DsaPointerIsValid(span->nextspan))
1463 : {
1464 : nextspan = (dsa_area_span *)
1465 148 : dsa_get_address(area, span->nextspan);
1466 148 : nextspan->prevspan = InvalidDsaPointer;
1467 : }
1468 :
1469 : /* Add span to head of target list. */
1470 2286 : span->nextspan = pool->spans[toclass];
1471 2286 : pool->spans[toclass] = span_pointer;
1472 2286 : if (DsaPointerIsValid(span->nextspan))
1473 : {
1474 : nextspan = (dsa_area_span *)
1475 772 : dsa_get_address(area, span->nextspan);
1476 772 : nextspan->prevspan = span_pointer;
1477 : }
1478 2286 : span->fclass = toclass;
1479 :
1480 2286 : return true;
1481 : }
1482 :
1483 : /*
1484 : * Allocate one object of the requested size class from the given area.
1485 : */
1486 : static inline dsa_pointer
1487 1303530 : alloc_object(dsa_area *area, int size_class)
1488 : {
1489 1303530 : dsa_area_pool *pool = &area->control->pools[size_class];
1490 : dsa_area_span *span;
1491 : dsa_pointer block;
1492 : dsa_pointer result;
1493 : char *object;
1494 : size_t size;
1495 :
1496 : /*
1497 : * Even though ensure_active_superblock can in turn call alloc_object if
1498 : * it needs to allocate a new span, that's always from a different pool,
1499 : * and the order of lock acquisition is always the same, so it's OK that
1500 : * we hold this lock for the duration of this function.
1501 : */
1502 : Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1503 1303530 : LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1504 :
1505 : /*
1506 : * If there's no active superblock, we must successfully obtain one or
1507 : * fail the request.
1508 : */
1509 1303530 : if (!DsaPointerIsValid(pool->spans[1]) &&
1510 20622 : !ensure_active_superblock(area, pool, size_class))
1511 : {
1512 0 : result = InvalidDsaPointer;
1513 : }
1514 : else
1515 : {
1516 : /*
1517 : * There should be a block in fullness class 1 at this point, and it
1518 : * should never be completely full. Thus we can either pop an object
1519 : * from the free list or, failing that, initialize a new object.
1520 : */
1521 : Assert(DsaPointerIsValid(pool->spans[1]));
1522 : span = (dsa_area_span *)
1523 1303530 : dsa_get_address(area, pool->spans[1]);
1524 : Assert(span->nallocatable > 0);
1525 1303530 : block = span->start;
1526 : Assert(size_class < DSA_NUM_SIZE_CLASSES);
1527 1303530 : size = dsa_size_classes[size_class];
1528 1303530 : if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1529 : {
1530 193928 : result = block + span->firstfree * size;
1531 193928 : object = dsa_get_address(area, result);
1532 193928 : span->firstfree = NextFreeObjectIndex(object);
1533 : }
1534 : else
1535 : {
1536 1109602 : result = block + span->ninitialized * size;
1537 1109602 : ++span->ninitialized;
1538 : }
1539 1303530 : --span->nallocatable;
1540 :
1541 : /* If it's now full, move it to the highest-numbered fullness class. */
1542 1303530 : if (span->nallocatable == 0)
1543 2074 : transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1544 : }
1545 :
1546 : Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1547 1303530 : LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1548 :
1549 1303530 : return result;
1550 : }
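/*
 * Illustrative sketch of the intrusive freelist popped above, assuming
 * (as the use of NextFreeObjectIndex suggests) that each free object's
 * first two bytes hold the index of the next free object.  Freeing an
 * object is then roughly the mirror image of the pop:
 */
#ifdef NOT_USED
static void
sketch_push_free_object(dsa_area_span *span, char *block, size_t size,
						uint16 object_index)
{
	char	   *object = block + (size_t) object_index * size;

	/* Link the freed object ahead of the current freelist head. */
	*(uint16 *) object = span->firstfree;
	span->firstfree = object_index;
	++span->nallocatable;
}
#endif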
1551 :
1552 : /*
1553 : * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1554 : * superblocks are completely full and no more can be allocated.
1555 : *
1556 : * Fullness class K, for K in 0..N, is loosely intended to contain blocks
1557 : * whose utilization is at least K/N, but we only enforce this rigorously
1558 : * for the highest-numbered fullness class, which always contains exactly
1559 : * those blocks that are completely full. It's otherwise acceptable for a
1560 : * block to be in a higher-numbered fullness class than the one to which it
1561 : * logically belongs. In addition, the active block, which is always the
1562 : * first block in fullness class 1, is permitted to have a higher allocation
1563 : * percentage than would normally be allowable for that fullness class; we
1564 : * don't move it until it's completely full, and then it goes to the
1565 : * highest-numbered fullness class.
1566 : *
1567 : * It might seem odd that the active block is the head of fullness class 1
1568 : * rather than fullness class 0, but experience with other allocators has
1569 : * shown that it's usually better to allocate from a block that's moderately
1570 : * full rather than one that's nearly empty. Insofar as is reasonably
1571 : * possible, we want to avoid performing new allocations in a block that would
1572 : * otherwise become empty soon.
1573 : */
1574 : static bool
1575 20622 : ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1576 : int size_class)
1577 : {
1578 : dsa_pointer span_pointer;
1579 : dsa_pointer start_pointer;
1580 20622 : size_t obsize = dsa_size_classes[size_class];
1581 : size_t nmax;
1582 : int fclass;
1583 20622 : size_t npages = 1;
1584 : size_t first_page;
1585 : size_t i;
1586 : dsa_segment_map *segment_map;
1587 :
1588 : Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1589 :
1590 : /*
1591 : * Compute the number of objects that will fit in a block of this size
1592 : * class. Span-of-spans blocks are just a single page, and the first
1593 : * object isn't available for use because it describes the block-of-spans
1594 : * itself.
1595 : */
1596 20622 : if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1597 2616 : nmax = FPM_PAGE_SIZE / obsize - 1;
1598 : else
1599 18006 : nmax = DSA_SUPERBLOCK_SIZE / obsize;
1600 :
1601 : /*
1602 : * If fullness class 1 is empty, try to find a span to put in it by
1603 : * scanning higher-numbered fullness classes (excluding the last one,
1604 : * whose blocks are certain to all be completely full).
1605 : */
1606 41240 : for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1607 : {
1608 20622 : span_pointer = pool->spans[fclass];
1609 :
1610 21044 : while (DsaPointerIsValid(span_pointer))
1611 : {
1612 : int tfclass;
1613 : dsa_area_span *span;
1614 : dsa_area_span *nextspan;
1615 : dsa_area_span *prevspan;
1616 : dsa_pointer next_span_pointer;
1617 :
1618 : span = (dsa_area_span *)
1619 422 : dsa_get_address(area, span_pointer);
1620 422 : next_span_pointer = span->nextspan;
1621 :
1622 : /* Figure out what fullness class should contain this span. */
1623 422 : tfclass = (nmax - span->nallocatable)
1624 422 : * (DSA_FULLNESS_CLASSES - 1) / nmax;
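/*
 * For example, with DSA_FULLNESS_CLASSES = 4 and nmax = 1024, a span
 * that still has 256 allocatable objects maps to (1024 - 256) * 3 /
 * 1024 = 2, i.e. the class for blocks at least about two-thirds full.
 */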
1625 :
1626 : /* Look up next span. */
1627 422 : if (DsaPointerIsValid(span->nextspan))
1628 : nextspan = (dsa_area_span *)
1629 208 : dsa_get_address(area, span->nextspan);
1630 : else
1631 214 : nextspan = NULL;
1632 :
1633 : /*
1634 : * If utilization has dropped enough that this now belongs in some
1635 : * other fullness class, move it there.
1636 : */
1637 422 : if (tfclass < fclass)
1638 : {
1639 : /* Remove from the current fullness class list. */
1640 8 : if (pool->spans[fclass] == span_pointer)
1641 : {
1642 : /* It was the head; remove it. */
1643 : Assert(!DsaPointerIsValid(span->prevspan));
1644 8 : pool->spans[fclass] = span->nextspan;
1645 8 : if (nextspan != NULL)
1646 2 : nextspan->prevspan = InvalidDsaPointer;
1647 : }
1648 : else
1649 : {
1650 : /* It was not the head. */
1651 : Assert(DsaPointerIsValid(span->prevspan));
1652 : prevspan = (dsa_area_span *)
1653 0 : dsa_get_address(area, span->prevspan);
1654 0 : prevspan->nextspan = span->nextspan;
1655 : }
1656 8 : if (nextspan != NULL)
1657 2 : nextspan->prevspan = span->prevspan;
1658 :
1659 : /* Push onto the head of the new fullness class list. */
1660 8 : span->nextspan = pool->spans[tfclass];
1661 8 : pool->spans[tfclass] = span_pointer;
1662 8 : span->prevspan = InvalidDsaPointer;
1663 8 : if (DsaPointerIsValid(span->nextspan))
1664 : {
1665 : nextspan = (dsa_area_span *)
1666 2 : dsa_get_address(area, span->nextspan);
1667 2 : nextspan->prevspan = span_pointer;
1668 : }
1669 8 : span->fclass = tfclass;
1670 : }
1671 :
1672 : /* Advance to next span on list. */
1673 422 : span_pointer = next_span_pointer;
1674 : }
1675 :
1676 : /* Stop now if we found a suitable block. */
1677 20622 : if (DsaPointerIsValid(pool->spans[1]))
1678 4 : return true;
1679 : }
1680 :
1681 : /*
1682 : * If there are no blocks that properly belong in fullness class 1, pick
1683 : * one from some other fullness class and move it there anyway, so that we
1684 : * have an allocation target. Our last choice is to transfer a block
1685 : * that's almost empty (and might become completely empty soon if left
1686 : * alone), but even that is better than failing, which is what we must do
1687 : * if there are no blocks at all with free space.
1688 : */
1689 : Assert(!DsaPointerIsValid(pool->spans[1]));
1690 41028 : for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1691 20618 : if (transfer_first_span(area, pool, fclass, 1))
1692 208 : return true;
1693 40820 : if (!DsaPointerIsValid(pool->spans[1]) &&
1694 20410 : transfer_first_span(area, pool, 0, 1))
1695 4 : return true;
1696 :
1697 : /*
1698 : * We failed to find an existing span with free objects, so we need to
1699 : * allocate a new superblock and construct a new span to manage it.
1700 : *
1701 : * First, get a dsa_area_span object to describe the new superblock
1702 : * ... unless this allocation is for a dsa_area_span object, in which case
1703 : * that's surely not going to work. We handle that case by storing the
1704 : * span describing a block-of-spans inline.
1705 : */
1706 20406 : if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1707 : {
1708 17790 : span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1709 17790 : if (!DsaPointerIsValid(span_pointer))
1710 0 : return false;
1711 17790 : npages = DSA_PAGES_PER_SUPERBLOCK;
1712 : }
1713 :
1714 : /* Find or create a segment and allocate the superblock. */
1715 20406 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1716 20406 : segment_map = get_best_segment(area, npages);
1717 20406 : if (segment_map == NULL)
1718 : {
1719 1988 : segment_map = make_new_segment(area, npages);
1720 1988 : if (segment_map == NULL)
1721 : {
1722 0 : LWLockRelease(DSA_AREA_LOCK(area));
1723 0 : return false;
1724 : }
1725 : }
1726 :
1727 : /*
1728 : * This shouldn't happen: get_best_segment() or make_new_segment()
1729 : * promised that we can successfully allocate npages.
1730 : */
1731 20406 : if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1732 0 : elog(FATAL,
1733 : "dsa_allocate could not find %zu free pages for superblock",
1734 : npages);
1735 20406 : LWLockRelease(DSA_AREA_LOCK(area));
1736 :
1737 : /* Compute the start of the superblock. */
1738 20406 : start_pointer =
1739 20406 : DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1740 : first_page * FPM_PAGE_SIZE);
1741 :
1742 : /*
1743 : * If this is a block-of-spans, carve the descriptor right out of the
1744 : * allocated space.
1745 : */
1746 20406 : if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1747 : {
1748 : /*
1749 : * We have a pointer into the segment. We need to build a dsa_pointer
1750 : * from the segment index and offset into the segment.
1751 : */
1752 2616 : span_pointer = start_pointer;
1753 : }
1754 :
1755 : /* Initialize span and pagemap. */
1756 20406 : init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1757 307662 : for (i = 0; i < npages; ++i)
1758 287256 : segment_map->pagemap[first_page + i] = span_pointer;
1759 :
1760 20406 : return true;
1761 : }
1762 :
1763 : /*
1764 : * Return the segment map corresponding to a given segment index, mapping the
1765 : * segment in if necessary. For internal segment book-keeping, this is called
1766 : * with the area lock held. It is also called by dsa_free and dsa_get_address
1767 : * without any locking, relying on the fact that they have a known live
1768 : * segment index and always call check_for_freed_segments to ensure that any
1769 : * freed segment occupying the same slot is detached first.
1770 : */
1771 : static dsa_segment_map *
1772 298862 : get_segment_by_index(dsa_area *area, dsa_segment_index index)
1773 : {
1774 298862 : if (unlikely(area->segment_maps[index].mapped_address == NULL))
1775 : {
1776 : dsm_handle handle;
1777 : dsm_segment *segment;
1778 : dsa_segment_map *segment_map;
1779 : ResourceOwner oldowner;
1780 :
1781 : /*
1782 : * If we are reached by dsa_free or dsa_get_address, there must be at
1783 : * least one object allocated in the referenced segment. Otherwise,
1784 : * their caller has a double-free or access-after-free bug, which we
1785 : * have no hope of detecting. So we know it's safe to access this
1786 : * array slot without holding a lock; it won't change underneath us.
1787 : * Furthermore, we know that we can see the latest contents of the
1788 : * slot, as explained in check_for_freed_segments, which those
1789 : * functions call before arriving here.
1790 : */
1791 37384 : handle = area->control->segment_handles[index];
1792 :
1793 : /* It's an error to try to access an unused slot. */
1794 37384 : if (handle == DSM_HANDLE_INVALID)
1795 0 : elog(ERROR,
1796 : "dsa_area could not attach to a segment that has been freed");
1797 :
1798 37384 : oldowner = CurrentResourceOwner;
1799 37384 : CurrentResourceOwner = area->resowner;
1800 37384 : segment = dsm_attach(handle);
1801 37384 : CurrentResourceOwner = oldowner;
1802 37384 : if (segment == NULL)
1803 0 : elog(ERROR, "dsa_area could not attach to segment");
1804 37384 : segment_map = &area->segment_maps[index];
1805 37384 : segment_map->segment = segment;
1806 37384 : segment_map->mapped_address = dsm_segment_address(segment);
1807 37384 : segment_map->header =
1808 37384 : (dsa_segment_header *) segment_map->mapped_address;
1809 37384 : segment_map->fpm = (FreePageManager *)
1810 37384 : (segment_map->mapped_address +
1811 : MAXALIGN(sizeof(dsa_segment_header)));
1812 37384 : segment_map->pagemap = (dsa_pointer *)
1813 37384 : (segment_map->mapped_address +
1814 37384 : MAXALIGN(sizeof(dsa_segment_header)) +
1815 : MAXALIGN(sizeof(FreePageManager)));
1816 :
1817 : /* Remember the highest index this backend has ever mapped. */
1818 37384 : if (area->high_segment_index < index)
1819 37144 : area->high_segment_index = index;
1820 :
1821 : Assert(segment_map->header->magic ==
1822 : (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1823 : }
1824 :
1825 : /*
1826 : * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1827 : * but it's a bug in the calling code and undefined behavior if the
1828 : * address is not live (i.e. if the segment might have been freed,
1829 : * they're trying to use a dangling pointer).
1830 : *
1831 : * For dsa.c code that holds the area lock to manipulate segment_bins
1832 : * lists, it would be a bug if we ever reach a freed segment here. After
1833 : * it's marked as freed, the only thing any backend should do with it is
1834 : * unmap it, and it should always have done that in
1835 : * check_for_freed_segments_locked() before arriving here to resolve an
1836 : * index to a segment_map.
1837 : *
1838 : * Either way we can assert that we aren't returning a freed segment.
1839 : */
1840 : Assert(!area->segment_maps[index].header->freed);
1841 :
1842 298862 : return &area->segment_maps[index];
1843 : }
1844 :
1845 : /*
1846 : * Return a superblock to the free page manager. If the underlying segment
1847 : * has become entirely free, then return it to the operating system.
1848 : *
1849 : * The appropriate pool lock must be held.
1850 : */
1851 : static void
1852 18 : destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1853 : {
1854 18 : dsa_area_span *span = dsa_get_address(area, span_pointer);
1855 18 : int size_class = span->size_class;
1856 : dsa_segment_map *segment_map;
1857 :
1858 :
1859 : /* Remove it from its fullness class list. */
1860 18 : unlink_span(area, span);
1861 :
1862 : /*
1863 : * Note: Here we acquire the area lock while we already hold a per-pool
1864 : * lock. We never hold the area lock and then take a pool lock, or we
1865 : * could deadlock.
1866 : */
1867 18 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1868 18 : check_for_freed_segments_locked(area);
1869 : segment_map =
1870 18 : get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1871 18 : FreePageManagerPut(segment_map->fpm,
1872 18 : DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1873 : span->npages);
1874 : /* Check if the segment is now entirely free. */
1875 18 : if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1876 : {
1877 0 : dsa_segment_index index = get_segment_index(area, segment_map);
1878 :
1879 : /* If it's not the segment with extra control data, free it. */
1880 0 : if (index != 0)
1881 : {
1882 : /*
1883 : * Give it back to the OS, and allow other backends to detect that
1884 : * they need to detach.
1885 : */
1886 0 : unlink_segment(area, segment_map);
1887 0 : segment_map->header->freed = true;
1888 : Assert(area->control->total_segment_size >=
1889 : segment_map->header->size);
1890 0 : area->control->total_segment_size -=
1891 0 : segment_map->header->size;
1892 0 : dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1893 0 : dsm_detach(segment_map->segment);
1894 0 : area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1895 0 : ++area->control->freed_segment_counter;
1896 0 : segment_map->segment = NULL;
1897 0 : segment_map->header = NULL;
1898 0 : segment_map->mapped_address = NULL;
1899 : }
1900 : }
1901 :
1902 : /* Move segment to appropriate bin if necessary. */
1903 18 : if (segment_map->header != NULL)
1904 18 : rebin_segment(area, segment_map);
1905 :
1906 18 : LWLockRelease(DSA_AREA_LOCK(area));
1907 :
1908 : /*
1909 : * Span-of-spans blocks store the span which describes them within the
1910 : * block itself, so freeing the storage implicitly frees the descriptor
1911 : * also. If this is a block of any other type, we must free the span
1912 : * object separately. This recursive call to dsa_free will acquire the
1913 : * span pool's lock. We can't deadlock because the acquisition order is
1914 : * always some other pool and then the span pool.
1915 : */
1916 18 : if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1917 18 : dsa_free(area, span_pointer);
1918 18 : }
1919 :
1920 : static void
1921 4590 : unlink_span(dsa_area *area, dsa_area_span *span)
1922 : {
1923 4590 : if (DsaPointerIsValid(span->nextspan))
1924 : {
1925 3700 : dsa_area_span *next = dsa_get_address(area, span->nextspan);
1926 :
1927 3700 : next->prevspan = span->prevspan;
1928 : }
1929 4590 : if (DsaPointerIsValid(span->prevspan))
1930 : {
1931 2212 : dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1932 :
1933 2212 : prev->nextspan = span->nextspan;
1934 : }
1935 : else
1936 : {
1937 2378 : dsa_area_pool *pool = dsa_get_address(area, span->pool);
1938 :
1939 2378 : pool->spans[span->fclass] = span->nextspan;
1940 : }
1941 4590 : }
1942 :
1943 : static void
1944 346 : add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1945 : dsa_pointer span_pointer,
1946 : int fclass)
1947 : {
1948 346 : dsa_area_pool *pool = dsa_get_address(area, span->pool);
1949 :
1950 346 : if (DsaPointerIsValid(pool->spans[fclass]))
1951 : {
1952 178 : dsa_area_span *head = dsa_get_address(area,
1953 : pool->spans[fclass]);
1954 :
1955 178 : head->prevspan = span_pointer;
1956 : }
1957 346 : span->prevspan = InvalidDsaPointer;
1958 346 : span->nextspan = pool->spans[fclass];
1959 346 : pool->spans[fclass] = span_pointer;
1960 346 : span->fclass = fclass;
1961 346 : }
1962 :
1963 : /*
1964 : * Detach from an area that was either created or attached to by this process.
1965 : */
1966 : void
1967 50610 : dsa_detach(dsa_area *area)
1968 : {
1969 : int i;
1970 :
1971 : /* Detach from all segments. */
1972 140624 : for (i = 0; i <= area->high_segment_index; ++i)
1973 90014 : if (area->segment_maps[i].segment != NULL)
1974 39474 : dsm_detach(area->segment_maps[i].segment);
1975 :
1976 : /*
1977 : * Note that 'detaching' (= detaching from DSM segments) doesn't include
1978 : * 'releasing' (= adjusting the reference count). It would be nice to
1979 : * combine these operations, but client code might never get around to
1980 : * calling dsa_detach because of an error path, and a detach hook on any
1981 : * particular segment is too late to detach other segments in the area
1982 : * without risking a 'leak' warning in the non-error path.
1983 : */
1984 :
1985 : /* Free the backend-local area object. */
1986 50610 : pfree(area);
1987 50610 : }
1988 :
1989 : /*
1990 : * Unlink a segment from the bin that contains it.
1991 : */
1992 : static void
1993 4540 : unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1994 : {
1995 4540 : if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1996 : {
1997 : dsa_segment_map *prev;
1998 :
1999 2 : prev = get_segment_by_index(area, segment_map->header->prev);
2000 2 : prev->header->next = segment_map->header->next;
2001 : }
2002 : else
2003 : {
2004 : Assert(area->control->segment_bins[segment_map->header->bin] ==
2005 : get_segment_index(area, segment_map));
2006 4538 : area->control->segment_bins[segment_map->header->bin] =
2007 4538 : segment_map->header->next;
2008 : }
2009 4540 : if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2010 : {
2011 : dsa_segment_map *next;
2012 :
2013 0 : next = get_segment_by_index(area, segment_map->header->next);
2014 0 : next->header->prev = segment_map->header->prev;
2015 : }
2016 4540 : }
2017 :
2018 : /*
2019 : * Find a segment that could satisfy a request for 'npages' of contiguous
2020 : * memory, or return NULL if none can be found. This may involve attaching to
2021 : * segments that weren't previously attached so that we can query their free
2022 : * pages map.
2023 : */
2024 : static dsa_segment_map *
2025 25906 : get_best_segment(dsa_area *area, size_t npages)
2026 : {
2027 : size_t bin;
2028 :
2029 : Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2030 25906 : check_for_freed_segments_locked(area);
2031 :
2032 : /*
2033 : * Start searching from the first bin that *might* have enough contiguous
2034 : * pages.
2035 : */
2036 25906 : for (bin = contiguous_pages_to_segment_bin(npages);
2037 112788 : bin < DSA_NUM_SEGMENT_BINS;
2038 86882 : ++bin)
2039 : {
2040 : /*
2041 : * The minimum number of contiguous free pages that any segment in
2042 : * this bin should have. We'll re-bin any segment we find with fewer.
2043 : */
2044 110754 : size_t threshold = (size_t) 1 << (bin - 1);
2045 : dsa_segment_index segment_index;
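/*
 * For example, assuming contiguous_pages_to_segment_bin() bins a
 * segment by the position of the leftmost one bit of its largest free
 * run, bin 4 holds segments whose largest run is 8..15 pages, so its
 * threshold here is 1 << 3 = 8 pages.
 */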
2046 :
2047 : /* Search this bin for a segment with enough contiguous space. */
2048 110754 : segment_index = area->control->segment_bins[bin];
2049 112626 : while (segment_index != DSA_SEGMENT_INDEX_NONE)
2050 : {
2051 : dsa_segment_map *segment_map;
2052 : dsa_segment_index next_segment_index;
2053 : size_t contiguous_pages;
2054 :
2055 25744 : segment_map = get_segment_by_index(area, segment_index);
2056 25744 : next_segment_index = segment_map->header->next;
2057 25744 : contiguous_pages = fpm_largest(segment_map->fpm);
2058 :
2059 : /* Not enough for the request, still enough for this bin. */
2060 25744 : if (contiguous_pages >= threshold && contiguous_pages < npages)
2061 : {
2062 0 : segment_index = next_segment_index;
2063 0 : continue;
2064 : }
2065 :
2066 : /* Re-bin it if it's no longer in the appropriate bin. */
2067 25744 : if (contiguous_pages < threshold)
2068 : {
2069 4160 : rebin_segment(area, segment_map);
2070 :
2071 : /*
2072 : * But fall through to see if it's enough to satisfy this
2073 : * request anyway....
2074 : */
2075 : }
2076 :
2077 : /* Check if we are done. */
2078 25744 : if (contiguous_pages >= npages)
2079 23872 : return segment_map;
2080 :
2081 : /* Continue searching the same bin. */
2082 1872 : segment_index = next_segment_index;
2083 : }
2084 : }
2085 :
2086 : /* Not found. */
2087 2034 : return NULL;
2088 : }
2089 :
2090 : /*
2091 : * Create a new segment that can handle at least requested_pages. Returns
2092 : * NULL if the area's total size limit or the maximum allowed number of
2093 : * segments would be exceeded.
2094 : */
2095 : static dsa_segment_map *
2096 2034 : make_new_segment(dsa_area *area, size_t requested_pages)
2097 : {
2098 : dsa_segment_index new_index;
2099 : size_t metadata_bytes;
2100 : size_t total_size;
2101 : size_t total_pages;
2102 : size_t usable_pages;
2103 : dsa_segment_map *segment_map;
2104 : dsm_segment *segment;
2105 : ResourceOwner oldowner;
2106 :
2107 : Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2108 :
2109 : /* Find a segment slot that is not in use (linearly for now). */
2110 2114 : for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2111 : {
2112 2114 : if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2113 2034 : break;
2114 : }
2115 2034 : if (new_index == DSA_MAX_SEGMENTS)
2116 0 : return NULL;
2117 :
2118 : /*
2119 : * If the total size limit is already exceeded, then we exit early and
2120 : * avoid arithmetic wraparound in the unsigned expressions below.
2121 : */
2122 2034 : if (area->control->total_segment_size >=
2123 2034 : area->control->max_total_segment_size)
2124 0 : return NULL;
2125 :
2126 : /*
2127 : * The size should be at least as big as requested, and at least big
2128 : * enough to follow a geometric series that approximately doubles the
2129 : * total storage each time we create a new segment. We use geometric
2130 : * growth because the underlying DSM system isn't designed for large
2131 : * numbers of segments (otherwise we might even consider just using one
2132 : * DSM segment for each large allocation and for each superblock, and then
2133 : * we wouldn't need to use FreePageManager).
2134 : *
2135 : * We decide on a total segment size first, so that we produce tidy
2136 : * power-of-two sized segments. This is a good property to have if we
2137 : * move to huge pages in the future. Then we work back to the number of
2138 : * pages we can fit.
2139 : */
2140 2034 : total_size = area->control->init_segment_size *
2141 2034 : ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
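/*
 * For example, if init_segment_size were 1MB with
 * DSA_NUM_SEGMENTS_AT_EACH_SIZE = 2, segment indexes 1..6 would come out
 * at 1MB, 2MB, 2MB, 4MB, 4MB and 8MB: total space roughly doubles every
 * two segments, subject to the caps applied below.
 */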
2142 2034 : total_size = Min(total_size, area->control->max_segment_size);
2143 2034 : total_size = Min(total_size,
2144 : area->control->max_total_segment_size -
2145 : area->control->total_segment_size);
2146 :
2147 2034 : total_pages = total_size / FPM_PAGE_SIZE;
2148 2034 : metadata_bytes =
2149 : MAXALIGN(sizeof(dsa_segment_header)) +
2150 2034 : MAXALIGN(sizeof(FreePageManager)) +
2151 : sizeof(dsa_pointer) * total_pages;
2152 :
2153 : /* Add padding up to next page boundary. */
2154 2034 : if (metadata_bytes % FPM_PAGE_SIZE != 0)
2155 2034 : metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2156 2034 : if (total_size <= metadata_bytes)
2157 0 : return NULL;
2158 2034 : usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2159 : Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2160 :
2161 : /* See if that is enough... */
2162 2034 : if (requested_pages > usable_pages)
2163 : {
2164 : /*
2165 : * We'll make an odd-sized segment, working forward from the requested
2166 : * number of pages.
2167 : */
2168 0 : usable_pages = requested_pages;
2169 0 : metadata_bytes =
2170 : MAXALIGN(sizeof(dsa_segment_header)) +
2171 0 : MAXALIGN(sizeof(FreePageManager)) +
2172 : usable_pages * sizeof(dsa_pointer);
2173 :
2174 : /* Add padding up to next page boundary. */
2175 0 : if (metadata_bytes % FPM_PAGE_SIZE != 0)
2176 0 : metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2177 0 : total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2178 :
2179 : /* Is that too large for dsa_pointer's addressing scheme? */
2180 0 : if (total_size > DSA_MAX_SEGMENT_SIZE)
2181 0 : return NULL;
2182 :
2183 : /* Would that exceed the limit? */
2184 0 : if (total_size > area->control->max_total_segment_size -
2185 0 : area->control->total_segment_size)
2186 0 : return NULL;
2187 : }
2188 :
2189 : /* Create the segment. */
2190 2034 : oldowner = CurrentResourceOwner;
2191 2034 : CurrentResourceOwner = area->resowner;
2192 2034 : segment = dsm_create(total_size, 0);
2193 2034 : CurrentResourceOwner = oldowner;
2194 2034 : if (segment == NULL)
2195 0 : return NULL;
2196 2034 : dsm_pin_segment(segment);
2197 :
2198 : /* Store the handle in shared memory to be found by index. */
2199 4068 : area->control->segment_handles[new_index] =
2200 2034 : dsm_segment_handle(segment);
2201 : /* Track the highest segment index in the history of the area. */
2202 2034 : if (area->control->high_segment_index < new_index)
2203 2034 : area->control->high_segment_index = new_index;
2204 : /* Track the highest segment index this backend has ever mapped. */
2205 2034 : if (area->high_segment_index < new_index)
2206 2034 : area->high_segment_index = new_index;
2207 : /* Track total size of all segments. */
2208 2034 : area->control->total_segment_size += total_size;
2209 : Assert(area->control->total_segment_size <=
2210 : area->control->max_total_segment_size);
2211 :
2212 : /* Build a segment map for this segment in this backend. */
2213 2034 : segment_map = &area->segment_maps[new_index];
2214 2034 : segment_map->segment = segment;
2215 2034 : segment_map->mapped_address = dsm_segment_address(segment);
2216 2034 : segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2217 2034 : segment_map->fpm = (FreePageManager *)
2218 2034 : (segment_map->mapped_address +
2219 : MAXALIGN(sizeof(dsa_segment_header)));
2220 2034 : segment_map->pagemap = (dsa_pointer *)
2221 2034 : (segment_map->mapped_address +
2222 2034 : MAXALIGN(sizeof(dsa_segment_header)) +
2223 : MAXALIGN(sizeof(FreePageManager)));
2224 :
2225 : /* Set up the free page map. */
2226 2034 : FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2227 2034 : FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2228 : usable_pages);
2229 :
2230 : /* Set up the segment header and put it in the appropriate bin. */
2231 2034 : segment_map->header->magic =
2232 2034 : DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2233 2034 : segment_map->header->usable_pages = usable_pages;
2234 2034 : segment_map->header->size = total_size;
2235 2034 : segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2236 2034 : segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2237 2034 : segment_map->header->next =
2238 2034 : area->control->segment_bins[segment_map->header->bin];
2239 2034 : segment_map->header->freed = false;
2240 2034 : area->control->segment_bins[segment_map->header->bin] = new_index;
2241 2034 : if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2242 : {
2243 : dsa_segment_map *next =
2244 0 : get_segment_by_index(area, segment_map->header->next);
2245 :
2246 : Assert(next->header->bin == segment_map->header->bin);
2247 0 : next->header->prev = new_index;
2248 : }
2249 :
2250 2034 : return segment_map;
2251 : }
2252 :
2253 : /*
2254 : * Check if any segments have been freed by destroy_superblock, so we can
2255 : * detach from them in this backend. This function is called by
2256 : * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2257 : * received can be resolved to the correct segment.
2258 : *
2259 : * The danger we want to defend against is that there could be an old segment
2260 : * mapped into a given slot in this backend, and the dsa_pointer they have
2261 : * might refer to some new segment in the same slot. So those functions must
2262 : * be sure to process all detach-from-freed-segment instructions that had
2263 : * been generated by the time this process received the dsa_pointer, before
2264 : * they call get_segment_by_index.
2265 : */
2266 : static void
2267 15583072 : check_for_freed_segments(dsa_area *area)
2268 : {
2269 : size_t freed_segment_counter;
2270 :
2271 : /*
2272 : * Any other process that has freed a segment has incremented
2273 : * freed_segment_counter while holding an LWLock, and that must precede
2274 : * any backend creating a new segment in the same slot while holding an
2275 : * LWLock, and that must precede the creation of any dsa_pointer pointing
2276 : * into the new segment which might reach us here, and the caller must
2277 : * have sent the dsa_pointer to this process using appropriate memory
2278 : * synchronization (some kind of locking or atomic primitive or system
2279 : * call). So all we need to do on the reading side is ask for the load of
2280 : * freed_segment_counter to follow the caller's load of the dsa_pointer it
2281 : * has, and we can be sure to detect any segments that had been freed as
2282 : * of the time that the dsa_pointer reached this process.
2283 : */
2284 15583072 : pg_read_barrier();
2285 15583072 : freed_segment_counter = area->control->freed_segment_counter;
2286 15583072 : if (unlikely(area->freed_segment_counter != freed_segment_counter))
2287 : {
2288 : /* Check all currently mapped segments to find what's been freed. */
2289 0 : LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2290 0 : check_for_freed_segments_locked(area);
2291 0 : LWLockRelease(DSA_AREA_LOCK(area));
2292 : }
2293 15583072 : }
2294 :
2295 : /*
2296 : * Workhorse for check_for_freed_segments(), and also used directly in paths
2297 : * where the area lock is already held. This should be called after acquiring
2298 : * the lock but before looking up any segment by index number, to make sure we
2299 : * unmap any stale segments that might have previously had the same index as a
2300 : * current segment.
2301 : */
2302 : static void
2303 25924 : check_for_freed_segments_locked(dsa_area *area)
2304 : {
2305 : size_t freed_segment_counter;
2306 : int i;
2307 :
2308 : Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2309 25924 : freed_segment_counter = area->control->freed_segment_counter;
2310 25924 : if (unlikely(area->freed_segment_counter != freed_segment_counter))
2311 : {
2312 0 : for (i = 0; i <= area->high_segment_index; ++i)
2313 : {
2314 0 : if (area->segment_maps[i].header != NULL &&
2315 0 : area->segment_maps[i].header->freed)
2316 : {
2317 0 : dsm_detach(area->segment_maps[i].segment);
2318 0 : area->segment_maps[i].segment = NULL;
2319 0 : area->segment_maps[i].header = NULL;
2320 0 : area->segment_maps[i].mapped_address = NULL;
2321 : }
2322 : }
2323 0 : area->freed_segment_counter = freed_segment_counter;
2324 : }
2325 25924 : }
2326 :
2327 : /*
2328 : * Re-bin segment if it's no longer in the appropriate bin.
2329 : */
2330 : static void
2331 8404 : rebin_segment(dsa_area *area, dsa_segment_map *segment_map)
2332 : {
2333 : size_t new_bin;
2334 : dsa_segment_index segment_index;
2335 :
2336 8404 : new_bin = contiguous_pages_to_segment_bin(fpm_largest(segment_map->fpm));
2337 8404 : if (segment_map->header->bin == new_bin)
2338 3864 : return;
2339 :
2340 : /* Remove it from its current bin. */
2341 4540 : unlink_segment(area, segment_map);
2342 :
2343 : /* Push it onto the front of its new bin. */
2344 4540 : segment_index = get_segment_index(area, segment_map);
2345 4540 : segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2346 4540 : segment_map->header->next = area->control->segment_bins[new_bin];
2347 4540 : segment_map->header->bin = new_bin;
2348 4540 : area->control->segment_bins[new_bin] = segment_index;
2349 4540 : if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2350 : {
2351 : dsa_segment_map *next;
2352 :
2353 26 : next = get_segment_by_index(area, segment_map->header->next);
2354 : Assert(next->header->bin == new_bin);
2355 26 : next->header->prev = segment_index;
2356 : }
2357 : }
|