From faeedbcefd40bfdf314e048c425b6d9208896d90 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Sat, 8 Apr 2023 10:38:09 +1200 Subject: [PATCH] Introduce PG_IO_ALIGN_SIZE and align all I/O buffers. In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a later commit, we need the addresses of user space buffers to be well aligned. The exact requirements vary by OS and file system (typically sectors and/or memory pages). The address alignment size is set to 4096, which is enough for currently known systems: it matches modern sectors and common memory page size. There is no standard governing O_DIRECT's requirements so we might eventually have to reconsider this with more information from the field or future systems. Aligning I/O buffers on memory pages is also known to improve regular buffered I/O performance. Three classes of I/O buffers for regular data pages are adjusted: (1) Heap buffers are now allocated with the new palloc_aligned() or MemoryContextAllocAligned() functions introduced by commit 439f6175. (2) Stack buffers now use a new union PGIOAlignedBlock to respect PG_IO_ALIGN_SIZE, if possible with this compiler. (3) The buffer pool is also aligned in shared memory. WAL buffers were already aligned on XLOG_BLCKSZ. It's possible for XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus for O_DIRECT WAL writes to fail to be well aligned, but that's a pre-existing condition and will be addressed by a later commit. BufFiles are not yet addressed (there's no current plan to use O_DIRECT for those, but they could potentially get some incidental speedup even in plain buffered I/O operations through better alignment). If we can't align stack objects suitably using the compiler extensions we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to 0. 
This avoids the need to consider systems that have O_DIRECT but can't align stack objects the way we want; such systems could in theory be supported with more work but we don't currently know of any such machines, so it's easier to pretend there is no O_DIRECT support instead. That's an existing and tested class of system. Add assertions that all buffers passed into smgrread(), smgrwrite() and smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack alignment tricks may be unavailable) or the block size has been set too small to allow arrays of buffers to be all aligned. Author: Thomas Munro Author: Andres Freund Reviewed-by: Justin Pryzby Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com --- contrib/bloom/blinsert.c | 2 +- contrib/pg_prewarm/pg_prewarm.c | 2 +- src/backend/access/gist/gistbuild.c | 9 +++--- src/backend/access/hash/hashpage.c | 2 +- src/backend/access/heap/rewriteheap.c | 2 +- src/backend/access/nbtree/nbtree.c | 2 +- src/backend/access/nbtree/nbtsort.c | 8 ++++-- src/backend/access/spgist/spginsert.c | 2 +- src/backend/access/transam/generic_xlog.c | 13 ++++++--- src/backend/access/transam/xlog.c | 2 +- src/backend/catalog/storage.c | 2 +- src/backend/storage/buffer/buf_init.c | 10 +++++-- src/backend/storage/buffer/bufmgr.c | 2 +- src/backend/storage/buffer/localbuf.c | 7 +++-- src/backend/storage/file/buffile.c | 6 ++++ src/backend/storage/page/bufpage.c | 5 +++- src/backend/storage/smgr/md.c | 15 +++++++++- src/backend/utils/sort/logtape.c | 2 +- src/bin/pg_checksums/pg_checksums.c | 2 +- src/bin/pg_rewind/local_source.c | 4 +-- src/bin/pg_upgrade/file.c | 4 +-- src/common/file_utils.c | 4 +-- src/include/c.h | 34 +++++++++++++++++------ src/include/pg_config_manual.h | 6 ++++ src/include/storage/fd.h | 5 ++-- src/tools/pgindent/typedefs.list | 1 + 26 files changed, 108 insertions(+), 45 deletions(-) diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index 
dcd8120895..b42b9e6c41 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -166,7 +166,7 @@ blbuildempty(Relation index) Page metapage; /* Construct metapage. */ - metapage = (Page) palloc(BLCKSZ); + metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); BloomFillMetapage(index, metapage); /* diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index 54209924ae..e464d0d4d2 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -36,7 +36,7 @@ typedef enum PREWARM_BUFFER } PrewarmType; -static PGAlignedBlock blockbuffer; +static PGIOAlignedBlock blockbuffer; /* * pg_prewarm(regclass, mode text, fork text, diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index d2f8da5b02..5e0c1447f9 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -415,7 +415,7 @@ gist_indexsortbuild(GISTBuildState *state) * Write an empty page as a placeholder for the root page. It will be * replaced with the real root page at the end. */ - page = palloc0(BLCKSZ); + page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, page, true); state->pages_allocated++; @@ -509,7 +509,8 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state, levelstate->current_page++; if (levelstate->pages[levelstate->current_page] == NULL) - levelstate->pages[levelstate->current_page] = palloc(BLCKSZ); + levelstate->pages[levelstate->current_page] = + palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); newPage = levelstate->pages[levelstate->current_page]; gistinitpage(newPage, old_page_flags); @@ -579,7 +580,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, /* Create page and copy data */ data = (char *) (dist->list); - target = palloc0(BLCKSZ); + target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); gistinitpage(target, isleaf ? 
F_LEAF : 0); for (int i = 0; i < dist->block.num; i++) { @@ -630,7 +631,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, if (parent == NULL) { parent = palloc0(sizeof(GistSortedBuildLevelState)); - parent->pages[0] = (Page) palloc(BLCKSZ); + parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); parent->parent = NULL; gistinitpage(parent->pages[0], 0); diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 6d8af42260..af3a154266 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -992,7 +992,7 @@ static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) { BlockNumber lastblock; - PGAlignedBlock zerobuf; + PGIOAlignedBlock zerobuf; Page page; HashPageOpaque ovflopaque; diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index ae0282a70e..424958912c 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -255,7 +255,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm state->rs_old_rel = old_heap; state->rs_new_rel = new_heap; - state->rs_buffer = (Page) palloc(BLCKSZ); + state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); /* new_heap needn't be empty, just locked */ state->rs_blockno = RelationGetNumberOfBlocks(new_heap); state->rs_buffer_valid = false; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 992f84834f..2df8849858 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -154,7 +154,7 @@ btbuildempty(Relation index) Page metapage; /* Construct metapage. 
*/ - metapage = (Page) palloc(BLCKSZ); + metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false)); /* diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 1207a49689..6ad3f3c54d 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -619,7 +619,7 @@ _bt_blnewpage(uint32 level) Page page; BTPageOpaque opaque; - page = (Page) palloc(BLCKSZ); + page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); /* Zero the page and set up standard page header info */ _bt_pageinit(page, BLCKSZ); @@ -660,7 +660,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) while (blkno > wstate->btws_pages_written) { if (!wstate->btws_zeropage) - wstate->btws_zeropage = (Page) palloc0(BLCKSZ); + wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ, + PG_IO_ALIGN_SIZE, + MCXT_ALLOC_ZERO); /* don't set checksum for all-zero page */ smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, wstate->btws_pages_written++, @@ -1170,7 +1172,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * set to point to "P_NONE"). This changes the index to the "valid" state * by filling in a valid magic number in the metapage. */ - metapage = (Page) palloc(BLCKSZ); + metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); _bt_initmetapage(metapage, rootblkno, rootlevel, wstate->inskey->allequalimage); _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 718a88335d..72d2e1551c 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -158,7 +158,7 @@ spgbuildempty(Relation index) Page page; /* Construct metapage. 
*/ - page = (Page) palloc(BLCKSZ); + page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); SpGistInitMetapage(page); /* diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c index 9f67d1c1cd..6c68191ca6 100644 --- a/src/backend/access/transam/generic_xlog.c +++ b/src/backend/access/transam/generic_xlog.c @@ -58,14 +58,17 @@ typedef struct char delta[MAX_DELTA_SIZE]; /* delta between page images */ } PageData; -/* State of generic xlog record construction */ +/* + * State of generic xlog record construction. Must be allocated at an I/O + * aligned address. + */ struct GenericXLogState { + /* Page images (properly aligned, must be first) */ + PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; /* Info about each page, see above */ PageData pages[MAX_GENERIC_XLOG_PAGES]; bool isLogged; - /* Page images (properly aligned) */ - PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; }; static void writeFragment(PageData *pageData, OffsetNumber offset, @@ -269,7 +272,9 @@ GenericXLogStart(Relation relation) GenericXLogState *state; int i; - state = (GenericXLogState *) palloc(sizeof(GenericXLogState)); + state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState), + PG_IO_ALIGN_SIZE, + 0); state->isLogged = RelationNeedsWAL(relation); for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 46821ad605..a5c74fdab8 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4506,7 +4506,7 @@ XLOGShmemSize(void) /* xlblocks array */ size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers)); /* extra alignment padding for XLOG I/O buffers */ - size = add_size(size, XLOG_BLCKSZ); + size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE)); /* and the buffers themselves */ size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers)); diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 
af1491aa1d..2add053489 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -451,7 +451,7 @@ void RelationCopyStorage(SMgrRelation src, SMgrRelation dst, ForkNumber forkNum, char relpersistence) { - PGAlignedBlock buf; + PGIOAlignedBlock buf; Page page; bool use_wal; bool copying_initfork; diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 20946c47cb..0057443f0c 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -78,9 +78,12 @@ InitBufferPool(void) NBuffers * sizeof(BufferDescPadded), &foundDescs); + /* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) - ShmemInitStruct("Buffer Blocks", - NBuffers * (Size) BLCKSZ, &foundBufs); + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemInitStruct("Buffer Blocks", + NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &foundBufs)); /* Align condition variables to cacheline boundary. */ BufferIOCVArray = (ConditionVariableMinimallyPadded *) @@ -163,7 +166,8 @@ BufferShmemSize(void) /* to allow aligning buffer descriptors */ size = add_size(size, PG_CACHE_LINE_SIZE); - /* size of data pages */ + /* size of data pages, plus alignment padding */ + size = add_size(size, PG_IO_ALIGN_SIZE); size = add_size(size, mul_size(NBuffers, BLCKSZ)); /* size of stuff controlled by freelist.c */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index a12d0c6c27..5a237d5606 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -4250,7 +4250,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, bool use_wal; BlockNumber nblocks; BlockNumber blkno; - PGAlignedBlock buf; + PGIOAlignedBlock buf; BufferAccessStrategy bstrategy_src; BufferAccessStrategy bstrategy_dst; diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3d5bc9193d..3c6382456a 100644 --- a/src/backend/storage/buffer/localbuf.c +++ 
b/src/backend/storage/buffer/localbuf.c @@ -744,8 +744,11 @@ GetLocalBufferStorage(void) /* And don't overflow MaxAllocSize, either */ num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ); - cur_block = (char *) MemoryContextAlloc(LocalBufferContext, - num_bufs * BLCKSZ); + /* Buffers should be I/O aligned. */ + cur_block = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + MemoryContextAlloc(LocalBufferContext, + num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE)); next_buf_in_block = 0; num_bufs_in_block = num_bufs; } diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 37ea8ac6b7..84ead85942 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -95,6 +95,12 @@ struct BufFile off_t curOffset; /* offset part of current pos */ int pos; /* next read/write position in buffer */ int nbytes; /* total # of valid bytes in buffer */ + + /* + * XXX Should ideally use PGIOAlignedBlock, but might need a way to avoid + * wasting per-file alignment padding when some users create many + * files. + */ PGAlignedBlock buffer; }; diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 92994f8f39..9a302ddc30 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -1522,7 +1522,10 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) * and second to avoid wasting space in processes that never call this. 
*/ if (pageCopy == NULL) - pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ); + pageCopy = MemoryContextAllocAligned(TopMemoryContext, + BLCKSZ, + PG_IO_ALIGN_SIZE, + 0); memcpy(pageCopy, (char *) page, BLCKSZ); ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index d9d0367c89..d1124d46f4 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -453,6 +453,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nbytes; MdfdVec *v; + /* If this build supports direct I/O, the buffer must be I/O aligned. */ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + /* This assert is too expensive to have on normally ... */ #ifdef CHECK_WRITE_VS_EXTEND Assert(blocknum >= mdnblocks(reln, forknum)); @@ -783,6 +787,10 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nbytes; MdfdVec *v; + /* If this build supports direct I/O, the buffer must be I/O aligned. */ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, @@ -848,6 +856,10 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nbytes; MdfdVec *v; + /* If this build supports direct I/O, the buffer must be I/O aligned. */ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + /* This assert is too expensive to have on normally ... 
*/ #ifdef CHECK_WRITE_VS_EXTEND Assert(blocknum < mdnblocks(reln, forknum)); @@ -1429,7 +1441,8 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ if (nblocks < ((BlockNumber) RELSEG_SIZE)) { - char *zerobuf = palloc0(BLCKSZ); + char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, + MCXT_ALLOC_ZERO); mdextend(reln, forknum, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index 64ea237438..52b8898d5e 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -252,7 +252,7 @@ ltsWriteBlock(LogicalTapeSet *lts, long blocknum, const void *buffer) */ while (blocknum > lts->nBlocksWritten) { - PGAlignedBlock zerobuf; + PGIOAlignedBlock zerobuf; MemSet(zerobuf.data, 0, sizeof(zerobuf)); diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index aa21007497..19eb67e485 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -183,7 +183,7 @@ skipfile(const char *fn) static void scan_file(const char *fn, int segmentno) { - PGAlignedBlock buf; + PGIOAlignedBlock buf; PageHeader header = (PageHeader) buf.data; int f; BlockNumber blockno; diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c index da9d75dccb..4e2a1376c6 100644 --- a/src/bin/pg_rewind/local_source.c +++ b/src/bin/pg_rewind/local_source.c @@ -77,7 +77,7 @@ static void local_queue_fetch_file(rewind_source *source, const char *path, size_t len) { const char *datadir = ((local_source *) source)->datadir; - PGAlignedBlock buf; + PGIOAlignedBlock buf; char srcpath[MAXPGPATH]; int srcfd; size_t written_len; @@ -129,7 +129,7 @@ local_queue_fetch_range(rewind_source *source, const char *path, off_t off, size_t len) { const char *datadir = ((local_source *) source)->datadir; - PGAlignedBlock buf; + PGIOAlignedBlock buf; char srcpath[MAXPGPATH]; int srcfd; off_t begin = off; diff --git 
a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index ed874507ff..d173602882 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -178,8 +178,8 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, { int src_fd; int dst_fd; - PGAlignedBlock buffer; - PGAlignedBlock new_vmbuf; + PGIOAlignedBlock buffer; + PGIOAlignedBlock new_vmbuf; ssize_t totalBytesRead = 0; ssize_t src_filesize; int rewriteVmBytesPerPage; diff --git a/src/common/file_utils.c b/src/common/file_utils.c index d568d83b9f..74833c4acb 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -540,8 +540,8 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset) { - static const PGAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */ - void *zerobuf_addr = unconstify(PGAlignedBlock *, &zbuffer)->data; + static const PGIOAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */ + void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data; struct iovec iov[PG_IOV_MAX]; size_t remaining_size = size; ssize_t total_written = 0; diff --git a/src/include/c.h b/src/include/c.h index 5fe7a97ff0..f69d739be5 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1119,14 +1119,11 @@ extern void ExceptionalCondition(const char *conditionName, /* * Use this, not "char buf[BLCKSZ]", to declare a field or local variable - * holding a page buffer, if that page might be accessed as a page and not - * just a string of bytes. Otherwise the variable might be under-aligned, - * causing problems on alignment-picky hardware. (In some places, we use - * this to declare buffers even though we only pass them to read() and - * write(), because copying to/from aligned buffers is usually faster than - * using unaligned buffers.) We include both "double" and "int64" in the - * union to ensure that the compiler knows the value must be MAXALIGN'ed - * (cf. configure's computation of MAXIMUM_ALIGNOF). 
+ * holding a page buffer, if that page might be accessed as a page. Otherwise + * the variable might be under-aligned, causing problems on alignment-picky + * hardware. We include both "double" and "int64" in the union to ensure that + * the compiler knows the value must be MAXALIGN'ed (cf. configure's + * computation of MAXIMUM_ALIGNOF). */ typedef union PGAlignedBlock { @@ -1135,9 +1132,30 @@ typedef union PGAlignedBlock int64 force_align_i64; } PGAlignedBlock; +/* + * Use this to declare a field or local variable holding a page buffer, if that + * page might be accessed as a page or passed to an SMgr I/O function. If + * allocating using the MemoryContext API, the aligned allocation functions + * should be used with PG_IO_ALIGN_SIZE. This alignment may be more efficient + * for I/O in general, but may be strictly required on some platforms when + * using direct I/O. + */ +typedef union PGIOAlignedBlock +{ +#ifdef pg_attribute_aligned + pg_attribute_aligned(PG_IO_ALIGN_SIZE) +#endif + char data[BLCKSZ]; + double force_align_d; + int64 force_align_i64; +} PGIOAlignedBlock; + /* Same, but for an XLOG_BLCKSZ-sized buffer */ typedef union PGAlignedXLogBlock { +#ifdef pg_attribute_aligned + pg_attribute_aligned(PG_IO_ALIGN_SIZE) +#endif char data[XLOG_BLCKSZ]; double force_align_d; int64 force_align_i64; diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index b586ee269a..33ec6102c1 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -227,6 +227,12 @@ */ #define PG_CACHE_LINE_SIZE 128 +/* + * Assumed alignment requirement for direct I/O. 4K corresponds to common + * sector and memory page size. 
+ */ +#define PG_IO_ALIGN_SIZE 4096 + /* *------------------------------------------------------------------------ * The following symbols are for enabling debugging code, not for diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index daceafd473..faac4914fe 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -82,9 +82,10 @@ extern PGDLLIMPORT int max_safe_fds; * to the appropriate Windows flag in src/port/open.c. We simulate it with * fcntl(F_NOCACHE) on macOS inside fd.c's open() wrapper. We use the name * PG_O_DIRECT rather than defining O_DIRECT in that case (probably not a good - * idea on a Unix). + * idea on a Unix). We can only use it if the compiler will correctly align + * PGIOAlignedBlock for us, though. */ -#if defined(O_DIRECT) +#if defined(O_DIRECT) && defined(pg_attribute_aligned) #define PG_O_DIRECT O_DIRECT #elif defined(F_NOCACHE) #define PG_O_DIRECT 0x80000000 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 494cc66d5b..df960883c5 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1703,6 +1703,7 @@ PGEventResultDestroy PGFInfoFunction PGFileType PGFunction +PGIOAlignedBlock PGLZ_HistEntry PGLZ_Strategy PGLoadBalanceType -- 2.39.5