diff options
author | Robert Haas | 2015-01-27 03:19:02 +0000 |
---|---|---|
committer | Robert Haas | 2015-01-27 03:19:02 +0000 |
commit | e4912689e1cac24d9908026f0f934804e39cc3f3 (patch) | |
tree | bcd701fd8200e5a34c53a2daa61f3f042aa9828d | |
parent | f5593e4a62ba4579f5a7ec008815fcc3b3b2f2fb (diff) |
Update buffer README; get rid of buf_table.c. (branch: chash2014)
-rw-r--r-- | src/backend/storage/buffer/Makefile | 2 | ||||
-rw-r--r-- | src/backend/storage/buffer/README | 28 | ||||
-rw-r--r-- | src/backend/storage/buffer/buf_table.c | 131 | ||||
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 107 | ||||
-rw-r--r-- | src/backend/storage/buffer/freelist.c | 4 | ||||
-rw-r--r-- | src/include/storage/buf_internals.h | 12 |
6 files changed, 85 insertions, 199 deletions
diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile index 2c10fba9cd..b30a0dac41 100644 --- a/src/backend/storage/buffer/Makefile +++ b/src/backend/storage/buffer/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/buffer top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = buf_table.o buf_init.o bufmgr.o freelist.o localbuf.o +OBJS = buf_init.o bufmgr.o freelist.o localbuf.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index a4ebbccd48..86697e9825 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -100,30 +100,10 @@ Buffer Manager's Internal Locking Before PostgreSQL 8.1, all operations of the shared buffer manager itself were protected by a single system-wide lock, the BufMgrLock, which -unsurprisingly proved to be a source of contention. The new locking scheme -avoids grabbing system-wide exclusive locks in common code paths. It works -like this: - -* There is a system-wide LWLock, the BufMappingLock, that notionally -protects the mapping from buffer tags (page identifiers) to buffers. -(Physically, it can be thought of as protecting the hash table maintained -by buf_table.c.) To look up whether a buffer exists for a tag, it is -sufficient to obtain share lock on the BufMappingLock. Note that one -must pin the found buffer, if any, before releasing the BufMappingLock. -To alter the page assignment of any buffer, one must hold exclusive lock -on the BufMappingLock. This lock must be held across adjusting the buffer's -header fields and changing the buf_table hash table. The only common -operation that needs exclusive lock is reading in a page that was not -in shared buffers already, which will require at least a kernel call -and usually a wait for I/O, so it will be slow anyway. 
- -* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS -separate locks, each guarding a portion of the buffer tag space. This allows -further reduction of contention in the normal code paths. The partition -that a particular buffer tag belongs to is determined from the low-order -bits of the tag's hash value. The rules stated above apply to each partition -independently. If it is necessary to lock more than one partition at a time, -they must be locked in partition-number order to avoid risk of deadlock. +unsurprisingly proved to be a source of contention. In subsequent releases, +this lock was split into NUM_BUFFER_PARTITIONS locks, each guarding a portion +of the buffer tag space. Even this proved to be too much contention, so +now we use a highly concurrent hashtable (see chash.c and chash.h). * A separate system-wide spinlock, buffer_strategy_lock, provides mutual exclusion for operations that access the buffer free list or select diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c deleted file mode 100644 index 0840afaa5d..0000000000 --- a/src/backend/storage/buffer/buf_table.c +++ /dev/null @@ -1,131 +0,0 @@ -/*------------------------------------------------------------------------- - * - * buf_table.c - * routines for mapping BufferTags to buffer indexes. - * - * Note: the routines in this file do no locking of their own. The caller - * must hold a suitable lock on the appropriate BufMappingLock, as specified - * in the comments. We can't do the locking inside these functions because - * in most cases the caller needs to adjust the buffer header contents - * before the lock is released (see notes in README). 
- * - * - * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/storage/buffer/buf_table.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "miscadmin.h" -#include "storage/bufmgr.h" -#include "storage/buf_internals.h" -#include "utils/chash.h" - - -/* entry for buffer lookup hashtable */ -typedef struct -{ - BufferTag key; /* Tag of a disk page */ - int id; /* Associated buffer ID */ -} BufferLookupEnt; - -static CHashDescriptor SharedBufDescriptor = { - "buffer lookup table", - 0, - sizeof(BufferLookupEnt), - sizeof(BufferTag) -}; -static CHashTable SharedBufHash; - -/* - * Estimate space needed for mapping hashtable - * size is the desired hash table size (possibly more than NBuffers) - */ -Size -BufTableShmemSize(int size) -{ - if (SharedBufHash == NULL) - { - SharedBufDescriptor.capacity = size; - SharedBufHash = CHashBootstrap(&SharedBufDescriptor); - } - - return CHashEstimateSize(SharedBufHash); -} - -/* - * Initialize shmem hash table for mapping buffers - * size is the desired hash table size (possibly more than NBuffers) - */ -void -InitBufTable(int size) -{ - if (SharedBufHash == NULL || !IsUnderPostmaster) - { - Assert(SharedBufDescriptor.capacity == 0 || - SharedBufDescriptor.capacity == size); - SharedBufDescriptor.capacity = size; - SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor); - } -} - -/* - * BufTableLookup - * Lookup the given BufferTag; return buffer ID, or -1 if not found - */ -int -BufTableLookup(BufferTag *tagPtr) -{ - BufferLookupEnt ent; - - ent.key = *tagPtr; - if (!CHashSearch(SharedBufHash, &ent)) - return -1; - - return ent.id; -} - -/* - * BufTableInsert - * Insert a hashtable entry for given tag and buffer ID, - * unless an entry already exists for that tag - * - * Returns -1 on successful insertion. 
If a conflicting entry exists - * already, returns the buffer ID in that entry. - * - * Caller must hold exclusive lock on BufMappingLock for tag's partition - */ -int -BufTableInsert(BufferTag *tagPtr, int buf_id) -{ - BufferLookupEnt ent; - - ent.key = *tagPtr; - ent.id = buf_id; - - Assert(buf_id >= 0); /* -1 is reserved for not-in-table */ - Assert(tagPtr->blockNum != P_NEW); /* invalid tag */ - - if (CHashInsert(SharedBufHash, &ent)) - return -1; - - return ent.id; -} - -/* - * BufTableDelete - * Delete the hashtable entry for given tag (which must exist) - * - * Caller must hold exclusive lock on BufMappingLock for tag's partition - */ -void -BufTableDelete(BufferTag *tagPtr) -{ - if (!CHashDelete(SharedBufHash, tagPtr)) - elog(ERROR, "shared buffer hash table corrupted"); -} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index cbc82bf932..4435b3ebf9 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -24,9 +24,7 @@ * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty". * The disk write is delayed until buffer replacement or checkpoint. * - * See also these files: - * freelist.c -- chooses victim for buffer replacement - * buf_table.c -- manages the buffer lookup table + * See also freelist.c, which chooses victim for buffer replacement */ #include "postgres.h" @@ -47,10 +45,25 @@ #include "storage/proc.h" #include "storage/smgr.h" #include "storage/standby.h" +#include "utils/chash.h" #include "utils/rel.h" #include "utils/resowner_private.h" #include "utils/timestamp.h" +/* entry for buffer lookup hashtable */ +typedef struct +{ + BufferTag key; /* Tag of a disk page */ + int id; /* Associated buffer ID */ +} BufferLookupEnt; + +static CHashDescriptor SharedBufDescriptor = { + "buffer lookup table", + 0, + sizeof(BufferLookupEnt), + sizeof(BufferTag) +}; +static CHashTable SharedBufHash; /* Note: these two macros only work on shared buffers, not local ones! 
*/ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) @@ -138,6 +151,38 @@ static inline int32 GetPrivateRefCount(Buffer buffer); static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); /* + * Estimate space needed for mapping hashtable + * size is the desired hash table size (possibly more than NBuffers) + */ +Size +BufMgrShmemSize(int size) +{ + if (SharedBufHash == NULL) + { + SharedBufDescriptor.capacity = size; + SharedBufHash = CHashBootstrap(&SharedBufDescriptor); + } + + return CHashEstimateSize(SharedBufHash); +} + +/* + * Initialize shmem hash table for mapping buffers + * size is the desired hash table size (possibly more than NBuffers) + */ +void +BufMgrInitShmem(int size) +{ + if (SharedBufHash == NULL || !IsUnderPostmaster) + { + Assert(SharedBufDescriptor.capacity == 0 || + SharedBufDescriptor.capacity == size); + SharedBufDescriptor.capacity = size; + SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor); + } +} + +/* * Ensure that the the PrivateRefCountArray has sufficient space to store one * more entry. This has to be called before using NewPrivateRefCountEntry() to * fill a new entry - but it's perfectly fine to not use a reserved entry. 
@@ -444,18 +489,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) } else { - BufferTag newTag; /* identity of requested block */ - int buf_id; + BufferLookupEnt ent; /* identity of requested block */ /* create a tag so we can lookup the buffer */ - INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node, + INIT_BUFFERTAG(ent.key, reln->rd_smgr->smgr_rnode.node, forkNum, blockNum); - /* see if the block is in the buffer pool already */ - buf_id = BufTableLookup(&newTag); - /* If not in buffers, initiate prefetch */ - if (buf_id < 0) + if (!CHashSearch(SharedBufHash, &ent)) smgrprefetch(reln->rd_smgr, forkNum, blockNum); /* @@ -862,20 +903,18 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BufferAccessStrategy strategy, bool *foundPtr) { - BufferTag newTag; /* identity of requested block */ - BufferTag oldTag; /* previous identity of selected buffer */ + BufferLookupEnt newEnt; /* identity of requested block */ + BufferLookupEnt oldEnt; /* previous identity of selected buffer */ BufFlags oldFlags; - int buf_id; volatile BufferDesc *buf; bool valid; /* create a tag so we can lookup the buffer */ - INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum); + INIT_BUFFERTAG(newEnt.key, smgr->smgr_rnode.node, forkNum, blockNum); /* see if the block is in the buffer pool already */ start: - buf_id = BufTableLookup(&newTag); - if (buf_id >= 0) + if (CHashSearch(SharedBufHash, &newEnt)) { BufferDesc *foundbuf; @@ -883,12 +922,12 @@ start: * Found it. Now, pin the buffer so no one can steal it from the * buffer pool. */ - foundbuf = &BufferDescriptors[buf_id]; + foundbuf = &BufferDescriptors[newEnt.id]; valid = PinBuffer(foundbuf, strategy); /* Check whether someone recycled the buffer before we pinned it. 
*/ - if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag)) + if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag)) { UnpinBuffer(foundbuf, true); goto start; @@ -1026,7 +1065,7 @@ start: if (oldFlags & BM_TAG_VALID) { /* Save old tag. */ - oldTag = buf->tag; + oldEnt.key = buf->tag; } /* @@ -1037,9 +1076,8 @@ start: * tag. */ enter: - buf_id = BufTableInsert(&newTag, buf->buf_id); - - if (buf_id >= 0) + newEnt.id = buf->buf_id; + if (!CHashInsert(SharedBufHash, &newEnt)) { BufferDesc *foundbuf; @@ -1050,9 +1088,9 @@ enter: * recheck the buffer tag after pinning it, because it could still * get renamed under us. */ - foundbuf = &BufferDescriptors[buf_id]; + foundbuf = &BufferDescriptors[newEnt.id]; valid = PinBuffer(foundbuf, strategy); - if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag)) + if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag)) { UnpinBuffer(foundbuf, true); goto enter; @@ -1104,7 +1142,8 @@ enter: break; UnlockBufHdr(buf); - BufTableDelete(&newTag); + if (!CHashDelete(SharedBufHash, &newEnt.key)) + elog(ERROR, "shared buffer hash table corrupted"); UnpinBuffer(buf, true); } @@ -1116,7 +1155,7 @@ enter: * the old content is no longer relevant. (The usage_count starts out at * 1 so that the buffer can survive one clock-sweep pass.) */ - buf->tag = newTag; + buf->tag = newEnt.key; buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT); if (relpersistence == RELPERSISTENCE_PERMANENT) buf->flags |= BM_TAG_VALID | BM_PERMANENT; @@ -1126,8 +1165,9 @@ enter: UnlockBufHdr(buf); - if (oldFlags & BM_TAG_VALID) - BufTableDelete(&oldTag); + if ((oldFlags & BM_TAG_VALID) != 0 && + !CHashDelete(SharedBufHash, &oldEnt)) + elog(ERROR, "shared buffer hash table corrupted"); /* * Buffer contents are currently invalid. 
Try to get the io_in_progress @@ -1162,11 +1202,11 @@ enter: static void InvalidateBuffer(volatile BufferDesc *buf) { - BufferTag oldTag; + BufferLookupEnt oldEnt; BufFlags oldFlags; /* Save the original buffer tag before dropping the spinlock */ - oldTag = buf->tag; + oldEnt.key = buf->tag; /* * We assume the only reason for it to be pinned is that someone else is @@ -1187,7 +1227,7 @@ InvalidateBuffer(volatile BufferDesc *buf) LockBufHdr(buf); /* If it's changed while we were waiting for lock, do nothing */ - if (!BUFFERTAGS_EQUAL(buf->tag, oldTag)) + if (!BUFFERTAGS_EQUAL(buf->tag, oldEnt.key)) { UnlockBufHdr(buf); return; @@ -1208,8 +1248,9 @@ InvalidateBuffer(volatile BufferDesc *buf) /* * Remove the buffer from the lookup hashtable, if it was in there. */ - if (oldFlags & BM_TAG_VALID) - BufTableDelete(&oldTag); + if ((oldFlags & BM_TAG_VALID) != 0 && + !CHashDelete(SharedBufHash, &oldEnt)) + elog(ERROR, "shared buffer hash table corrupted"); /* * Insert the buffer at the head of the list of free buffers. diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 3add619b5d..2410dfc272 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -432,7 +432,7 @@ StrategyShmemSize(void) Size size = 0; /* size of lookup hash table ... see comment in StrategyInitialize */ - size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS)); + size = add_size(size, BufMgrShmemSize(NBuffers + NUM_BUFFER_PARTITIONS)); /* size of the shared replacement strategy control block */ size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl))); @@ -462,7 +462,7 @@ StrategyInitialize(bool init) * happening in each partition concurrently, so we could need as many as * NBuffers + NUM_BUFFER_PARTITIONS entries. 
*/ - InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS); + BufMgrInitShmem(NBuffers + NUM_BUFFER_PARTITIONS); /* * Get or create the shared strategy control block diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 4b1696cf77..b58af88451 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -182,14 +182,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); -/* buf_table.c */ -extern Size BufTableShmemSize(int size); -extern void InitBufTable(int size); -extern uint32 BufTableHashCode(BufferTag *tagPtr); -extern int BufTableLookup(BufferTag *tagPtr); -extern int BufTableInsert(BufferTag *tagPtr, int buf_id); -extern void BufTableDelete(BufferTag *tagPtr); - /* localbuf.c */ extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum); @@ -201,4 +193,8 @@ extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum, extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode); extern void AtEOXact_LocalBuffers(bool isCommit); +/* bufmgr.c */ +extern Size BufMgrShmemSize(int size); +extern void BufMgrInitShmem(int size); + #endif /* BUFMGR_INTERNALS_H */ |