summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Robert Haas 2015-01-27 03:19:02 +0000
committer Robert Haas 2015-01-27 03:19:02 +0000
commit e4912689e1cac24d9908026f0f934804e39cc3f3 (patch)
tree bcd701fd8200e5a34c53a2daa61f3f042aa9828d
parent f5593e4a62ba4579f5a7ec008815fcc3b3b2f2fb (diff)
Update buffer README; get rid of buf_table.c. (chash2014)
-rw-r--r-- src/backend/storage/buffer/Makefile 2
-rw-r--r-- src/backend/storage/buffer/README 28
-rw-r--r-- src/backend/storage/buffer/buf_table.c 131
-rw-r--r-- src/backend/storage/buffer/bufmgr.c 107
-rw-r--r-- src/backend/storage/buffer/freelist.c 4
-rw-r--r-- src/include/storage/buf_internals.h 12
6 files changed, 85 insertions, 199 deletions
diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile
index 2c10fba9cd..b30a0dac41 100644
--- a/src/backend/storage/buffer/Makefile
+++ b/src/backend/storage/buffer/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/storage/buffer
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = buf_table.o buf_init.o bufmgr.o freelist.o localbuf.o
+OBJS = buf_init.o bufmgr.o freelist.o localbuf.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index a4ebbccd48..86697e9825 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -100,30 +100,10 @@ Buffer Manager's Internal Locking
Before PostgreSQL 8.1, all operations of the shared buffer manager itself
were protected by a single system-wide lock, the BufMgrLock, which
-unsurprisingly proved to be a source of contention. The new locking scheme
-avoids grabbing system-wide exclusive locks in common code paths. It works
-like this:
-
-* There is a system-wide LWLock, the BufMappingLock, that notionally
-protects the mapping from buffer tags (page identifiers) to buffers.
-(Physically, it can be thought of as protecting the hash table maintained
-by buf_table.c.) To look up whether a buffer exists for a tag, it is
-sufficient to obtain share lock on the BufMappingLock. Note that one
-must pin the found buffer, if any, before releasing the BufMappingLock.
-To alter the page assignment of any buffer, one must hold exclusive lock
-on the BufMappingLock. This lock must be held across adjusting the buffer's
-header fields and changing the buf_table hash table. The only common
-operation that needs exclusive lock is reading in a page that was not
-in shared buffers already, which will require at least a kernel call
-and usually a wait for I/O, so it will be slow anyway.
-
-* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS
-separate locks, each guarding a portion of the buffer tag space. This allows
-further reduction of contention in the normal code paths. The partition
-that a particular buffer tag belongs to is determined from the low-order
-bits of the tag's hash value. The rules stated above apply to each partition
-independently. If it is necessary to lock more than one partition at a time,
-they must be locked in partition-number order to avoid risk of deadlock.
+unsurprisingly proved to be a source of contention. In subsequent releases,
+this lock was split into NUM_BUFFER_PARTITIONS locks, each guarding a portion
+of the buffer tag space. Even that caused too much contention, so the buffer
+mapping now uses a highly concurrent hash table (see chash.c and chash.h).
* A separate system-wide spinlock, buffer_strategy_lock, provides mutual
exclusion for operations that access the buffer free list or select
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
deleted file mode 100644
index 0840afaa5d..0000000000
--- a/src/backend/storage/buffer/buf_table.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * buf_table.c
- * routines for mapping BufferTags to buffer indexes.
- *
- * Note: the routines in this file do no locking of their own. The caller
- * must hold a suitable lock on the appropriate BufMappingLock, as specified
- * in the comments. We can't do the locking inside these functions because
- * in most cases the caller needs to adjust the buffer header contents
- * before the lock is released (see notes in README).
- *
- *
- * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * src/backend/storage/buffer/buf_table.c
- *
- *-------------------------------------------------------------------------
- */
-#include "postgres.h"
-
-#include "miscadmin.h"
-#include "storage/bufmgr.h"
-#include "storage/buf_internals.h"
-#include "utils/chash.h"
-
-
-/* entry for buffer lookup hashtable */
-typedef struct
-{
- BufferTag key; /* Tag of a disk page */
- int id; /* Associated buffer ID */
-} BufferLookupEnt;
-
-static CHashDescriptor SharedBufDescriptor = {
- "buffer lookup table",
- 0,
- sizeof(BufferLookupEnt),
- sizeof(BufferTag)
-};
-static CHashTable SharedBufHash;
-
-/*
- * Estimate space needed for mapping hashtable
- * size is the desired hash table size (possibly more than NBuffers)
- */
-Size
-BufTableShmemSize(int size)
-{
- if (SharedBufHash == NULL)
- {
- SharedBufDescriptor.capacity = size;
- SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
- }
-
- return CHashEstimateSize(SharedBufHash);
-}
-
-/*
- * Initialize shmem hash table for mapping buffers
- * size is the desired hash table size (possibly more than NBuffers)
- */
-void
-InitBufTable(int size)
-{
- if (SharedBufHash == NULL || !IsUnderPostmaster)
- {
- Assert(SharedBufDescriptor.capacity == 0 ||
- SharedBufDescriptor.capacity == size);
- SharedBufDescriptor.capacity = size;
- SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
- }
-}
-
-/*
- * BufTableLookup
- * Lookup the given BufferTag; return buffer ID, or -1 if not found
- */
-int
-BufTableLookup(BufferTag *tagPtr)
-{
- BufferLookupEnt ent;
-
- ent.key = *tagPtr;
- if (!CHashSearch(SharedBufHash, &ent))
- return -1;
-
- return ent.id;
-}
-
-/*
- * BufTableInsert
- * Insert a hashtable entry for given tag and buffer ID,
- * unless an entry already exists for that tag
- *
- * Returns -1 on successful insertion. If a conflicting entry exists
- * already, returns the buffer ID in that entry.
- *
- * Caller must hold exclusive lock on BufMappingLock for tag's partition
- */
-int
-BufTableInsert(BufferTag *tagPtr, int buf_id)
-{
- BufferLookupEnt ent;
-
- ent.key = *tagPtr;
- ent.id = buf_id;
-
- Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
- Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
-
- if (CHashInsert(SharedBufHash, &ent))
- return -1;
-
- return ent.id;
-}
-
-/*
- * BufTableDelete
- * Delete the hashtable entry for given tag (which must exist)
- *
- * Caller must hold exclusive lock on BufMappingLock for tag's partition
- */
-void
-BufTableDelete(BufferTag *tagPtr)
-{
- if (!CHashDelete(SharedBufHash, tagPtr))
- elog(ERROR, "shared buffer hash table corrupted");
-}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index cbc82bf932..4435b3ebf9 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -24,9 +24,7 @@
* MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
* The disk write is delayed until buffer replacement or checkpoint.
*
- * See also these files:
- * freelist.c -- chooses victim for buffer replacement
- * buf_table.c -- manages the buffer lookup table
+ * See also freelist.c, which chooses victim for buffer replacement
*/
#include "postgres.h"
@@ -47,10 +45,25 @@
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
+#include "utils/chash.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"
+/* entry for buffer lookup hashtable */
+typedef struct
+{
+ BufferTag key; /* Tag of a disk page */
+ int id; /* Associated buffer ID */
+} BufferLookupEnt;
+
+static CHashDescriptor SharedBufDescriptor = {
+ "buffer lookup table",
+ 0,
+ sizeof(BufferLookupEnt),
+ sizeof(BufferTag)
+};
+static CHashTable SharedBufHash;
/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
@@ -138,6 +151,38 @@ static inline int32 GetPrivateRefCount(Buffer buffer);
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
/*
+ * Estimate space needed for the buffer mapping hashtable, bootstrapping the
+ * table first if needed; size is its capacity (possibly more than NBuffers)
+ */
+Size
+BufMgrShmemSize(int size)
+{
+ if (SharedBufHash == NULL)
+ {
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
+ }
+
+ return CHashEstimateSize(SharedBufHash);
+}
+
+/*
+ * Initialize shmem hash table for mapping buffers
+ * size is the desired hash table size (possibly more than NBuffers)
+ */
+void
+BufMgrInitShmem(int size)
+{
+ if (SharedBufHash == NULL || !IsUnderPostmaster)
+ {
+ Assert(SharedBufDescriptor.capacity == 0 ||
+ SharedBufDescriptor.capacity == size);
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
+ }
+}
+
+/*
* Ensure that the the PrivateRefCountArray has sufficient space to store one
* more entry. This has to be called before using NewPrivateRefCountEntry() to
* fill a new entry - but it's perfectly fine to not use a reserved entry.
@@ -444,18 +489,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
}
else
{
- BufferTag newTag; /* identity of requested block */
- int buf_id;
+ BufferLookupEnt ent; /* identity of requested block */
/* create a tag so we can lookup the buffer */
- INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
+ INIT_BUFFERTAG(ent.key, reln->rd_smgr->smgr_rnode.node,
forkNum, blockNum);
- /* see if the block is in the buffer pool already */
- buf_id = BufTableLookup(&newTag);
-
/* If not in buffers, initiate prefetch */
- if (buf_id < 0)
+ if (!CHashSearch(SharedBufHash, &ent))
smgrprefetch(reln->rd_smgr, forkNum, blockNum);
/*
@@ -862,20 +903,18 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BufferAccessStrategy strategy,
bool *foundPtr)
{
- BufferTag newTag; /* identity of requested block */
- BufferTag oldTag; /* previous identity of selected buffer */
+ BufferLookupEnt newEnt; /* identity of requested block */
+ BufferLookupEnt oldEnt; /* previous identity of selected buffer */
BufFlags oldFlags;
- int buf_id;
volatile BufferDesc *buf;
bool valid;
/* create a tag so we can lookup the buffer */
- INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
+ INIT_BUFFERTAG(newEnt.key, smgr->smgr_rnode.node, forkNum, blockNum);
/* see if the block is in the buffer pool already */
start:
- buf_id = BufTableLookup(&newTag);
- if (buf_id >= 0)
+ if (CHashSearch(SharedBufHash, &newEnt))
{
BufferDesc *foundbuf;
@@ -883,12 +922,12 @@ start:
* Found it. Now, pin the buffer so no one can steal it from the
* buffer pool.
*/
- foundbuf = &BufferDescriptors[buf_id];
+ foundbuf = &BufferDescriptors[newEnt.id];
valid = PinBuffer(foundbuf, strategy);
/* Check whether someone recycled the buffer before we pinned it. */
- if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+ if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag))
{
UnpinBuffer(foundbuf, true);
goto start;
@@ -1026,7 +1065,7 @@ start:
if (oldFlags & BM_TAG_VALID)
{
/* Save old tag. */
- oldTag = buf->tag;
+ oldEnt.key = buf->tag;
}
/*
@@ -1037,9 +1076,8 @@ start:
* tag.
*/
enter:
- buf_id = BufTableInsert(&newTag, buf->buf_id);
-
- if (buf_id >= 0)
+ newEnt.id = buf->buf_id;
+ if (!CHashInsert(SharedBufHash, &newEnt))
{
BufferDesc *foundbuf;
@@ -1050,9 +1088,9 @@ enter:
* recheck the buffer tag after pinning it, because it could still
* get renamed under us.
*/
- foundbuf = &BufferDescriptors[buf_id];
+ foundbuf = &BufferDescriptors[newEnt.id];
valid = PinBuffer(foundbuf, strategy);
- if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+ if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag))
{
UnpinBuffer(foundbuf, true);
goto enter;
@@ -1104,7 +1142,8 @@ enter:
break;
UnlockBufHdr(buf);
- BufTableDelete(&newTag);
+ if (!CHashDelete(SharedBufHash, &newEnt.key))
+ elog(ERROR, "shared buffer hash table corrupted");
UnpinBuffer(buf, true);
}
@@ -1116,7 +1155,7 @@ enter:
* the old content is no longer relevant. (The usage_count starts out at
* 1 so that the buffer can survive one clock-sweep pass.)
*/
- buf->tag = newTag;
+ buf->tag = newEnt.key;
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
if (relpersistence == RELPERSISTENCE_PERMANENT)
buf->flags |= BM_TAG_VALID | BM_PERMANENT;
@@ -1126,8 +1165,9 @@ enter:
UnlockBufHdr(buf);
- if (oldFlags & BM_TAG_VALID)
- BufTableDelete(&oldTag);
+ if ((oldFlags & BM_TAG_VALID) != 0 &&
+ !CHashDelete(SharedBufHash, &oldEnt))
+ elog(ERROR, "shared buffer hash table corrupted");
/*
* Buffer contents are currently invalid. Try to get the io_in_progress
@@ -1162,11 +1202,11 @@ enter:
static void
InvalidateBuffer(volatile BufferDesc *buf)
{
- BufferTag oldTag;
+ BufferLookupEnt oldEnt;
BufFlags oldFlags;
/* Save the original buffer tag before dropping the spinlock */
- oldTag = buf->tag;
+ oldEnt.key = buf->tag;
/*
* We assume the only reason for it to be pinned is that someone else is
@@ -1187,7 +1227,7 @@ InvalidateBuffer(volatile BufferDesc *buf)
LockBufHdr(buf);
/* If it's changed while we were waiting for lock, do nothing */
- if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
+ if (!BUFFERTAGS_EQUAL(buf->tag, oldEnt.key))
{
UnlockBufHdr(buf);
return;
@@ -1208,8 +1248,9 @@ InvalidateBuffer(volatile BufferDesc *buf)
/*
* Remove the buffer from the lookup hashtable, if it was in there.
*/
- if (oldFlags & BM_TAG_VALID)
- BufTableDelete(&oldTag);
+ if ((oldFlags & BM_TAG_VALID) != 0 &&
+ !CHashDelete(SharedBufHash, &oldEnt))
+ elog(ERROR, "shared buffer hash table corrupted");
/*
* Insert the buffer at the head of the list of free buffers.
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 3add619b5d..2410dfc272 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -432,7 +432,7 @@ StrategyShmemSize(void)
Size size = 0;
/* size of lookup hash table ... see comment in StrategyInitialize */
- size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
+ size = add_size(size, BufMgrShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
/* size of the shared replacement strategy control block */
size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
@@ -462,7 +462,7 @@ StrategyInitialize(bool init)
* happening in each partition concurrently, so we could need as many as
* NBuffers + NUM_BUFFER_PARTITIONS entries.
*/
- InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
+ BufMgrInitShmem(NBuffers + NUM_BUFFER_PARTITIONS);
/*
* Get or create the shared strategy control block
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 4b1696cf77..b58af88451 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -182,14 +182,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
-/* buf_table.c */
-extern Size BufTableShmemSize(int size);
-extern void InitBufTable(int size);
-extern uint32 BufTableHashCode(BufferTag *tagPtr);
-extern int BufTableLookup(BufferTag *tagPtr);
-extern int BufTableInsert(BufferTag *tagPtr, int buf_id);
-extern void BufTableDelete(BufferTag *tagPtr);
-
/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
BlockNumber blockNum);
@@ -201,4 +193,8 @@ extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);
+/* bufmgr.c */
+extern Size BufMgrShmemSize(int size);
+extern void BufMgrInitShmem(int size);
+
#endif /* BUFMGR_INTERNALS_H */