summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andres Freund 2016-02-19 20:07:51 +0000
committer Andres Freund 2019-06-11 02:01:00 +0000
commit7c5df799aa12dc43f4d8bcef78120225cda990e0 (patch)
tree58c24f441033017298ae014ea925060b5ef79197
parent53094143e3c1fc9a8090cce66e73e26d58c67b93 (diff)
Rewrite background writer.bgwriter-rewrite
This currently consists of two major parts: 1) Add more statistics, to be able to even evaluate the effects of bgwriter changes / problems. This should probably be split into a separate commit. It's remarkable how odd the set of current measurements is, and how many different mechanisms for transporting those values we currently have. The patch adds and replaces a few measurements, but doesn't yet do enough cleanup (have fewer transport mechanisms, split into different views). 2) A new bgwriter implementation (that can be turned on by setting the bgwriter_legacy GUC to false). There's a few major differences: a) bgwriter performs the clock sweep - that makes it a lot easier to actually find buffers worthwhile to clean. It's quite possible to get into situations where the old bgwriter can't do anything for a while because all buffers have a usagecount > 0. b) When a buffer is encountered by bgwriter, after performing clock sweep, is clean and has usage/pin count of 0 (i.e. it can be reclaimed), then we also push it onto the queue. c) It just has a ringbuffer of clean buffers, that backends can drain. Bgwriter pushes (without any locks) entries onto the queue, backends can pop them off. d) The pacing logic is a lot simpler. There's a ringbuffer that bgwriter tries to fill. There's a low watermark that causes backends to wake up bgwriter.
-rw-r--r--src/backend/access/transam/xlog.c2
-rw-r--r--src/backend/catalog/system_views.sql25
-rw-r--r--src/backend/postmaster/bgwriter.c9
-rw-r--r--src/backend/postmaster/checkpointer.c38
-rw-r--r--src/backend/postmaster/pgstat.c20
-rw-r--r--src/backend/storage/buffer/buf_init.c22
-rw-r--r--src/backend/storage/buffer/bufmgr.c198
-rw-r--r--src/backend/storage/buffer/freelist.c234
-rw-r--r--src/backend/utils/adt/pgstatfuncs.c69
-rw-r--r--src/backend/utils/misc/guc.c13
-rw-r--r--src/include/catalog/pg_proc.dat74
-rw-r--r--src/include/pgstat.h49
-rw-r--r--src/include/postmaster/bgwriter.h3
-rw-r--r--src/include/storage/buf_internals.h30
-rw-r--r--src/include/storage/bufmgr.h4
-rw-r--r--src/test/regress/expected/rules.out19
16 files changed, 666 insertions, 143 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1c7dd51b9f..78c1d786fa 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8376,6 +8376,8 @@ LogCheckpointEnd(bool restartpoint)
BgWriterStats.m_checkpoint_sync_time +=
sync_secs * 1000 + sync_usecs / 1000;
+ BgWriterStats.m_buf_fsync_checkpointer += CheckpointStats.ckpt_sync_rels;
+
/*
* All of the published timing statistics are accounted for. Only
* continue if a log message is to be written.
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 78a103cdb9..d15aed10ad 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -898,12 +898,27 @@ CREATE VIEW pg_stat_bgwriter AS
pg_stat_get_bgwriter_requested_checkpoints() AS checkpoints_req,
pg_stat_get_checkpoint_write_time() AS checkpoint_write_time,
pg_stat_get_checkpoint_sync_time() AS checkpoint_sync_time,
- pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint,
- pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean,
+
+ pg_stat_get_buf_written_checkpoints() AS buffers_written_checkpoint,
+ pg_stat_get_buf_written_bgwriter() AS buffers_written_bgwriter,
+ pg_stat_get_buf_written_backend() AS buffers_written_backend,
+ pg_stat_get_buf_written_ring() AS buffers_written_ring,
+
+ pg_stat_get_buf_fsync_checkpointer() AS buffers_fsync_checkpointer,
+ pg_stat_get_buf_fsync_bgwriter() AS buffers_fsync_bgwriter,
+ pg_stat_get_buf_fsync_backend() AS buffers_fsync_backend,
+
+ pg_stat_get_buf_bgwriter_clean() AS buffers_bgwriter_clean,
+
+ pg_stat_get_buf_alloc_preclean() AS buffers_alloc_preclean,
+ pg_stat_get_buf_alloc_free() AS buffers_alloc_free,
+ pg_stat_get_buf_alloc_sweep() AS buffers_alloc_sweep,
+ pg_stat_get_buf_alloc_ring() AS buffers_alloc_ring,
+
+ pg_stat_get_buf_ticks_bgwriter() AS buffers_ticks_bgwriter,
+ pg_stat_get_buf_ticks_backend() AS buffers_ticks_backend,
+
pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean,
- pg_stat_get_buf_written_backend() AS buffers_backend,
- pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
- pg_stat_get_buf_alloc() AS buffers_alloc,
pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
CREATE VIEW pg_stat_progress_vacuum AS
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index e6b6c549de..526304fefc 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -65,6 +65,7 @@
* GUC parameters
*/
int BgWriterDelay = 200;
+bool BgWriterLegacy = true;
/*
* Multiplier to apply to BgWriterDelay when we decide to hibernate.
@@ -264,7 +265,10 @@ BackgroundWriterMain(void)
/*
* Do one cycle of dirty-buffer writing.
*/
- can_hibernate = BgBufferSync(&wb_context);
+ if (BgWriterLegacy)
+ can_hibernate = BgBufferSyncLegacy(&wb_context);
+ else
+ can_hibernate = BgBufferSyncNew(&wb_context);
/*
* Send off activity statistics to the stats collector
@@ -366,7 +370,8 @@ BackgroundWriterMain(void)
BgWriterDelay * HIBERNATE_FACTOR,
WAIT_EVENT_BGWRITER_HIBERNATE);
/* Reset the notification request in case we timed out */
- StrategyNotifyBgWriter(-1);
+ if (BgWriterLegacy)
+ StrategyNotifyBgWriter(-1);
}
prev_hibernate = can_hibernate;
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 13f152b473..e5ecca1e3d 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -102,7 +102,7 @@
* The requests array holds fsync requests sent by backends and not yet
* absorbed by the checkpointer.
*
- * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and
+ * Unlike the checkpoint fields, num_written_*, num_fsync_*, and
* the requests fields are protected by CheckpointerCommLock.
*----------
*/
@@ -127,8 +127,11 @@ typedef struct
ConditionVariable start_cv; /* signaled when ckpt_started advances */
ConditionVariable done_cv; /* signaled when ckpt_done advances */
- uint32 num_backend_writes; /* counts user backend buffer writes */
- uint32 num_backend_fsync; /* counts user backend fsync calls */
+ uint32 num_written_backend; /* counts user backend buffer writes */
+ uint32 num_written_ring; /* counts ring buffer writes */
+
+ uint32 num_fsync_bgwriter; /* counts bgwriter fsync calls */
+ uint32 num_fsync_backend; /* counts user backend fsync calls */
int num_requests; /* current # of requests */
int max_requests; /* allocated array size */
@@ -1119,7 +1122,7 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
/* Count all backend writes regardless of if they fit in the queue */
if (!AmBackgroundWriterProcess())
- CheckpointerShmem->num_backend_writes++;
+ CheckpointerShmem->num_written_backend++;
/*
* If the checkpointer isn't running or the request queue is full, the
@@ -1134,8 +1137,10 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
* Count the subset of writes where backends have to do their own
* fsync
*/
- if (!AmBackgroundWriterProcess())
- CheckpointerShmem->num_backend_fsync++;
+ if (AmBackgroundWriterProcess())
+ CheckpointerShmem->num_fsync_backend++;
+ else
+ CheckpointerShmem->num_fsync_bgwriter++;
LWLockRelease(CheckpointerCommLock);
return false;
}
@@ -1295,11 +1300,15 @@ AbsorbSyncRequests(void)
LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
/* Transfer stats counts into pending pgstats message */
- BgWriterStats.m_buf_written_backend += CheckpointerShmem->num_backend_writes;
- BgWriterStats.m_buf_fsync_backend += CheckpointerShmem->num_backend_fsync;
+ BgWriterStats.m_buf_written_backend += CheckpointerShmem->num_written_backend;
+ BgWriterStats.m_buf_written_ring += CheckpointerShmem->num_written_ring;
+ BgWriterStats.m_buf_fsync_backend += CheckpointerShmem->num_fsync_backend;
+ BgWriterStats.m_buf_fsync_bgwriter += CheckpointerShmem->num_fsync_bgwriter;
- CheckpointerShmem->num_backend_writes = 0;
- CheckpointerShmem->num_backend_fsync = 0;
+ CheckpointerShmem->num_written_backend = 0;
+ CheckpointerShmem->num_written_ring = 0;
+ CheckpointerShmem->num_fsync_backend = 0;
+ CheckpointerShmem->num_fsync_bgwriter = 0;
/*
* We try to avoid holding the lock for a long time by copying the request
@@ -1373,3 +1382,12 @@ FirstCallSinceLastCheckpoint(void)
return FirstCall;
}
+
+// FIXME: crappy API
+void
+ReportRingWrite(void)
+{
+ LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
+ CheckpointerShmem->num_written_ring++;
+ LWLockRelease(CheckpointerCommLock);
+}
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index b4f2b28b51..9aa7b9b813 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -6313,12 +6313,26 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
globalStats.requested_checkpoints += msg->m_requested_checkpoints;
globalStats.checkpoint_write_time += msg->m_checkpoint_write_time;
globalStats.checkpoint_sync_time += msg->m_checkpoint_sync_time;
+
globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
- globalStats.buf_written_clean += msg->m_buf_written_clean;
- globalStats.maxwritten_clean += msg->m_maxwritten_clean;
+ globalStats.buf_written_bgwriter += msg->m_buf_written_bgwriter;
globalStats.buf_written_backend += msg->m_buf_written_backend;
+ globalStats.buf_written_ring += msg->m_buf_written_ring;
+
+ globalStats.buf_fsync_checkpointer += msg->m_buf_fsync_checkpointer;
+ globalStats.buf_fsync_bgwriter += msg->m_buf_fsync_bgwriter;
globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
- globalStats.buf_alloc += msg->m_buf_alloc;
+
+ globalStats.buf_alloc_preclean += msg->m_buf_alloc_preclean;
+ globalStats.buf_alloc_free += msg->m_buf_alloc_free;
+ globalStats.buf_alloc_sweep += msg->m_buf_alloc_sweep;
+ globalStats.buf_alloc_ring += msg->m_buf_alloc_ring;
+
+ globalStats.buf_ticks_bgwriter += msg->m_buf_ticks_bgwriter;
+ globalStats.buf_ticks_backend += msg->m_buf_ticks_backend;
+
+ globalStats.buf_clean_bgwriter += msg->m_buf_clean_bgwriter;
+ globalStats.maxwritten_clean += msg->m_maxwritten_clean;
}
/* ----------
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index ccd2c31c0b..6154f75714 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -14,6 +14,7 @@
*/
#include "postgres.h"
+#include "lib/ringbuf.h"
#include "storage/bufmgr.h"
#include "storage/buf_internals.h"
@@ -23,6 +24,7 @@ char *BufferBlocks;
LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
WritebackContext BackendWritebackContext;
CkptSortItem *CkptBufferIds;
+ringbuf *VictimBuffers = NULL;
/*
@@ -70,7 +72,8 @@ InitBufferPool(void)
bool foundBufs,
foundDescs,
foundIOLocks,
- foundBufCkpt;
+ foundBufCkpt,
+ foundFreeBufs;
/* Align descriptors to a cacheline boundary. */
BufferDescriptors = (BufferDescPadded *)
@@ -91,6 +94,10 @@ InitBufferPool(void)
LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS, "buffer_io");
LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT, "buffer_content");
+ VictimBuffers = ShmemInitStruct("Free Buffers",
+ ringbuf_size(VICTIM_BUFFER_PRECLEAN_SIZE),
+ &foundFreeBufs);
+
/*
* The array used to sort to-be-checkpointed buffer ids is located in
* shared memory, to avoid having to allocate significant amounts of
@@ -102,10 +109,11 @@ InitBufferPool(void)
ShmemInitStruct("Checkpoint BufferIds",
NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
- if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
+ if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt || foundFreeBufs)
{
/* should find all of these, or none of them */
- Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
+ Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt && foundFreeBufs);
+
/* note: this path is only taken in EXEC_BACKEND case */
}
else
@@ -129,6 +137,7 @@ InitBufferPool(void)
/*
* Initially link all the buffers together as unused. Subsequent
* management of this list is done by freelist.c.
+ * FIXME: remove once legacy bgwriter is removed
*/
buf->freeNext = i + 1;
@@ -139,8 +148,10 @@ InitBufferPool(void)
LWTRANCHE_BUFFER_IO_IN_PROGRESS);
}
- /* Correct last entry of linked list */
+ /* Correct last entry of linked list: FIXME: remove */
GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
+ /* FIXME: could fill the first few free buffers? */
+ VictimBuffers = ringbuf_create(VictimBuffers, VICTIM_BUFFER_PRECLEAN_SIZE);
}
/* Init other shared buffer-management stuff */
@@ -189,5 +200,8 @@ BufferShmemSize(void)
/* size of checkpoint sort array in bufmgr.c */
size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
+ /* FIXME: better ringbuffer size */
+ size = add_size(size, ringbuf_size(VICTIM_BUFFER_PRECLEAN_SIZE));
+
return size;
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 7332e6b590..9d63244ba0 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -39,6 +39,7 @@
#include "catalog/storage.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
+#include "lib/ringbuf.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
@@ -101,7 +102,7 @@ typedef struct CkptTsStatus
/* already processed pages in this tablespace */
int num_scanned;
- /* current offset in CkptBufferIds for this tablespace */
+ /* currentCheckpointerShmem->num_written_ring offset in CkptBufferIds for this tablespace */
int index;
} CkptTsStatus;
@@ -866,11 +867,29 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
if (isExtend)
{
+ instr_time io_start,
+ io_time;
+
+ if (track_io_timing)
+ INSTR_TIME_SET_CURRENT(io_start);
+
/* new buffers are zero-filled */
MemSet((char *) bufBlock, 0, BLCKSZ);
+
+ if (track_io_timing)
+ INSTR_TIME_SET_CURRENT(io_start);
+
/* don't set checksum for all-zero page */
smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
+ if (track_io_timing)
+ {
+ INSTR_TIME_SET_CURRENT(io_time);
+ INSTR_TIME_SUBTRACT(io_time, io_start);
+ pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+ INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+ }
+
/*
* NB: we're *not* doing a ScheduleBufferTagForWriteback here;
* although we're essentially performing a write. At least on linux
@@ -1136,6 +1155,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
UnpinBuffer(buf, true);
continue;
}
+
+ // FIXME: crappy API
+ StrategyReportWrite(strategy, buf);
}
/* OK, do the I/O */
@@ -1352,6 +1374,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* trying to write it out. We have to let them finish before we can
* reclaim the buffer.
*
+ * FIXME: ^^^
+ *
* The buffer could get reclaimed by someone else while we are waiting
* to acquire the necessary locks; if so, don't mess it up.
*/
@@ -2038,7 +2062,119 @@ BufferSync(int flags)
}
/*
- * BgBufferSync -- Write out some dirty buffers in the pool.
+ * BgBufferSyncNew -- Write out some dirty buffers in the pool.
+ *
+ * This is called periodically by the background writer process.
+ *
+ * Returns true if it's appropriate for the bgwriter process to go into
+ * low-power hibernation mode.
+ */
+bool
+BgBufferSyncNew(WritebackContext *wb_context)
+{
+ uint32 recent_alloc_preclean;
+ uint32 recent_alloc_free;
+ uint32 recent_alloc_sweep;
+ uint32 recent_alloc_ring;
+ uint32 strategy_passes;
+ uint64 nticks;
+ uint64 nticks_sum = 0;
+
+ /* Make sure we can handle the pin inside SyncOneBuffer */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ /* Know where to start, and report buffer alloc counts to pgstat */
+ StrategySyncStart(&strategy_passes,
+ &recent_alloc_preclean,
+ &recent_alloc_free,
+ &recent_alloc_sweep,
+ &recent_alloc_ring,
+ &nticks);
+
+ /* Report buffer alloc counts to pgstat */
+ BgWriterStats.m_buf_alloc_preclean += recent_alloc_preclean;
+ BgWriterStats.m_buf_alloc_free += recent_alloc_free;
+ BgWriterStats.m_buf_alloc_sweep += recent_alloc_sweep;
+ BgWriterStats.m_buf_alloc_ring += recent_alloc_ring;
+ BgWriterStats.m_buf_ticks_backend += nticks;
+
+ /* go and populate freelist */
+ while (!ringbuf_full(VictimBuffers))
+ {
+ BufferDesc *bufHdr;
+ bool pushed;
+ bool dirty;
+ uint32 buf_state;
+
+ ReservePrivateRefCountEntry();
+
+ bufHdr = ClockSweep(NULL, &buf_state, &nticks);
+ nticks_sum += nticks;
+
+ dirty = buf_state & BM_DIRTY;
+
+ if (dirty)
+ {
+ SMgrRelation reln;
+ BufferTag tag;
+ LWLock *content_lock;
+
+
+ /*
+ * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
+ * buffer is clean by the time we've locked it.)
+ */
+ PinBuffer_Locked(bufHdr);
+
+ /* open relation before locking the page */
+ reln = smgropen(bufHdr->tag.rnode, InvalidBackendId);
+
+ content_lock = BufferDescriptorGetContentLock(bufHdr);
+
+ LWLockAcquire(content_lock, LW_SHARED);
+ FlushBuffer(bufHdr, reln);
+ LWLockRelease(content_lock);
+
+ /* copy tag before releasing pin */
+ tag = bufHdr->tag;
+
+ UnpinBuffer(bufHdr, true);
+
+ pushed = ringbuf_push(VictimBuffers, bufHdr);
+
+ Assert(wb_context);
+ ScheduleBufferTagForWriteback(wb_context, &tag);
+
+ BgWriterStats.m_buf_written_bgwriter++;
+ }
+ else
+ {
+ UnlockBufHdr(bufHdr, buf_state);
+ pushed = ringbuf_push(VictimBuffers, bufHdr);
+
+ BgWriterStats.m_buf_clean_bgwriter++;
+ }
+
+ /* full, shouldn't normally happen, we're the only writer */
+ if (!pushed)
+ break;
+
+ /* so we occasionally sleep, even if continually busy */
+ if (BgWriterStats.m_buf_written_bgwriter >= bgwriter_lru_maxpages)
+ {
+ BgWriterStats.m_maxwritten_clean++;
+ break;
+ }
+ }
+
+ BgWriterStats.m_buf_ticks_bgwriter += nticks_sum;
+
+ return BgWriterStats.m_buf_written_bgwriter == 0 &&
+ BgWriterStats.m_buf_clean_bgwriter == 0;
+}
+
+/*
+ * BgBufferSyncLegacy -- Write out some dirty buffers in the pool.
*
* This is called periodically by the background writer process.
*
@@ -2049,12 +2185,16 @@ BufferSync(int flags)
* bgwriter_lru_maxpages to 0.)
*/
bool
-BgBufferSync(WritebackContext *wb_context)
+BgBufferSyncLegacy(WritebackContext *wb_context)
{
/* info obtained from freelist.c */
int strategy_buf_id;
uint32 strategy_passes;
- uint32 recent_alloc;
+ uint32 recent_alloc_preclean;
+ uint32 recent_alloc_free;
+ uint32 recent_alloc_sweep;
+ uint32 recent_alloc_ring;
+ uint64 recent_ticks;
/*
* Information saved between calls so we can determine the strategy
@@ -2090,16 +2230,25 @@ BgBufferSync(WritebackContext *wb_context)
/* Variables for final smoothed_density update */
long new_strategy_delta;
- uint32 new_recent_alloc;
+ uint32 new_recent_alloc_sweep;
/*
* Find out where the freelist clock sweep currently is, and how many
* buffer allocations have happened since our last call.
*/
- strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
+ strategy_buf_id = StrategySyncStart(&strategy_passes,
+ &recent_alloc_preclean,
+ &recent_alloc_free,
+ &recent_alloc_sweep,
+ &recent_alloc_ring,
+ &recent_ticks);
/* Report buffer alloc counts to pgstat */
- BgWriterStats.m_buf_alloc += recent_alloc;
+ BgWriterStats.m_buf_alloc_preclean += recent_alloc_preclean;
+ BgWriterStats.m_buf_alloc_free += recent_alloc_free;
+ BgWriterStats.m_buf_alloc_sweep += recent_alloc_sweep;
+ BgWriterStats.m_buf_alloc_ring += recent_alloc_ring;
+ BgWriterStats.m_buf_ticks_backend += recent_ticks;
/*
* If we're not running the LRU scan, just stop after doing the stats
@@ -2196,9 +2345,9 @@ BgBufferSync(WritebackContext *wb_context)
*
* If the strategy point didn't move, we don't update the density estimate
*/
- if (strategy_delta > 0 && recent_alloc > 0)
+ if (strategy_delta > 0 && recent_alloc_sweep > 0)
{
- scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
+ scans_per_alloc = (float) strategy_delta / (float) recent_alloc_sweep;
smoothed_density += (scans_per_alloc - smoothed_density) /
smoothing_samples;
}
@@ -2216,10 +2365,10 @@ BgBufferSync(WritebackContext *wb_context)
* a true average we want a fast-attack, slow-decline behavior: we
* immediately follow any increase.
*/
- if (smoothed_alloc <= (float) recent_alloc)
- smoothed_alloc = recent_alloc;
+ if (smoothed_alloc <= (float) recent_alloc_sweep)
+ smoothed_alloc = recent_alloc_sweep;
else
- smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
+ smoothed_alloc += ((float) recent_alloc_sweep - smoothed_alloc) /
smoothing_samples;
/* Scale the estimate by a GUC to allow more aggressive tuning. */
@@ -2297,7 +2446,7 @@ BgBufferSync(WritebackContext *wb_context)
reusable_buffers++;
}
- BgWriterStats.m_buf_written_clean += num_written;
+ BgWriterStats.m_buf_written_bgwriter += num_written;
#ifdef BGW_DEBUG
elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
@@ -2317,22 +2466,22 @@ BgBufferSync(WritebackContext *wb_context)
* density estimates.
*/
new_strategy_delta = bufs_to_lap - num_to_scan;
- new_recent_alloc = reusable_buffers - reusable_buffers_est;
- if (new_strategy_delta > 0 && new_recent_alloc > 0)
+ new_recent_alloc_sweep = reusable_buffers - reusable_buffers_est;
+ if (new_strategy_delta > 0 && new_recent_alloc_sweep > 0)
{
- scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
+ scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc_sweep;
smoothed_density += (scans_per_alloc - smoothed_density) /
smoothing_samples;
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
- new_recent_alloc, new_strategy_delta,
+ new_recent_alloc_sweep, new_strategy_delta,
scans_per_alloc, smoothed_density);
#endif
}
/* Return true if OK to hibernate */
- return (bufs_to_lap == 0 && recent_alloc == 0);
+ return (bufs_to_lap == 0 && new_recent_alloc_sweep == 0);
}
/*
@@ -4321,6 +4470,8 @@ void
IssuePendingWritebacks(WritebackContext *context)
{
int i;
+ instr_time io_start,
+ io_time;
if (context->nr_pending == 0)
return;
@@ -4332,6 +4483,9 @@ IssuePendingWritebacks(WritebackContext *context)
qsort(&context->pending_writebacks, context->nr_pending,
sizeof(PendingWriteback), buffertag_comparator);
+ if (track_io_timing)
+ INSTR_TIME_SET_CURRENT(io_start);
+
/*
* Coalesce neighbouring writes, but nothing else. For that we iterate
* through the, now sorted, array of pending flushes, and look forward to
@@ -4381,6 +4535,14 @@ IssuePendingWritebacks(WritebackContext *context)
smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
}
+ if (track_io_timing)
+ {
+ INSTR_TIME_SET_CURRENT(io_time);
+ INSTR_TIME_SUBTRACT(io_time, io_start);
+ pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+ INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+ }
+
context->nr_pending = 0;
}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 06659ab265..6583f1c381 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,7 +15,9 @@
*/
#include "postgres.h"
+#include "lib/ringbuf.h"
#include "port/atomics.h"
+#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/proc.h"
@@ -51,7 +53,14 @@ typedef struct
* overflow during a single bgwriter cycle.
*/
uint32 completePasses; /* Complete cycles of the clock sweep */
- pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */
+
+ /* Buffers allocated since last reset */
+ pg_atomic_uint32 numBufferAllocsPreclean;
+ pg_atomic_uint32 numBufferAllocsFree;
+ pg_atomic_uint32 numBufferAllocsSweep;
+ pg_atomic_uint32 numBufferAllocsRing;
+
+ pg_atomic_uint64 numBufferTicksBackend;
/*
* Bgworker process to be notified upon activity or -1 if none. See
@@ -168,6 +177,62 @@ ClockSweepTick(void)
return victim;
}
+BufferDesc *
+ClockSweep(BufferAccessStrategy strategy, uint32 *buf_state, uint64 *nticks)
+{
+ BufferDesc *buf;
+ int trycounter;
+ uint32 local_buf_state; /* to avoid repeated (de-)referencing */
+ uint64 local_nticks = 0;
+
+ trycounter = NBuffers;
+ for (;;)
+ {
+
+ buf = GetBufferDescriptor(ClockSweepTick());
+ local_nticks++;
+
+ /*
+ * If the buffer is pinned or has a nonzero usage_count, we cannot use
+ * it; decrement the usage_count (unless pinned) and keep scanning.
+ */
+ local_buf_state = LockBufHdr(buf);
+
+ if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
+ {
+ if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
+ {
+ local_buf_state -= BUF_USAGECOUNT_ONE;
+
+ trycounter = NBuffers;
+ }
+ else
+ {
+ /* Found a usable buffer */
+ if (strategy != NULL)
+ AddBufferToRing(strategy, buf);
+ *buf_state = local_buf_state;
+ *nticks = local_nticks;
+
+ return buf;
+ }
+ }
+ else if (--trycounter == 0)
+ {
+ /*
+ * We've scanned all the buffers without making any state changes,
+ * so all the buffers are pinned (or were when we looked at them).
+ * We could hope that someone will free one eventually, but it's
+ * probably better to fail than to risk getting stuck in an
+ * infinite loop.
+ */
+ UnlockBufHdr(buf, local_buf_state);
+ elog(ERROR, "no unpinned buffers available");
+ }
+ UnlockBufHdr(buf, local_buf_state);
+ }
+}
+
/*
* have_free_buffer -- a lockless check to see if there is a free buffer in
* buffer pool.
@@ -202,8 +267,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
{
BufferDesc *buf;
int bgwprocno;
- int trycounter;
uint32 local_buf_state; /* to avoid repeated (de-)referencing */
+ uint64 nticks;
/*
* If given a strategy object, see whether it can select a buffer. We
@@ -229,27 +294,23 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
* some arbitrary process.
*/
bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
- if (bgwprocno != -1)
+ if (BgWriterLegacy)
{
- /* reset bgwprocno first, before setting the latch */
- StrategyControl->bgwprocno = -1;
+ if (bgwprocno != -1)
+ {
+ /* reset bgwprocno first, before setting the latch */
+ StrategyControl->bgwprocno = -1;
- /*
- * Not acquiring ProcArrayLock here which is slightly icky. It's
- * actually fine because procLatch isn't ever freed, so we just can
- * potentially set the wrong process' (or no process') latch.
- */
- SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+ /*
+ * Not acquiring ProcArrayLock here which is slightly icky. It's
+ * actually fine because procLatch isn't ever freed, so we just can
+ * potentially set the wrong process' (or no process') latch.
+ */
+ SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+ }
}
/*
- * We count buffer allocation requests so that the bgwriter can estimate
- * the rate of buffer consumption. Note that buffers recycled by a
- * strategy object are intentionally not counted here.
- */
- pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
-
- /*
* First check, without acquiring the lock, whether there's buffers in the
* freelist. Since we otherwise don't require the spinlock in every
* StrategyGetBuffer() invocation, it'd be sad to acquire it here -
@@ -302,6 +363,9 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
&& BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
{
+ // FIXME: possible to do outside of lock?
+ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocsFree, 1);
+
if (strategy != NULL)
AddBufferToRing(strategy, buf);
*buf_state = local_buf_state;
@@ -312,51 +376,81 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
}
}
- /* Nothing on the freelist, so run the "clock sweep" algorithm */
- trycounter = NBuffers;
- for (;;)
+ if (!BgWriterLegacy)
{
- buf = GetBufferDescriptor(ClockSweepTick());
+ int i = 0;
/*
- * If the buffer is pinned or has a nonzero usage_count, we cannot use
- * it; decrement the usage_count (unless pinned) and keep scanning.
+ * Try to get a buffer from the clean buffer list.
*/
- local_buf_state = LockBufHdr(buf);
-
- if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
+ while (!ringbuf_empty(VictimBuffers))
{
- if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
- {
- local_buf_state -= BUF_USAGECOUNT_ONE;
+ BufferDesc *buf;
+ bool found;
+ uint32 elements;
- trycounter = NBuffers;
+ found = ringbuf_pop(VictimBuffers, (void *)&buf);
+
+ /* If the ringbuffer is sufficiently depleted, wakeup the bgwriter. */
+ if (bgwprocno != -1 &&
+ (!found ||
+ (elements = ringbuf_elements(VictimBuffers)) < VICTIM_BUFFER_PRECLEAN_SIZE / 4))
+ {
+#if 0
+ if (!found)
+ elog(LOG, "signalling bgwriter: empty");
+ else
+ elog(LOG, "signalling bgwriter: watermark: %u %u/%u",
+ elements, VICTIM_BUFFER_PRECLEAN_SIZE / 4, VICTIM_BUFFER_PRECLEAN_SIZE);
+#endif
+ SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
}
- else
+
+ if (!found)
+ break;
+
+ /* check if the buffer is still unused, done if so */
+ local_buf_state = LockBufHdr(buf);
+ if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
+ && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
{
- /* Found a usable buffer */
+ // FIXME: possible to do outside of lock?
+ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocsPreclean, 1);
+
if (strategy != NULL)
AddBufferToRing(strategy, buf);
*buf_state = local_buf_state;
return buf;
}
+ else
+ {
+ UnlockBufHdr(buf, local_buf_state);
+ //ereport(LOG, (errmsg("buffer %u since reused (hand at %u)",
+ // buf->buf_id,
+ // pg_atomic_read_u32(&StrategyControl->nextVictimBuffer) % NBuffers),
+ // errhidestmt(true)));
+ }
+
+ i++;
}
- else if (--trycounter == 0)
- {
- /*
- * We've scanned all the buffers without making any state changes,
- * so all the buffers are pinned (or were when we looked at them).
- * We could hope that someone will free one eventually, but it's
- * probably better to fail than to risk getting stuck in an
- * infinite loop.
- */
- UnlockBufHdr(buf, local_buf_state);
- elog(ERROR, "no unpinned buffers available");
- }
- UnlockBufHdr(buf, local_buf_state);
+
+#if 0
+ ereport(LOG, (errmsg("ringbuf empty after %u cycles", i),
+ errhidestmt(true)));
+#endif
+
}
+
+ /* Nothing on the freelist, so run the "clock sweep" algorithm */
+ buf = ClockSweep(strategy, buf_state, &nticks);
+
+ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocsSweep, 1);
+ pg_atomic_fetch_add_u64(&StrategyControl->numBufferTicksBackend, nticks);
+
+ return buf;
}
+
/*
* StrategyFreeBuffer: put a buffer on the freelist
*/
@@ -381,18 +475,22 @@ StrategyFreeBuffer(BufferDesc *buf)
}
/*
- * StrategySyncStart -- tell BufferSync where to start syncing
+ * StrategySyncStart -- tell BgBufferSync where to start syncing
*
- * The result is the buffer index of the best buffer to sync first.
- * BufferSync() will proceed circularly around the buffer array from there.
+ * The result is the buffer index below the current clock-hand. BgBufferSync()
+ * will proceed circularly around the buffer array from there.
*
- * In addition, we return the completed-pass count (which is effectively
- * the higher-order bits of nextVictimBuffer) and the count of recent buffer
- * allocs if non-NULL pointers are passed. The alloc count is reset after
- * being read.
+ * In addition, we return the completed-pass count (which is effectively the
+ * higher-order bits of nextVictimBuffer) and the counts of recent buffer
+ * allocations. The allocation counts are reset after being read.
*/
int
-StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
+StrategySyncStart(uint32 *complete_passes,
+ uint32 *alloc_preclean,
+ uint32 *alloc_free,
+ uint32 *alloc_sweep,
+ uint32 *alloc_ring,
+ uint64 *ticks_backend)
{
uint32 nextVictimBuffer;
int result;
@@ -410,13 +508,16 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
* completePasses could be incremented. C.f. ClockSweepTick().
*/
*complete_passes += nextVictimBuffer / NBuffers;
- }
- if (num_buf_alloc)
- {
- *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
}
SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+
+ *alloc_preclean = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocsPreclean, 0);
+ *alloc_free = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocsFree, 0);
+ *alloc_sweep = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocsSweep, 0);
+ *alloc_ring = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocsRing, 0);
+ *ticks_backend = pg_atomic_exchange_u64(&StrategyControl->numBufferTicksBackend, 0);
+
return result;
}
@@ -517,7 +618,11 @@ StrategyInitialize(bool init)
/* Clear statistics */
StrategyControl->completePasses = 0;
- pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
+ pg_atomic_init_u32(&StrategyControl->numBufferAllocsPreclean, 0);
+ pg_atomic_init_u32(&StrategyControl->numBufferAllocsFree, 0);
+ pg_atomic_init_u32(&StrategyControl->numBufferAllocsSweep, 0);
+ pg_atomic_init_u32(&StrategyControl->numBufferAllocsRing, 0);
+ pg_atomic_init_u64(&StrategyControl->numBufferTicksBackend, 0);
/* No pending notification */
StrategyControl->bgwprocno = -1;
@@ -645,6 +750,9 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
{
+		/* FIXME: possible to do outside of lock? */
+ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocsRing, 1);
+
strategy->current_was_in_ring = true;
*buf_state = local_buf_state;
return buf;
@@ -702,3 +810,11 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
return true;
}
+
+void
+StrategyReportWrite(BufferAccessStrategy strategy,
+ BufferDesc *buf)
+{
+ if (strategy->current_was_in_ring)
+ ReportRingWrite();
+}
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 05240bfd14..d0d163ea35 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1604,15 +1604,45 @@ pg_stat_get_bgwriter_requested_checkpoints(PG_FUNCTION_ARGS)
}
Datum
-pg_stat_get_bgwriter_buf_written_checkpoints(PG_FUNCTION_ARGS)
+pg_stat_get_buf_written_checkpoints(PG_FUNCTION_ARGS)
{
PG_RETURN_INT64(pgstat_fetch_global()->buf_written_checkpoints);
}
Datum
-pg_stat_get_bgwriter_buf_written_clean(PG_FUNCTION_ARGS)
+pg_stat_get_buf_written_bgwriter(PG_FUNCTION_ARGS)
{
- PG_RETURN_INT64(pgstat_fetch_global()->buf_written_clean);
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_written_bgwriter);
+}
+
+Datum
+pg_stat_get_buf_written_backend(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_written_backend);
+}
+
+Datum
+pg_stat_get_buf_written_ring(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_written_ring);
+}
+
+Datum
+pg_stat_get_buf_ticks_bgwriter(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_ticks_bgwriter);
+}
+
+Datum
+pg_stat_get_buf_ticks_backend(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_ticks_backend);
+}
+
+Datum
+pg_stat_get_buf_bgwriter_clean(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_clean_bgwriter);
}
Datum
@@ -1641,10 +1671,17 @@ pg_stat_get_bgwriter_stat_reset_time(PG_FUNCTION_ARGS)
PG_RETURN_TIMESTAMPTZ(pgstat_fetch_global()->stat_reset_timestamp);
}
+/* FIXME: name */
Datum
-pg_stat_get_buf_written_backend(PG_FUNCTION_ARGS)
+pg_stat_get_buf_fsync_checkpointer(PG_FUNCTION_ARGS)
{
- PG_RETURN_INT64(pgstat_fetch_global()->buf_written_backend);
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_fsync_checkpointer);
+}
+
+Datum
+pg_stat_get_buf_fsync_bgwriter(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_INT64(pgstat_fetch_global()->buf_fsync_bgwriter);
}
Datum
@@ -1654,9 +1691,27 @@ pg_stat_get_buf_fsync_backend(PG_FUNCTION_ARGS)
}
Datum
-pg_stat_get_buf_alloc(PG_FUNCTION_ARGS)
+pg_stat_get_buf_alloc_preclean(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc_preclean);
+}
+
+Datum
+pg_stat_get_buf_alloc_free(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc_free);
+}
+
+Datum
+pg_stat_get_buf_alloc_sweep(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc_sweep);
+}
+
+Datum
+pg_stat_get_buf_alloc_ring(PG_FUNCTION_ARGS)
{
- PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc);
+ PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc_ring);
}
Datum
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 1208eb9a68..425d057a47 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1435,6 +1435,17 @@ static struct config_bool ConfigureNamesBool[] =
},
{
+ {"bgwriter_legacy", PGC_SIGHUP, RESOURCES_BGWRITER,
+ gettext_noop("Use legacy bgwriter algorithm."),
+ NULL,
+			0
+ },
+ &BgWriterLegacy,
+ true,
+ NULL, NULL, NULL
+ },
+
+ {
{"trace_notify", PGC_USERSET, DEVELOPER_OPTIONS,
gettext_noop("Generates debugging output for LISTEN and NOTIFY."),
NULL,
@@ -2734,7 +2745,7 @@ static struct config_int ConfigureNamesInt[] =
GUC_UNIT_MS
},
&BgWriterDelay,
- 200, 10, 10000,
+ 200, 1, 10000,
NULL, NULL, NULL
},
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 87335248a0..464e088c34 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5325,16 +5325,70 @@
proname => 'pg_stat_get_bgwriter_requested_checkpoints', provolatile => 's',
proparallel => 'r', prorettype => 'int8', proargtypes => '',
prosrc => 'pg_stat_get_bgwriter_requested_checkpoints' },
+
{ oid => '2771',
  descr => 'statistics: number of buffers written during checkpoints',
- proname => 'pg_stat_get_bgwriter_buf_written_checkpoints', provolatile => 's',
+ proname => 'pg_stat_get_buf_written_checkpoints', provolatile => 's',
proparallel => 'r', prorettype => 'int8', proargtypes => '',
- prosrc => 'pg_stat_get_bgwriter_buf_written_checkpoints' },
+ prosrc => 'pg_stat_get_buf_written_checkpoints' },
{ oid => '2772',
descr => 'statistics: number of buffers written by the bgwriter for cleaning dirty buffers',
- proname => 'pg_stat_get_bgwriter_buf_written_clean', provolatile => 's',
+ proname => 'pg_stat_get_buf_written_bgwriter', provolatile => 's',
+ proparallel => 'r', prorettype => 'int8', proargtypes => '',
+ prosrc => 'pg_stat_get_buf_written_bgwriter' },
+
+{ oid => '2775',
+ descr => 'statistics: number of buffers written by backends while cleaning dirty buffers',
+ proname => 'pg_stat_get_buf_written_backend', provolatile => 's',
+ proparallel => 'r', prorettype => 'int8', proargtypes => '',
+ prosrc => 'pg_stat_get_buf_written_backend' },
+{ oid => '270',
+ descr => 'statistics: number of buffers written by backends when recycling ring entries',
+ proname => 'pg_stat_get_buf_written_ring', provolatile => 's',
+ proparallel => 'r', prorettype => 'int8', proargtypes => '',
+ prosrc => 'pg_stat_get_buf_written_ring' },
+
+{ oid => '271',
+ descr => 'statistics: number of fsync requests processed by checkpointer',
+ proname => 'pg_stat_get_buf_fsync_checkpointer', provolatile => 's',
+ proparallel => 'r', prorettype => 'int8', proargtypes => '',
+ prosrc => 'pg_stat_get_buf_fsync_checkpointer' },
+{ oid => '272',
+ descr => 'statistics: number of bgwriter buffer writes that did their own fsync',
+ proname => 'pg_stat_get_buf_fsync_bgwriter', provolatile => 's',
+ proparallel => 'r', prorettype => 'int8', proargtypes => '',
+ prosrc => 'pg_stat_get_buf_fsync_bgwriter' },
+{ oid => '3063',
+ descr => 'statistics: number of backend writes that did their own fsync',
+ proname => 'pg_stat_get_buf_fsync_backend', provolatile => 's',
proparallel => 'r', prorettype => 'int8', proargtypes => '',
- prosrc => 'pg_stat_get_bgwriter_buf_written_clean' },
+ prosrc => 'pg_stat_get_buf_fsync_backend' },
+
+{ oid => '273', descr => 'statistics: number of reusable clean buffers discovered by bgwriter',
+ proname => 'pg_stat_get_buf_bgwriter_clean', provolatile => 's', proparallel => 'r',
+ prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_bgwriter_clean' },
+
+{ oid => '380', descr => 'statistics: number of backend buffer allocations via preclean list',
+ proname => 'pg_stat_get_buf_alloc_preclean', provolatile => 's', proparallel => 'r',
+ prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc_preclean' },
+{ oid => '2859', descr => 'statistics: number of backend buffer allocations via backend clock sweep',
+ proname => 'pg_stat_get_buf_alloc_sweep', provolatile => 's', proparallel => 'r',
+ prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc_sweep' },
+{ oid => '381', descr => 'statistics: number of backend buffer allocations via ring buffer',
+ proname => 'pg_stat_get_buf_alloc_ring', provolatile => 's', proparallel => 'r',
+ prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc_ring' },
+{ oid => '421', descr => 'statistics: number of backend buffer allocations via free list',
+ proname => 'pg_stat_get_buf_alloc_free', provolatile => 's', proparallel => 'r',
+ prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc_free' },
+
+{ oid => '560', descr => 'statistics: number of clock sweep ticks by bgwriter',
+ proname => 'pg_stat_get_buf_ticks_bgwriter', provolatile => 's', proparallel => 'r',
+ prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_ticks_bgwriter' },
+{ oid => '561', descr => 'statistics: number of clock sweep ticks by backend',
+ proname => 'pg_stat_get_buf_ticks_backend', provolatile => 's', proparallel => 'r',
+ prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_ticks_backend' },
+
+
{ oid => '2773',
descr => 'statistics: number of times the bgwriter stopped processing when it had written too many buffers while cleaning',
proname => 'pg_stat_get_bgwriter_maxwritten_clean', provolatile => 's',
@@ -5354,18 +5408,6 @@
proname => 'pg_stat_get_checkpoint_sync_time', provolatile => 's',
proparallel => 'r', prorettype => 'float8', proargtypes => '',
prosrc => 'pg_stat_get_checkpoint_sync_time' },
-{ oid => '2775', descr => 'statistics: number of buffers written by backends',
- proname => 'pg_stat_get_buf_written_backend', provolatile => 's',
- proparallel => 'r', prorettype => 'int8', proargtypes => '',
- prosrc => 'pg_stat_get_buf_written_backend' },
-{ oid => '3063',
- descr => 'statistics: number of backend buffer writes that did their own fsync',
- proname => 'pg_stat_get_buf_fsync_backend', provolatile => 's',
- proparallel => 'r', prorettype => 'int8', proargtypes => '',
- prosrc => 'pg_stat_get_buf_fsync_backend' },
-{ oid => '2859', descr => 'statistics: number of buffer allocations',
- proname => 'pg_stat_get_buf_alloc', provolatile => 's', proparallel => 'r',
- prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc' },
{ oid => '2978', descr => 'statistics: number of function calls',
proname => 'pg_stat_get_function_calls', provolatile => 's',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 0a3ad3a188..54c4765fb1 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -413,14 +413,30 @@ typedef struct PgStat_MsgBgWriter
PgStat_Counter m_timed_checkpoints;
PgStat_Counter m_requested_checkpoints;
+ PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */
+ PgStat_Counter m_checkpoint_sync_time;
+
PgStat_Counter m_buf_written_checkpoints;
- PgStat_Counter m_buf_written_clean;
- PgStat_Counter m_maxwritten_clean;
+ PgStat_Counter m_buf_written_bgwriter;
PgStat_Counter m_buf_written_backend;
+ PgStat_Counter m_buf_written_ring;
+
+ PgStat_Counter m_buf_fsync_checkpointer;
+ PgStat_Counter m_buf_fsync_bgwriter;
PgStat_Counter m_buf_fsync_backend;
- PgStat_Counter m_buf_alloc;
- PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */
- PgStat_Counter m_checkpoint_sync_time;
+
+ PgStat_Counter m_buf_clean_bgwriter;
+
+ PgStat_Counter m_buf_alloc_preclean;
+ PgStat_Counter m_buf_alloc_free;
+ PgStat_Counter m_buf_alloc_sweep;
+ PgStat_Counter m_buf_alloc_ring;
+
+ PgStat_Counter m_buf_ticks_bgwriter;
+ PgStat_Counter m_buf_ticks_backend;
+
+ PgStat_Counter m_maxwritten_clean;
+
} PgStat_MsgBgWriter;
/* ----------
@@ -699,16 +715,33 @@ typedef struct PgStat_ArchiverStats
typedef struct PgStat_GlobalStats
{
TimestampTz stats_timestamp; /* time of stats file update */
+
PgStat_Counter timed_checkpoints;
PgStat_Counter requested_checkpoints;
PgStat_Counter checkpoint_write_time; /* times in milliseconds */
PgStat_Counter checkpoint_sync_time;
+
PgStat_Counter buf_written_checkpoints;
- PgStat_Counter buf_written_clean;
- PgStat_Counter maxwritten_clean;
+ PgStat_Counter buf_written_bgwriter;
PgStat_Counter buf_written_backend;
+ PgStat_Counter buf_written_ring;
+
+ PgStat_Counter buf_fsync_checkpointer;
+ PgStat_Counter buf_fsync_bgwriter;
PgStat_Counter buf_fsync_backend;
- PgStat_Counter buf_alloc;
+
+ PgStat_Counter buf_clean_bgwriter;
+
+ PgStat_Counter buf_alloc_preclean;
+ PgStat_Counter buf_alloc_free;
+ PgStat_Counter buf_alloc_sweep;
+ PgStat_Counter buf_alloc_ring;
+
+ PgStat_Counter buf_ticks_bgwriter;
+ PgStat_Counter buf_ticks_backend;
+
+ PgStat_Counter maxwritten_clean;
+
TimestampTz stat_reset_timestamp;
} PgStat_GlobalStats;
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 630366f49e..892e24e083 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -26,6 +26,7 @@ extern int BgWriterDelay;
extern int CheckPointTimeout;
extern int CheckPointWarning;
extern double CheckPointCompletionTarget;
+extern bool BgWriterLegacy;
extern void BackgroundWriterMain(void) pg_attribute_noreturn();
extern void CheckpointerMain(void) pg_attribute_noreturn();
@@ -40,6 +41,8 @@ extern void AbsorbSyncRequests(void);
extern Size CheckpointerShmemSize(void);
extern void CheckpointerShmemInit(void);
+extern void ReportRingWrite(void);
+
extern bool FirstCallSinceLastCheckpoint(void);
#endif /* _BGWRITER_H */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index df2dda7e7e..1b58b1db0d 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -142,7 +142,7 @@ typedef struct buftag
* single atomic operation, without actually acquiring and releasing spinlock;
* for instance, increase or decrease refcount. buf_id field never changes
* after initialization, so does not need locking. freeNext is protected by
- * the buffer_strategy_lock not buffer header lock. The LWLock can take care
+ * the buffer_strategy_lock not buffer header lock (XXX: remove). The LWLock can take care
* of itself. The buffer header lock is *not* used to control access to the
* data in the buffer!
*
@@ -184,7 +184,9 @@ typedef struct BufferDesc
pg_atomic_uint32 state;
int wait_backend_pid; /* backend PID of pin-count waiter */
- int freeNext; /* link in freelist chain */
+
+ /* link in freelist chain: only used with legacy bgwriter */
+ int freeNext;
LWLock content_lock; /* to lock access to buffer contents */
} BufferDesc;
@@ -232,11 +234,19 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
/*
* The freeNext field is either the index of the next freelist entry,
* or one of these special values:
+ * XXX: Remove when removing legacy bgwriter
*/
#define FREENEXT_END_OF_LIST (-1)
#define FREENEXT_NOT_IN_LIST (-2)
/*
+ * FIXME: Probably needs to depend on NBuffers or such.
+ */
+
+/* size of buffer free list */
+#define VICTIM_BUFFER_PRECLEAN_SIZE 4096
+
+/*
* Functions for acquiring/releasing a shared buffer header's spinlock. Do
* not apply these to local buffers!
*/
@@ -274,6 +284,7 @@ typedef struct WritebackContext
/* in buf_init.c */
extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
extern PGDLLIMPORT WritebackContext BackendWritebackContext;
+extern PGDLLIMPORT struct ringbuf *VictimBuffers;
/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;
@@ -306,13 +317,24 @@ extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
/* freelist.c */
-extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
+extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
uint32 *buf_state);
+extern BufferDesc *ClockSweep(BufferAccessStrategy strategy,
+ uint32 *buf_state, uint64 *nticks);
+
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
BufferDesc *buf);
-extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
+extern void StrategyReportWrite(BufferAccessStrategy strategy,
+ BufferDesc *buf);
+
+extern int StrategySyncStart(uint32 *complete_passes,
+ uint32 *alloc_preclean,
+ uint32 *alloc_free,
+ uint32 *alloc_sweep,
+ uint32 *alloc_ring,
+ uint64 *ticks_backend);
extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 509f4b7ef1..9957b9c8c2 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -221,7 +221,9 @@ extern bool HoldingBufferPinThatDelaysRecovery(void);
extern void AbortBufferIO(void);
extern void BufmgrCommit(void);
-extern bool BgBufferSync(struct WritebackContext *wb_context);
+
+extern bool BgBufferSyncNew(struct WritebackContext *wb_context);
+extern bool BgBufferSyncLegacy(struct WritebackContext *wb_context);
extern void AtProcExit_LocalBuffers(void);
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 7d365c48d1..da436d982a 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1796,12 +1796,21 @@ pg_stat_bgwriter| SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints
pg_stat_get_bgwriter_requested_checkpoints() AS checkpoints_req,
pg_stat_get_checkpoint_write_time() AS checkpoint_write_time,
pg_stat_get_checkpoint_sync_time() AS checkpoint_sync_time,
- pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint,
- pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean,
+ pg_stat_get_buf_written_checkpoints() AS buffers_written_checkpoint,
+ pg_stat_get_buf_written_bgwriter() AS buffers_written_bgwriter,
+ pg_stat_get_buf_written_backend() AS buffers_written_backend,
+ pg_stat_get_buf_written_ring() AS buffers_written_ring,
+ pg_stat_get_buf_fsync_checkpointer() AS buffers_fsync_checkpointer,
+ pg_stat_get_buf_fsync_bgwriter() AS buffers_fsync_bgwriter,
+ pg_stat_get_buf_fsync_backend() AS buffers_fsync_backend,
+ pg_stat_get_buf_bgwriter_clean() AS buffers_bgwriter_clean,
+ pg_stat_get_buf_alloc_preclean() AS buffers_alloc_preclean,
+ pg_stat_get_buf_alloc_free() AS buffers_alloc_free,
+ pg_stat_get_buf_alloc_sweep() AS buffers_alloc_sweep,
+ pg_stat_get_buf_alloc_ring() AS buffers_alloc_ring,
+ pg_stat_get_buf_ticks_bgwriter() AS buffers_ticks_bgwriter,
+ pg_stat_get_buf_ticks_backend() AS buffers_ticks_backend,
pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean,
- pg_stat_get_buf_written_backend() AS buffers_backend,
- pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
- pg_stat_get_buf_alloc() AS buffers_alloc,
pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
pg_stat_database| SELECT d.oid AS datid,
d.datname,