bufmgr.c (PostgreSQL source, git master)
1/*-------------------------------------------------------------------------
2 *
3 * bufmgr.c
4 * buffer manager interface routines
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/buffer/bufmgr.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * Principal entry points:
17 *
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
21 *
22 * StartReadBuffer() -- as above, with separate wait step
23 * StartReadBuffers() -- multiple block version
24 * WaitReadBuffers() -- second step of above
25 *
26 * ReleaseBuffer() -- unpin a buffer
27 *
28 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 * The disk write is delayed until buffer replacement or checkpoint.
30 *
31 * See also these files:
32 * freelist.c -- chooses victim for buffer replacement
33 * buf_table.c -- manages the buffer lookup table
34 */
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#ifdef USE_ASSERT_CHECKING
44#include "catalog/pg_tablespace_d.h"
45#endif
46#include "catalog/storage.h"
48#include "executor/instrument.h"
49#include "lib/binaryheap.h"
50#include "miscadmin.h"
51#include "pg_trace.h"
52#include "pgstat.h"
53#include "postmaster/bgwriter.h"
54#include "storage/aio.h"
55#include "storage/buf_internals.h"
56#include "storage/bufmgr.h"
57#include "storage/fd.h"
58#include "storage/ipc.h"
59#include "storage/lmgr.h"
60#include "storage/proc.h"
61#include "storage/read_stream.h"
62#include "storage/smgr.h"
63#include "storage/standby.h"
64#include "utils/memdebug.h"
65#include "utils/ps_status.h"
66#include "utils/rel.h"
67#include "utils/resowner.h"
68#include "utils/timestamp.h"
69
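
/*
 * Illustrative sketch, not part of bufmgr.c itself: the typical caller-side
 * pattern behind the principal entry points listed in the header comment.
 * "rel" and "blkno" are assumed inputs; a real caller must also WAL-log the
 * modification while holding the content lock.
 */
static void
example_read_modify_release(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	/* find or create a buffer holding the block, and pin it */
	buf = ReadBuffer(rel, blkno);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);

	/* ... modify "page" and WAL-log the change here ... */
	MarkBufferDirty(buf);	/* write-out deferred to replacement/checkpoint */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buf);		/* drop the pin */
}
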
70
71/* Note: these two macros only work on shared buffers, not local ones! */
72#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
73#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
74
75/* Note: this macro only works on local buffers, not shared ones! */
76#define LocalBufHdrGetBlock(bufHdr) \
77 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
78
79/* Bits in SyncOneBuffer's return value */
80#define BUF_WRITTEN 0x01
81#define BUF_REUSABLE 0x02
82
83#define RELS_BSEARCH_THRESHOLD 20
84
85/*
 86 * This is the size (in number of blocks) above which we scan the entire
 87 * buffer pool to remove the buffers for all the pages of the relation
 88 * being dropped. For relations smaller than this threshold, we instead find
 89 * the buffers by doing lookups in the BufMapping table.
90 */
91#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
92
93typedef struct PrivateRefCountEntry
94{
 95 Buffer buffer;
 96 int32 refcount;
97} PrivateRefCountEntry;
 98
99/* 64 bytes, about the size of a cache line on common systems */
100#define REFCOUNT_ARRAY_ENTRIES 8
101
102/*
103 * Status of buffers to checkpoint for a particular tablespace, used
104 * internally in BufferSync.
105 */
106typedef struct CkptTsStatus
107{
108 /* oid of the tablespace */
 109 Oid tsId;
 110
111 /*
112 * Checkpoint progress for this tablespace. To make progress comparable
113 * between tablespaces the progress is, for each tablespace, measured as a
114 * number between 0 and the total number of to-be-checkpointed pages. Each
115 * page checkpointed in this tablespace increments this space's progress
116 * by progress_slice.
117 */
 118 double progress;
 119 double progress_slice;
 120
 121 /* number of to-be checkpointed pages in this tablespace */
 122 int num_to_scan;
 123 /* already processed pages in this tablespace */
 124 int num_scanned;
 125
126 /* current offset in CkptBufferIds for this tablespace */
127 int index;
128} CkptTsStatus;
 129
130/*
131 * Type for array used to sort SMgrRelations
132 *
133 * FlushRelationsAllBuffers shares the same comparator function with
134 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
135 * compatible.
136 */
137typedef struct SMgrSortArray
138{
139 RelFileLocator rlocator; /* This must be the first member */
 140 SMgrRelation srel;
141} SMgrSortArray;
 142
143/* GUC variables */
144bool zero_damaged_pages = false;
145int bgwriter_lru_maxpages = 100;
146double bgwriter_lru_multiplier = 2.0;
147bool track_io_timing = false;
148
149/*
150 * How many buffers PrefetchBuffer callers should try to stay ahead of their
151 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
152 * for buffers not belonging to tablespaces that have their
153 * effective_io_concurrency parameter set.
154 */
155int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
 156
157/*
158 * Like effective_io_concurrency, but used by maintenance code paths that might
159 * benefit from a higher setting because they work on behalf of many sessions.
160 * Overridden by the tablespace setting of the same name.
161 */
162int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
 163
164/*
165 * Limit on how many blocks should be handled in single I/O operations.
166 * StartReadBuffers() callers should respect it, as should other operations
167 * that call smgr APIs directly. It is computed as the minimum of underlying
168 * GUCs io_combine_limit_guc and io_max_combine_limit.
169 */
170int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
171int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
172int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
 173
174/*
175 * GUC variables about triggering kernel writeback for buffers written; OS
176 * dependent defaults are set via the GUC mechanism.
177 */
178int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
179int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
180int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
 181
182/* local state for LockBufferForCleanup */
183static BufferDesc *PinCountWaitBuf = NULL;
 184
185/*
186 * Backend-Private refcount management:
187 *
188 * Each buffer also has a private refcount that keeps track of the number of
189 * times the buffer is pinned in the current process. This is so that the
190 * shared refcount needs to be modified only once if a buffer is pinned more
191 * than once by an individual backend. It's also used to check that no buffers
192 * are still pinned at the end of transactions and when exiting.
193 *
194 *
195 * To avoid - as we used to - requiring an array with NBuffers entries to keep
196 * track of local buffers, we use a small sequentially searched array
197 * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
198 * keep track of backend local pins.
199 *
200 * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
201 * refcounts are kept track of in the array; after that, new array entries
202 * displace old ones into the hash table. That way a frequently used entry
203 * can't get "stuck" in the hashtable while infrequent ones clog the array.
204 *
205 * Note that in most scenarios the number of pinned buffers will not exceed
206 * REFCOUNT_ARRAY_ENTRIES.
207 *
208 *
209 * To enter a buffer into the refcount tracking mechanism first reserve a free
210 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
211 * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
212 * memory allocations in NewPrivateRefCountEntry() which can be important
213 * because in some scenarios it's called with a spinlock held...
214 */
215static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
216static HTAB *PrivateRefCountHash = NULL;
217static int32 PrivateRefCountOverflowed = 0;
218static uint32 PrivateRefCountClock = 0;
219static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
 220
221static uint32 MaxProportionalPins;
 222
223static void ReservePrivateRefCountEntry(void);
224static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
225static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
226static inline int32 GetPrivateRefCount(Buffer buffer);
227static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
 228
229/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
230static void ResOwnerReleaseBufferIO(Datum res);
231static char *ResOwnerPrintBufferIO(Datum res);
232static void ResOwnerReleaseBufferPin(Datum res);
233static char *ResOwnerPrintBufferPin(Datum res);
234
235static const ResourceOwnerDesc buffer_io_resowner_desc =
236{
237 .name = "buffer io",
238 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
239 .release_priority = RELEASE_PRIO_BUFFER_IOS,
240 .ReleaseResource = ResOwnerReleaseBufferIO,
241 .DebugPrint = ResOwnerPrintBufferIO
242};
243
244static const ResourceOwnerDesc buffer_pin_resowner_desc =
245{
246 .name = "buffer pin",
247 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
248 .release_priority = RELEASE_PRIO_BUFFER_PINS,
249 .ReleaseResource = ResOwnerReleaseBufferPin,
250 .DebugPrint = ResOwnerPrintBufferPin
251};
252
253/*
254 * Ensure that the PrivateRefCountArray has sufficient space to store one more
255 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
256 * a new entry - but it's perfectly fine to not use a reserved entry.
257 */
258static void
259ReservePrivateRefCountEntry(void)
260{
261 /* Already reserved (or freed), nothing to do */
262 if (ReservedRefCountEntry != NULL)
263 return;
264
265 /*
 266 * First search for a free entry in the array, that'll be sufficient in the
267 * majority of cases.
268 */
269 {
270 int i;
271
272 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
273 {
 274 PrivateRefCountEntry *res;
 275
276 res = &PrivateRefCountArray[i];
277
278 if (res->buffer == InvalidBuffer)
279 {
 280 ReservedRefCountEntry = res;
 281
282 }
283 }
284 }
285
286 /*
287 * No luck. All array entries are full. Move one array entry into the hash
288 * table.
289 */
290 {
291 /*
292 * Move entry from the current clock position in the array into the
293 * hashtable. Use that slot.
294 */
295 PrivateRefCountEntry *hashent;
296 bool found;
297
 298 /* select victim slot */
 299 ReservedRefCountEntry =
 300 &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
 301
 302 /* Better be used, otherwise we shouldn't get here. */
 303 Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
 304
 305 /* enter victim array entry into hashtable */
 306 hashent = hash_search(PrivateRefCountHash,
 307 &(ReservedRefCountEntry->buffer),
 308 HASH_ENTER,
 309 &found);
 310 Assert(!found);
 311 hashent->refcount = ReservedRefCountEntry->refcount;
 312
 313 /* clear the now free array slot */
 314 ReservedRefCountEntry->buffer = InvalidBuffer;
 315 ReservedRefCountEntry->refcount = 0;
 316
 317 PrivateRefCountOverflowed++;
 318
319}
320
321/*
322 * Fill a previously reserved refcount entry.
323 */
324static PrivateRefCountEntry *
325NewPrivateRefCountEntry(Buffer buffer)
326{
 327 PrivateRefCountEntry *res;
 328
329 /* only allowed to be called when a reservation has been made */
 330 Assert(ReservedRefCountEntry != NULL);
 331
332 /* use up the reserved entry */
 333 res = ReservedRefCountEntry;
 334 ReservedRefCountEntry = NULL;
 335
336 /* and fill it */
337 res->buffer = buffer;
338 res->refcount = 0;
339
340 return res;
341}
342
343/*
344 * Return the PrivateRefCount entry for the passed buffer.
345 *
346 * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
347 * do_move is true, and the entry resides in the hashtable the entry is
348 * optimized for frequent access by moving it to the array.
349 */
350static PrivateRefCountEntry *
351GetPrivateRefCountEntry(Buffer buffer, bool do_move)
352{
 353 PrivateRefCountEntry *res;
 354 int i;
355
 356 Assert(BufferIsValid(buffer));
 357 Assert(!BufferIsLocal(buffer));
 358
359 /*
360 * First search for references in the array, that'll be sufficient in the
361 * majority of cases.
362 */
363 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
364 {
365 res = &PrivateRefCountArray[i];
366
367 if (res->buffer == buffer)
368 return res;
369 }
370
371 /*
372 * By here we know that the buffer, if already pinned, isn't residing in
373 * the array.
374 *
375 * Only look up the buffer in the hashtable if we've previously overflowed
376 * into it.
377 */
 378 if (PrivateRefCountOverflowed == 0)
 379 return NULL;
380
 381 res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
 382
383 if (res == NULL)
384 return NULL;
385 else if (!do_move)
386 {
387 /* caller doesn't want us to move the hash entry into the array */
388 return res;
389 }
390 else
391 {
392 /* move buffer from hashtable into the free array slot */
393 bool found;
 394 PrivateRefCountEntry *free;
 395
396 /* Ensure there's a free array slot */
 397 ReservePrivateRefCountEntry();
 398
399 /* Use up the reserved slot */
 400 Assert(ReservedRefCountEntry != NULL);
 401 free = ReservedRefCountEntry;
 402 ReservedRefCountEntry = NULL;
 403 Assert(free->buffer == InvalidBuffer);
404
405 /* and fill it */
406 free->buffer = buffer;
407 free->refcount = res->refcount;
408
409 /* delete from hashtable */
 410 hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
 411 Assert(found);
 412 Assert(PrivateRefCountOverflowed > 0);
 413 PrivateRefCountOverflowed--;
 414
415 return free;
416 }
417}
418
419/*
420 * Returns how many times the passed buffer is pinned by this backend.
421 *
422 * Only works for shared memory buffers!
423 */
424static inline int32
425GetPrivateRefCount(Buffer buffer)
426{
 427 PrivateRefCountEntry *ref;
 428
 429 Assert(BufferIsValid(buffer));
 430 Assert(!BufferIsLocal(buffer));
 431
432 /*
433 * Not moving the entry - that's ok for the current users, but we might
434 * want to change this one day.
435 */
436 ref = GetPrivateRefCountEntry(buffer, false);
437
438 if (ref == NULL)
439 return 0;
440 return ref->refcount;
441}
442
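
/*
 * Illustrative sketch, not part of bufmgr.c itself: the reserve-then-fill
 * protocol for backend-private pin tracking described in the "Backend-Private
 * refcount management" comment above.  The buffer-header locking that a real
 * caller (e.g. PinBuffer()) performs in between is elided here.
 */
static void
example_track_private_pin(Buffer buffer)
{
	PrivateRefCountEntry *ref;

	/*
	 * Reserve while no spinlock is held; this may push an old array entry
	 * into PrivateRefCountHash, but never allocates later on.
	 */
	ReservePrivateRefCountEntry();

	/* ... lock the buffer header, bump the shared refcount, unlock ... */

	ref = GetPrivateRefCountEntry(buffer, true);
	if (ref == NULL)
		ref = NewPrivateRefCountEntry(buffer);
	ref->refcount++;
}
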
443/*
444 * Release resources used to track the reference count of a buffer which we no
445 * longer have pinned and don't want to pin again immediately.
446 */
447static void
448ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
449{
450 Assert(ref->refcount == 0);
451
452 if (ref >= &PrivateRefCountArray[0] &&
 453 ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
 454 {
455 ref->buffer = InvalidBuffer;
456
457 /*
458 * Mark the just used entry as reserved - in many scenarios that
459 * allows us to avoid ever having to search the array/hash for free
460 * entries.
461 */
 462 ReservedRefCountEntry = ref;
 463 }
464 else
465 {
466 bool found;
467 Buffer buffer = ref->buffer;
468
 469 hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
 470 Assert(found);
 471 Assert(PrivateRefCountOverflowed > 0);
 472 PrivateRefCountOverflowed--;
 473 }
474}
475
476/*
477 * BufferIsPinned
478 * True iff the buffer is pinned (also checks for valid buffer number).
479 *
480 * NOTE: what we check here is that *this* backend holds a pin on
481 * the buffer. We do not care whether some other backend does.
482 */
483#define BufferIsPinned(bufnum) \
484( \
485 !BufferIsValid(bufnum) ? \
486 false \
487 : \
488 BufferIsLocal(bufnum) ? \
489 (LocalRefCount[-(bufnum) - 1] > 0) \
490 : \
491 (GetPrivateRefCount(bufnum) > 0) \
492)
493
494
495static Buffer ReadBuffer_common(Relation rel,
 496 SMgrRelation smgr, char smgr_persistence,
 497 ForkNumber forkNum, BlockNumber blockNum,
 498 ReadBufferMode mode, BufferAccessStrategy strategy);
499static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
 500 ForkNumber fork,
501 BufferAccessStrategy strategy,
502 uint32 flags,
503 uint32 extend_by,
504 BlockNumber extend_upto,
505 Buffer *buffers,
506 uint32 *extended_by);
507static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
 508 ForkNumber fork,
509 BufferAccessStrategy strategy,
510 uint32 flags,
511 uint32 extend_by,
512 BlockNumber extend_upto,
513 Buffer *buffers,
514 uint32 *extended_by);
515static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
516static void PinBuffer_Locked(BufferDesc *buf);
517static void UnpinBuffer(BufferDesc *buf);
518static void UnpinBufferNoOwner(BufferDesc *buf);
519static void BufferSync(int flags);
521static int SyncOneBuffer(int buf_id, bool skip_recently_used,
522 WritebackContext *wb_context);
523static void WaitIO(BufferDesc *buf);
524static void AbortBufferIO(Buffer buffer);
525static void shared_buffer_write_error_callback(void *arg);
526static void local_buffer_write_error_callback(void *arg);
527static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
528 char relpersistence,
529 ForkNumber forkNum,
530 BlockNumber blockNum,
531 BufferAccessStrategy strategy,
532 bool *foundPtr, IOContext io_context);
533static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
534static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
535static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
536static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
537 IOObject io_object, IOContext io_context);
538static void FindAndDropRelationBuffers(RelFileLocator rlocator,
539 ForkNumber forkNum,
540 BlockNumber nForkBlock,
541 BlockNumber firstDelBlock);
542static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
 543 RelFileLocator dstlocator,
544 ForkNumber forkNum, bool permanent);
545static void AtProcExit_Buffers(int code, Datum arg);
546static void CheckForBufferLeaks(void);
547#ifdef USE_ASSERT_CHECKING
548static void AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
549 void *unused_context);
550#endif
551static int rlocator_comparator(const void *p1, const void *p2);
552static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
553static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
554static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
555
556
557/*
558 * Implementation of PrefetchBuffer() for shared buffers.
559 */
560PrefetchBufferResult
561PrefetchSharedBuffer(SMgrRelation smgr_reln,
 562 ForkNumber forkNum,
563 BlockNumber blockNum)
564{
565 PrefetchBufferResult result = {InvalidBuffer, false};
566 BufferTag newTag; /* identity of requested block */
567 uint32 newHash; /* hash value for newTag */
568 LWLock *newPartitionLock; /* buffer partition lock for it */
569 int buf_id;
570
571 Assert(BlockNumberIsValid(blockNum));
572
573 /* create a tag so we can lookup the buffer */
574 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
575 forkNum, blockNum);
576
577 /* determine its hash code and partition lock ID */
578 newHash = BufTableHashCode(&newTag);
579 newPartitionLock = BufMappingPartitionLock(newHash);
580
581 /* see if the block is in the buffer pool already */
582 LWLockAcquire(newPartitionLock, LW_SHARED);
583 buf_id = BufTableLookup(&newTag, newHash);
584 LWLockRelease(newPartitionLock);
585
586 /* If not in buffers, initiate prefetch */
587 if (buf_id < 0)
588 {
589#ifdef USE_PREFETCH
590 /*
591 * Try to initiate an asynchronous read. This returns false in
592 * recovery if the relation file doesn't exist.
593 */
594 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
595 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
596 {
597 result.initiated_io = true;
598 }
599#endif /* USE_PREFETCH */
600 }
601 else
602 {
603 /*
604 * Report the buffer it was in at that time. The caller may be able
605 * to avoid a buffer table lookup, but it's not pinned and it must be
606 * rechecked!
607 */
608 result.recent_buffer = buf_id + 1;
609 }
610
611 /*
612 * If the block *is* in buffers, we do nothing. This is not really ideal:
613 * the block might be just about to be evicted, which would be stupid
614 * since we know we are going to need it soon. But the only easy answer
615 * is to bump the usage_count, which does not seem like a great solution:
616 * when the caller does ultimately touch the block, usage_count would get
617 * bumped again, resulting in too much favoritism for blocks that are
618 * involved in a prefetch sequence. A real fix would involve some
619 * additional per-buffer state, and it's not clear that there's enough of
620 * a problem to justify that.
621 */
622
623 return result;
624}
625
626/*
627 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
628 *
629 * This is named by analogy to ReadBuffer but doesn't actually allocate a
630 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
631 * block will not be delayed by the I/O. Prefetching is optional.
632 *
633 * There are three possible outcomes:
634 *
635 * 1. If the block is already cached, the result includes a valid buffer that
636 * could be used by the caller to avoid the need for a later buffer lookup, but
637 * it's not pinned, so the caller must recheck it.
638 *
639 * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
640 * true. Currently there is no way to know if the data was already cached by
641 * the kernel and therefore didn't really initiate I/O, and no way to know when
642 * the I/O completes other than using synchronous ReadBuffer().
643 *
644 * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
645 * USE_PREFETCH is not defined (this build doesn't support prefetching due to
646 * lack of a kernel facility), direct I/O is enabled, or the underlying
647 * relation file wasn't found and we are in recovery. (If the relation file
648 * wasn't found and we are not in recovery, an error is raised).
649 */
650PrefetchBufferResult
651PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
652{
653 Assert(RelationIsValid(reln));
654 Assert(BlockNumberIsValid(blockNum));
655
656 if (RelationUsesLocalBuffers(reln))
657 {
658 /* see comments in ReadBufferExtended */
659 if (RELATION_IS_OTHER_TEMP(reln))
 660 ereport(ERROR,
 661 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
662 errmsg("cannot access temporary tables of other sessions")));
663
664 /* pass it off to localbuf.c */
665 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
666 }
667 else
668 {
669 /* pass it to the shared buffer version */
670 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
671 }
672}
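
/*
 * Illustrative sketch, not part of bufmgr.c itself: a scan that stays
 * "prefetch_distance" blocks ahead of its ReadBufferExtended() calls.
 * "rel", "nblocks" and "prefetch_distance" are assumed inputs.
 */
static void
example_prefetching_scan(Relation rel, BlockNumber nblocks, int prefetch_distance)
{
	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;

		/* hint the block we will need soon; may or may not start real I/O */
		if (blkno + prefetch_distance < nblocks)
			(void) PrefetchBuffer(rel, MAIN_FORKNUM, blkno + prefetch_distance);

		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
		/* ... examine the page under a content lock ... */
		ReleaseBuffer(buf);
	}
}
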
673
674/*
675 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
676 *
677 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
678 * successful. Return true if the buffer is valid and still has the expected
679 * tag. In that case, the buffer is pinned and the usage count is bumped.
680 */
681bool
682ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
 683 Buffer recent_buffer)
684{
685 BufferDesc *bufHdr;
686 BufferTag tag;
687 uint32 buf_state;
688 bool have_private_ref;
689
690 Assert(BufferIsValid(recent_buffer));
691
 692 ResourceOwnerEnlarge(CurrentResourceOwner);
 693 ReservePrivateRefCountEntry();
 694 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
695
696 if (BufferIsLocal(recent_buffer))
697 {
698 int b = -recent_buffer - 1;
699
700 bufHdr = GetLocalBufferDescriptor(b);
701 buf_state = pg_atomic_read_u32(&bufHdr->state);
702
703 /* Is it still valid and holding the right tag? */
704 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
705 {
706 PinLocalBuffer(bufHdr, true);
707
 708 pgBufferUsage.local_blks_hit++;
 709
710 return true;
711 }
712 }
713 else
714 {
715 bufHdr = GetBufferDescriptor(recent_buffer - 1);
716 have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
717
718 /*
719 * Do we already have this buffer pinned with a private reference? If
720 * so, it must be valid and it is safe to check the tag without
721 * locking. If not, we have to lock the header first and then check.
722 */
723 if (have_private_ref)
724 buf_state = pg_atomic_read_u32(&bufHdr->state);
725 else
726 buf_state = LockBufHdr(bufHdr);
727
728 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
729 {
730 /*
731 * It's now safe to pin the buffer. We can't pin first and ask
732 * questions later, because it might confuse code paths like
733 * InvalidateBuffer() if we pinned a random non-matching buffer.
734 */
735 if (have_private_ref)
736 PinBuffer(bufHdr, NULL); /* bump pin count */
737 else
738 PinBuffer_Locked(bufHdr); /* pin for first time */
739
 740 pgBufferUsage.shared_blks_hit++;
 741
742 return true;
743 }
744
745 /* If we locked the header above, now unlock. */
746 if (!have_private_ref)
747 UnlockBufHdr(bufHdr, buf_state);
748 }
749
750 return false;
751}
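
/*
 * Illustrative sketch, not part of bufmgr.c itself: combining the unpinned
 * recent_buffer hint returned by PrefetchBuffer() with ReadRecentBuffer() to
 * skip the mapping-table lookup when the hint is still valid.
 */
static Buffer
example_read_with_recent_hint(Relation rel, BlockNumber blkno)
{
	PrefetchBufferResult hint = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);

	/* ... some time later, when the block is actually needed ... */
	if (BufferIsValid(hint.recent_buffer) &&
		ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno,
						 hint.recent_buffer))
		return hint.recent_buffer;	/* pinned, tag re-verified */

	/* hint was missing or stale; fall back to a normal lookup */
	return ReadBuffer(rel, blkno);
}
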
752
753/*
754 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
755 * fork with RBM_NORMAL mode and default strategy.
756 */
757Buffer
758ReadBuffer(Relation reln, BlockNumber blockNum)
759{
760 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
761}
762
763/*
764 * ReadBufferExtended -- returns a buffer containing the requested
765 * block of the requested relation. If the blknum
766 * requested is P_NEW, extend the relation file and
767 * allocate a new block. (Caller is responsible for
768 * ensuring that only one backend tries to extend a
769 * relation at the same time!)
770 *
771 * Returns: the buffer number for the buffer containing
772 * the block read. The returned buffer has been pinned.
773 * Does not return on error --- elog's instead.
774 *
775 * Assume when this function is called, that reln has been opened already.
776 *
777 * In RBM_NORMAL mode, the page is read from disk, and the page header is
778 * validated. An error is thrown if the page header is not valid. (But
779 * note that an all-zero page is considered "valid"; see
780 * PageIsVerified().)
781 *
782 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
783 * valid, the page is zeroed instead of throwing an error. This is intended
784 * for non-critical data, where the caller is prepared to repair errors.
785 *
786 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
787 * filled with zeros instead of reading it from disk. Useful when the caller
788 * is going to fill the page from scratch, since this saves I/O and avoids
789 * unnecessary failure if the page-on-disk has corrupt page headers.
790 * The page is returned locked to ensure that the caller has a chance to
791 * initialize the page before it's made visible to others.
792 * Caution: do not use this mode to read a page that is beyond the relation's
793 * current physical EOF; that is likely to cause problems in md.c when
794 * the page is modified and written out. P_NEW is OK, though.
795 *
796 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
797 * a cleanup-strength lock on the page.
798 *
799 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
800 *
801 * If strategy is not NULL, a nondefault buffer access strategy is used.
802 * See buffer/README for details.
803 */
804inline Buffer
805ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
 806 ReadBufferMode mode, BufferAccessStrategy strategy)
807{
808 Buffer buf;
809
810 /*
811 * Reject attempts to read non-local temporary relations; we would be
812 * likely to get wrong data since we have no visibility into the owning
813 * session's local buffers.
814 */
815 if (RELATION_IS_OTHER_TEMP(reln))
 816 ereport(ERROR,
 817 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
818 errmsg("cannot access temporary tables of other sessions")));
819
820 /*
821 * Read the buffer, and update pgstat counters to reflect a cache hit or
822 * miss.
823 */
824 buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
825 forkNum, blockNum, mode, strategy);
826
827 return buf;
828}
829
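
/*
 * Illustrative sketch, not part of bufmgr.c itself: reading a page of
 * possibly-damaged, non-critical data with RBM_ZERO_ON_ERROR and a bulk-read
 * strategy, as described in the comment above.  The helper name is assumed.
 */
static void
example_read_tolerating_corruption(Relation rel, BlockNumber blkno)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	Buffer		buf;

	/* an invalid page header is zeroed instead of raising an error */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_ON_ERROR,
							 strategy);
	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* ... inspect the page; an all-zero page must be tolerated ... */
	UnlockReleaseBuffer(buf);

	FreeAccessStrategy(strategy);
}
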
830
831/*
832 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
833 * a relcache entry for the relation.
834 *
835 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
836 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
837 * cannot be used for temporary relations (and making that work might be
838 * difficult, unless we only want to read temporary relations for our own
839 * ProcNumber).
840 */
841Buffer
842ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
 843 BlockNumber blockNum, ReadBufferMode mode,
 844 BufferAccessStrategy strategy, bool permanent)
845{
846 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
847
848 return ReadBuffer_common(NULL, smgr,
849 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
850 forkNum, blockNum,
851 mode, strategy);
852}
853
854/*
855 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
856 */
857Buffer
858ExtendBufferedRel(BufferManagerRelation bmr,
 859 ForkNumber forkNum,
860 BufferAccessStrategy strategy,
861 uint32 flags)
862{
863 Buffer buf;
864 uint32 extend_by = 1;
865
866 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
867 &buf, &extend_by);
868
869 return buf;
870}
871
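
/*
 * Illustrative sketch, not part of bufmgr.c itself: appending a new, empty
 * page to a relation.  With EB_LOCK_FIRST the returned buffer is already
 * exclusively locked, so it can be initialized before anyone else sees it.
 */
static Buffer
example_append_new_page(Relation rel)
{
	Buffer		buf;

	buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL,
							EB_LOCK_FIRST);
	PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
	/* ... WAL-log the initialization, then MarkBufferDirty(buf) ... */

	return buf;		/* still pinned and locked; caller unlocks/releases */
}
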
872/*
873 * Extend relation by multiple blocks.
874 *
875 * Tries to extend the relation by extend_by blocks. Depending on the
876 * availability of resources the relation may end up being extended by a
877 * smaller number of pages (unless an error is thrown, always by at least one
878 * page). *extended_by is updated to the number of pages the relation has been
879 * extended to.
880 *
881 * buffers needs to be an array that is at least extend_by long. Upon
882 * completion, the first extend_by array elements will point to a pinned
883 * buffer.
884 *
885 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
886 * locked. This is useful for callers that want a buffer that is guaranteed to
887 * be empty.
888 */
889BlockNumber
890ExtendBufferedRelBy(BufferManagerRelation bmr,
 891 ForkNumber fork,
892 BufferAccessStrategy strategy,
893 uint32 flags,
894 uint32 extend_by,
895 Buffer *buffers,
896 uint32 *extended_by)
897{
898 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
899 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
900 Assert(extend_by > 0);
901
902 if (bmr.smgr == NULL)
903 {
904 bmr.smgr = RelationGetSmgr(bmr.rel);
905 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
906 }
907
908 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
909 extend_by, InvalidBlockNumber,
910 buffers, extended_by);
911}
912
913/*
914 * Extend the relation so it is at least extend_to blocks large, return buffer
915 * (extend_to - 1).
916 *
917 * This is useful for callers that want to write a specific page, regardless
918 * of the current size of the relation (e.g. useful for visibilitymap and for
919 * crash recovery).
920 */
921Buffer
922ExtendBufferedRelTo(BufferManagerRelation bmr,
 923 ForkNumber fork,
924 BufferAccessStrategy strategy,
925 uint32 flags,
926 BlockNumber extend_to,
 927 ReadBufferMode mode)
928{
 929 BlockNumber current_size;
 930 uint32 extended_by = 0;
 931 Buffer buffer = InvalidBuffer;
 932 Buffer buffers[64];
933
934 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
935 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
936 Assert(extend_to != InvalidBlockNumber && extend_to > 0);
937
938 if (bmr.smgr == NULL)
939 {
940 bmr.smgr = RelationGetSmgr(bmr.rel);
941 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
942 }
943
944 /*
945 * If desired, create the file if it doesn't exist. If
946 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
947 * an smgrexists call.
948 */
949 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
950 (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
 951 bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
 952 !smgrexists(bmr.smgr, fork))
953 {
 954 LockRelationForExtension(bmr.rel, ExclusiveLock);
 955
956 /* recheck, fork might have been created concurrently */
957 if (!smgrexists(bmr.smgr, fork))
958 smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
959
 960 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
 961 }
962
963 /*
964 * If requested, invalidate size cache, so that smgrnblocks asks the
965 * kernel.
966 */
967 if (flags & EB_CLEAR_SIZE_CACHE)
 968 bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
 969
970 /*
971 * Estimate how many pages we'll need to extend by. This avoids acquiring
972 * unnecessarily many victim buffers.
973 */
974 current_size = smgrnblocks(bmr.smgr, fork);
975
976 /*
977 * Since no-one else can be looking at the page contents yet, there is no
978 * difference between an exclusive lock and a cleanup-strength lock. Note
979 * that we pass the original mode to ReadBuffer_common() below, when
 980 * falling back to reading the buffer due to a concurrent relation extension.
981 */
 982 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
 983 flags |= EB_LOCK_TARGET;
984
985 while (current_size < extend_to)
986 {
987 uint32 num_pages = lengthof(buffers);
988 BlockNumber first_block;
989
990 if ((uint64) current_size + num_pages > extend_to)
991 num_pages = extend_to - current_size;
992
993 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
994 num_pages, extend_to,
995 buffers, &extended_by);
996
997 current_size = first_block + extended_by;
998 Assert(num_pages != 0 || current_size >= extend_to);
999
1000 for (uint32 i = 0; i < extended_by; i++)
1001 {
1002 if (first_block + i != extend_to - 1)
1003 ReleaseBuffer(buffers[i]);
1004 else
1005 buffer = buffers[i];
1006 }
1007 }
1008
1009 /*
1010 * It's possible that another backend concurrently extended the relation.
1011 * In that case read the buffer.
1012 *
1013 * XXX: Should we control this via a flag?
1014 */
1015 if (buffer == InvalidBuffer)
1016 {
1017 Assert(extended_by == 0);
 1018 buffer = ReadBuffer_common(bmr.rel, bmr.smgr, bmr.relpersistence,
 1019 fork, extend_to - 1, mode, strategy);
1020 }
1021
1022 return buffer;
1023}
1024
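
/*
 * Illustrative sketch, not part of bufmgr.c itself: ensuring that block
 * "target_block" of a fork exists (as e.g. the visibility map or crash
 * recovery might need) and returning it locked.  The particular flag
 * combination is an assumption made for the sake of the example.
 */
static Buffer
example_ensure_block_exists(Relation rel, ForkNumber fork,
							BlockNumber target_block)
{
	return ExtendBufferedRelTo(BMR_REL(rel), fork, NULL,
							   EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
							   target_block + 1,
							   RBM_ZERO_AND_LOCK);
}
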
1025/*
1026 * Lock and optionally zero a buffer, as part of the implementation of
1027 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1028 * pinned. If the buffer is not already valid, it is zeroed and made valid.
1029 */
1030static void
1031ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
1032{
1033 BufferDesc *bufHdr;
1034 bool need_to_zero;
1035 bool isLocalBuf = BufferIsLocal(buffer);
1036
 1037 Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
 1038
1039 if (already_valid)
1040 {
1041 /*
1042 * If the caller already knew the buffer was valid, we can skip some
1043 * header interaction. The caller just wants to lock the buffer.
1044 */
1045 need_to_zero = false;
1046 }
1047 else if (isLocalBuf)
1048 {
1049 /* Simple case for non-shared buffers. */
1050 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1051 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1052 }
1053 else
1054 {
1055 /*
1056 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1057 * concurrently. Even though we aren't doing I/O, that ensures that
1058 * we don't zero a page that someone else has pinned. An exclusive
1059 * content lock wouldn't be enough, because readers are allowed to
1060 * drop the content lock after determining that a tuple is visible
1061 * (see buffer access rules in README).
1062 */
1063 bufHdr = GetBufferDescriptor(buffer - 1);
1064 need_to_zero = StartBufferIO(bufHdr, true, false);
1065 }
1066
1067 if (need_to_zero)
1068 {
1069 memset(BufferGetPage(buffer), 0, BLCKSZ);
1070
1071 /*
1072 * Grab the buffer content lock before marking the page as valid, to
1073 * make sure that no other backend sees the zeroed page before the
1074 * caller has had a chance to initialize it.
1075 *
1076 * Since no-one else can be looking at the page contents yet, there is
1077 * no difference between an exclusive lock and a cleanup-strength
1078 * lock. (Note that we cannot use LockBuffer() or
1079 * LockBufferForCleanup() here, because they assert that the buffer is
1080 * already valid.)
1081 */
1082 if (!isLocalBuf)
 1083 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
 1084
1085 /* Set BM_VALID, terminate IO, and wake up any waiters */
1086 if (isLocalBuf)
1087 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1088 else
1089 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1090 }
1091 else if (!isLocalBuf)
1092 {
1093 /*
1094 * The buffer is valid, so we can't zero it. The caller still expects
1095 * the page to be locked on return.
1096 */
1097 if (mode == RBM_ZERO_AND_LOCK)
 1098 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 1099 else
 1100 LockBufferForCleanup(buffer);
 1101 }
1102}
1103
1104/*
1105 * Pin a buffer for a given block. *foundPtr is set to true if the block was
1106 * already present, or false if more work is required to either read it in or
1107 * zero it.
1108 */
1109static pg_attribute_always_inline Buffer
1110PinBufferForBlock(Relation rel,
 1111 SMgrRelation smgr,
1112 char persistence,
1113 ForkNumber forkNum,
1114 BlockNumber blockNum,
1115 BufferAccessStrategy strategy,
1116 bool *foundPtr)
1117{
1118 BufferDesc *bufHdr;
1119 IOContext io_context;
1120 IOObject io_object;
1121
1122 Assert(blockNum != P_NEW);
1123
1124 /* Persistence should be set before */
1125 Assert((persistence == RELPERSISTENCE_TEMP ||
1126 persistence == RELPERSISTENCE_PERMANENT ||
1127 persistence == RELPERSISTENCE_UNLOGGED));
1128
1129 if (persistence == RELPERSISTENCE_TEMP)
1130 {
1131 io_context = IOCONTEXT_NORMAL;
1132 io_object = IOOBJECT_TEMP_RELATION;
1133 }
1134 else
1135 {
1136 io_context = IOContextForStrategy(strategy);
1137 io_object = IOOBJECT_RELATION;
1138 }
1139
1140 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
 1141 smgr->smgr_rlocator.locator.spcOid,
 1142 smgr->smgr_rlocator.locator.dbOid,
 1143 smgr->smgr_rlocator.locator.relNumber,
 1144 smgr->smgr_rlocator.backend);
1145
1146 if (persistence == RELPERSISTENCE_TEMP)
1147 {
1148 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1149 if (*foundPtr)
 1150 pgBufferUsage.local_blks_hit++;
 1151 }
1152 else
1153 {
1154 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1155 strategy, foundPtr, io_context);
1156 if (*foundPtr)
 1157 pgBufferUsage.shared_blks_hit++;
 1158 }
1159 if (rel)
1160 {
1161 /*
1162 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1163 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1164 * zeroed instead), the per-relation stats always count them.
1165 */
 1166 pgstat_count_buffer_read(rel);
 1167 if (*foundPtr)
 1168 pgstat_count_buffer_hit(rel);
 1169 }
1170 if (*foundPtr)
1171 {
1172 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1173 if (VacuumCostActive)
 1174 VacuumCostBalance += VacuumCostPageHit;
 1175
1176 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
 1177 smgr->smgr_rlocator.locator.spcOid,
 1178 smgr->smgr_rlocator.locator.dbOid,
 1179 smgr->smgr_rlocator.locator.relNumber,
 1180 smgr->smgr_rlocator.backend,
1181 true);
1182 }
1183
1184 return BufferDescriptorGetBuffer(bufHdr);
1185}
1186
1187/*
1188 * ReadBuffer_common -- common logic for all ReadBuffer variants
1189 *
1190 * smgr is required, rel is optional unless using P_NEW.
1191 */
1192static pg_attribute_always_inline Buffer
1193ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1194 ForkNumber forkNum,
 1195 BlockNumber blockNum, ReadBufferMode mode,
 1196 BufferAccessStrategy strategy)
1197{
1198 ReadBuffersOperation operation;
1199 Buffer buffer;
1200 int flags;
1201 char persistence;
1202
1203 /*
1204 * Backward compatibility path, most code should use ExtendBufferedRel()
1205 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1206 * scales a lot better.
1207 */
1208 if (unlikely(blockNum == P_NEW))
1209 {
 1210 uint32 flags = EB_SKIP_EXTENSION_LOCK;
 1211
1212 /*
1213 * Since no-one else can be looking at the page contents yet, there is
1214 * no difference between an exclusive lock and a cleanup-strength
1215 * lock.
1216 */
 1217 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
 1218 flags |= EB_LOCK_FIRST;
1219
1220 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1221 }
1222
1223 if (rel)
1224 persistence = rel->rd_rel->relpersistence;
1225 else
1226 persistence = smgr_persistence;
1227
 1228 if (mode == RBM_ZERO_AND_LOCK ||
 1229 mode == RBM_ZERO_AND_CLEANUP_LOCK)
 1230 {
1231 bool found;
1232
1233 buffer = PinBufferForBlock(rel, smgr, persistence,
1234 forkNum, blockNum, strategy, &found);
1235 ZeroAndLockBuffer(buffer, mode, found);
1236 return buffer;
1237 }
1238
1239 /*
1240 * Signal that we are going to immediately wait. If we're immediately
1241 * waiting, there is no benefit in actually executing the IO
 1242 * asynchronously; it would just add dispatch overhead.
1243 */
 1244 flags = READ_BUFFERS_SYNCHRONOUSLY;
 1245 if (mode == RBM_ZERO_ON_ERROR)
 1246 flags |= READ_BUFFERS_ZERO_ON_ERROR;
 1247 operation.smgr = smgr;
1248 operation.rel = rel;
1249 operation.persistence = persistence;
1250 operation.forknum = forkNum;
1251 operation.strategy = strategy;
1252 if (StartReadBuffer(&operation,
1253 &buffer,
1254 blockNum,
1255 flags))
1256 WaitReadBuffers(&operation);
1257
1258 return buffer;
1259}
1260
1261static pg_attribute_always_inline bool
1262StartReadBuffersImpl(ReadBuffersOperation *operation,
 1263 Buffer *buffers,
1264 BlockNumber blockNum,
1265 int *nblocks,
1266 int flags,
1267 bool allow_forwarding)
1268{
1269 int actual_nblocks = *nblocks;
1270 int maxcombine = 0;
1271 bool did_start_io;
1272
1273 Assert(*nblocks == 1 || allow_forwarding);
1274 Assert(*nblocks > 0);
1275 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1276
1277 for (int i = 0; i < actual_nblocks; ++i)
1278 {
1279 bool found;
1280
1281 if (allow_forwarding && buffers[i] != InvalidBuffer)
1282 {
1283 BufferDesc *bufHdr;
1284
1285 /*
1286 * This is a buffer that was pinned by an earlier call to
1287 * StartReadBuffers(), but couldn't be handled in one operation at
1288 * that time. The operation was split, and the caller has passed
1289 * an already pinned buffer back to us to handle the rest of the
1290 * operation. It must continue at the expected block number.
1291 */
1292 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1293
1294 /*
1295 * It might be an already valid buffer (a hit) that followed the
1296 * final contiguous block of an earlier I/O (a miss) marking the
1297 * end of it, or a buffer that some other backend has since made
1298 * valid by performing the I/O for us, in which case we can handle
1299 * it as a hit now. It is safe to check for a BM_VALID flag with
1300 * a relaxed load, because we got a fresh view of it while pinning
1301 * it in the previous call.
1302 *
1303 * On the other hand if we don't see BM_VALID yet, it must be an
1304 * I/O that was split by the previous call and we need to try to
1305 * start a new I/O from this block. We're also racing against any
1306 * other backend that might start the I/O or even manage to mark
1307 * it BM_VALID after this check, but StartBufferIO() will handle
1308 * those cases.
1309 */
1310 if (BufferIsLocal(buffers[i]))
1311 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1312 else
1313 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1315 found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID;
1316 }
1317 else
1318 {
1319 buffers[i] = PinBufferForBlock(operation->rel,
1320 operation->smgr,
1321 operation->persistence,
1322 operation->forknum,
1323 blockNum + i,
1324 operation->strategy,
1325 &found);
1326 }
1327
1328 if (found)
1329 {
1330 /*
1331 * We have a hit. If it's the first block in the requested range,
1332 * we can return it immediately and report that WaitReadBuffers()
1333 * does not need to be called. If the initial value of *nblocks
1334 * was larger, the caller will have to call again for the rest.
1335 */
1336 if (i == 0)
1337 {
1338 *nblocks = 1;
1339
1340#ifdef USE_ASSERT_CHECKING
1341
1342 /*
1343 * Initialize enough of ReadBuffersOperation to make
1344 * CheckReadBuffersOperation() work. Outside of assertions
1345 * that's not necessary when no IO is issued.
1346 */
1347 operation->buffers = buffers;
1348 operation->blocknum = blockNum;
1349 operation->nblocks = 1;
1350 operation->nblocks_done = 1;
1351 CheckReadBuffersOperation(operation, true);
1352#endif
1353 return false;
1354 }
1355
1356 /*
1357 * Otherwise we already have an I/O to perform, but this block
1358 * can't be included as it is already valid. Split the I/O here.
1359 * There may or may not be more blocks requiring I/O after this
1360 * one, we haven't checked, but they can't be contiguous with this
1361 * one in the way. We'll leave this buffer pinned, forwarding it
1362 * to the next call, avoiding the need to unpin it here and re-pin
1363 * it in the next call.
1364 */
1365 actual_nblocks = i;
1366 break;
1367 }
1368 else
1369 {
1370 /*
1371 * Check how many blocks we can cover with the same IO. The smgr
1372 * implementation might e.g. be limited due to a segment boundary.
1373 */
1374 if (i == 0 && actual_nblocks > 1)
1375 {
1376 maxcombine = smgrmaxcombine(operation->smgr,
1377 operation->forknum,
1378 blockNum);
1379 if (unlikely(maxcombine < actual_nblocks))
1380 {
1381 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1382 blockNum, actual_nblocks, maxcombine);
1383 actual_nblocks = maxcombine;
1384 }
1385 }
1386 }
1387 }
1388 *nblocks = actual_nblocks;
1389
1390 /* Populate information needed for I/O. */
1391 operation->buffers = buffers;
1392 operation->blocknum = blockNum;
1393 operation->flags = flags;
1394 operation->nblocks = actual_nblocks;
1395 operation->nblocks_done = 0;
1396 pgaio_wref_clear(&operation->io_wref);
1397
1398 /*
1399 * When using AIO, start the IO in the background. If not, issue prefetch
1400 * requests if desired by the caller.
1401 *
1402 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1403 * de-risk the introduction of AIO somewhat. It's a large architectural
1404 * change, with lots of chances for unanticipated performance effects.
1405 *
1406 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1407 * asynchronously, but without the check here we'd execute IO earlier than
1408 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1409 */
1410 if (io_method != IOMETHOD_SYNC)
1411 {
1412 /*
1413 * Try to start IO asynchronously. It's possible that no IO needs to
1414 * be started, if another backend already performed the IO.
1415 *
1416 * Note that if an IO is started, it might not cover the entire
1417 * requested range, e.g. because an intermediary block has been read
1418 * in by another backend. In that case any "trailing" buffers we
1419 * already pinned above will be "forwarded" by read_stream.c to the
1420 * next call to StartReadBuffers().
1421 *
1422 * This is signalled to the caller by decrementing *nblocks *and*
1423 * reducing operation->nblocks. The latter is done here, but not below
1424 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1425 * overall read size anymore, we need to retry until done in its
1426 * entirety or until failed.
1427 */
1428 did_start_io = AsyncReadBuffers(operation, nblocks);
1429
1430 operation->nblocks = *nblocks;
1431 }
1432 else
1433 {
1434 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1435
1436 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1437 {
1438 /*
1439 * In theory we should only do this if PinBufferForBlock() had to
1440 * allocate new buffers above. That way, if two calls to
1441 * StartReadBuffers() were made for the same blocks before
1442 * WaitReadBuffers(), only the first would issue the advice.
1443 * That'd be a better simulation of true asynchronous I/O, which
1444 * would only start the I/O once, but isn't done here for
1445 * simplicity.
1446 */
1447 smgrprefetch(operation->smgr,
1448 operation->forknum,
1449 blockNum,
1450 actual_nblocks);
1451 }
1452
1453 /*
1454 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1455 * will initiate the necessary IO.
1456 */
1457 did_start_io = true;
1458 }
1459
1460 CheckReadBuffersOperation(operation, !did_start_io);
1461
1462 return did_start_io;
1463}
1464
1465/*
1466 * Begin reading a range of blocks beginning at blockNum and extending for
1467 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1468 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1469 * buffers forwarded by an earlier call to StartReadBuffers() that was split
1470 * and is now being continued. On return, *nblocks holds the number of blocks
1471 * accepted by this operation. If it is less than the original number then
1472 * this operation has been split, but buffer elements up to the original
1473 * requested size may hold forwarded buffers to be used for a continuing
1474 * operation. The caller must either start a new I/O beginning at the block
1475 * immediately following the blocks accepted by this call and pass those
1476 * buffers back in, or release them if it chooses not to. It shouldn't make
1477 * any other use of or assumptions about forwarded buffers.
1478 *
1479 * If false is returned, no I/O is necessary and the buffers covered by
1480 * *nblocks on exit are valid and ready to be accessed. If true is returned,
1481 * an I/O has been started, and WaitReadBuffers() must be called with the same
1482 * operation object before the buffers covered by *nblocks on exit can be
1483 * accessed. Along with the operation object, the caller-supplied array of
1484 * buffers must remain valid until WaitReadBuffers() is called, and any
1485 * forwarded buffers must also be preserved for a continuing call unless
1486 * they are explicitly released.
1487 *
1488 * Currently the I/O is only started with optional operating system advice if
1489 * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
1490 * happens synchronously in WaitReadBuffers(). In future work, true I/O could
1491 * be initiated here.
1492 */
1493bool
1494StartReadBuffers(ReadBuffersOperation *operation,
 1495 Buffer *buffers,
1496 BlockNumber blockNum,
1497 int *nblocks,
1498 int flags)
1499{
1500 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1501 true /* expect forwarded buffers */ );
1502}
1503
1504/*
1505 * Single block version of the StartReadBuffers(). This might save a few
1506 * instructions when called from another translation unit, because it is
1507 * specialized for nblocks == 1.
1508 *
1509 * This version does not support "forwarded" buffers: they cannot be created
1510 * by reading only one block and *buffer is ignored on entry.
1511 */
1512bool
1513StartReadBuffer(ReadBuffersOperation *operation,
 1514 Buffer *buffer,
1515 BlockNumber blocknum,
1516 int flags)
1517{
1518 int nblocks = 1;
1519 bool result;
1520
1521 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1522 false /* single block, no forwarding */ );
1523 Assert(nblocks == 1); /* single block can't be short */
1524
1525 return result;
1526}
1527
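
/*
 * Illustrative sketch, not part of bufmgr.c itself: the two-step single-block
 * read used by ReadBuffer_common() above, spelled out for an external caller.
 * Unrelated work can be done between the start and wait steps so that it
 * overlaps with the I/O.
 */
static Buffer
example_two_step_read(Relation rel, BlockNumber blkno)
{
	ReadBuffersOperation operation;
	Buffer		buf;

	operation.smgr = RelationGetSmgr(rel);
	operation.rel = rel;
	operation.persistence = rel->rd_rel->relpersistence;
	operation.forknum = MAIN_FORKNUM;
	operation.strategy = NULL;

	if (StartReadBuffer(&operation, &buf, blkno, 0))
	{
		/* ... other work can overlap with the read here ... */
		WaitReadBuffers(&operation);
	}

	return buf;			/* pinned and valid */
}
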
1528/*
1529 * Perform sanity checks on the ReadBuffersOperation.
1530 */
1531static void
1532CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
1533{
1534#ifdef USE_ASSERT_CHECKING
1535 Assert(operation->nblocks_done <= operation->nblocks);
1536 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1537
1538 for (int i = 0; i < operation->nblocks; i++)
1539 {
1540 Buffer buffer = operation->buffers[i];
1541 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
 1542 GetLocalBufferDescriptor(-buffer - 1) :
 1543 GetBufferDescriptor(buffer - 1);
 1544
1545 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1547
1548 if (i < operation->nblocks_done)
 1549 Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_VALID);
 1550 }
1551#endif
1552}
1553
1554/* helper for ReadBuffersCanStartIO(), to avoid repetition */
1555static inline bool
1556ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
1557{
1558 if (BufferIsLocal(buffer))
 1559 return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
 1560 true, nowait);
1561 else
1562 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1563}
1564
1565/*
1566 * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1567 */
1568static inline bool
1569ReadBuffersCanStartIO(Buffer buffer, bool nowait)
1570{
1571 /*
1572 * If this backend currently has staged IO, we need to submit the pending
1573 * IO before waiting for the right to issue IO, to avoid the potential for
1574 * deadlocks (and, more commonly, unnecessary delays for other backends).
1575 */
1576 if (!nowait && pgaio_have_staged())
1577 {
 1578 if (ReadBuffersCanStartIOOnce(buffer, true))
 1579 return true;
1580
1581 /*
1582 * Unfortunately StartBufferIO() returning false doesn't allow to
1583 * distinguish between the buffer already being valid and IO already
1584 * being in progress. Since IO already being in progress is quite
1585 * rare, this approach seems fine.
1586 */
1588 }
1589
1590 return ReadBuffersCanStartIOOnce(buffer, nowait);
1591}
1592
1593/*
1594 * Helper for WaitReadBuffers() that processes the results of a readv
1595 * operation, raising an error if necessary.
1596 */
1597static void
1598ProcessReadBuffersResult(ReadBuffersOperation *operation)
1599{
1600 PgAioReturn *aio_ret = &operation->io_return;
1601 PgAioResultStatus rs = aio_ret->result.status;
1602 int newly_read_blocks = 0;
1603
1604 Assert(pgaio_wref_valid(&operation->io_wref));
1605 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1606
1607 /*
1608 * SMGR reports the number of blocks successfully read as the result of
1609 * the IO operation. Thus we can simply add that to ->nblocks_done.
1610 */
1611
1612 if (likely(rs != PGAIO_RS_ERROR))
1613 newly_read_blocks = aio_ret->result.result;
1614
1615 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1616 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1617 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1618 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1619 {
1620 /*
1621 * We'll retry, so we just emit a debug message to the server log (or
1622 * not even that in prod scenarios).
1623 */
1624 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1625 elog(DEBUG3, "partial read, will retry");
1626 }
1627
1628 Assert(newly_read_blocks > 0);
1629 Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1630
1631 operation->nblocks_done += newly_read_blocks;
1632
1633 Assert(operation->nblocks_done <= operation->nblocks);
1634}
1635
1636void
1637WaitReadBuffers(ReadBuffersOperation *operation)
1638{
1639 PgAioReturn *aio_ret = &operation->io_return;
1640 IOContext io_context;
1641 IOObject io_object;
1642
1643 if (operation->persistence == RELPERSISTENCE_TEMP)
1644 {
1645 io_context = IOCONTEXT_NORMAL;
1646 io_object = IOOBJECT_TEMP_RELATION;
1647 }
1648 else
1649 {
1650 io_context = IOContextForStrategy(operation->strategy);
1651 io_object = IOOBJECT_RELATION;
1652 }
1653
1654 /*
1655 * If we get here without an IO operation having been issued, the
1656 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1657 * caller should not have called WaitReadBuffers().
1658 *
1659 * In the case of IOMETHOD_SYNC, we start - as we used to before the
 1660 * introduction of AIO - the IO in WaitReadBuffers(). This is done as part
1661 * of the retry logic below, no extra code is required.
1662 *
1663 * This path is expected to eventually go away.
1664 */
1665 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1666 elog(ERROR, "waiting for read operation that didn't read");
1667
1668 /*
1669 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1670 * done. We may need multiple retries, not just because we could get
1671 * multiple partial reads, but also because some of the remaining
1672 * to-be-read buffers may have been read in by other backends, limiting
1673 * the IO size.
1674 */
1675 while (true)
1676 {
1677 int ignored_nblocks_progress;
1678
1679 CheckReadBuffersOperation(operation, false);
1680
1681 /*
1682 * If there is an IO associated with the operation, we may need to
1683 * wait for it.
1684 */
1685 if (pgaio_wref_valid(&operation->io_wref))
1686 {
1687 /*
1688 * Track the time spent waiting for the IO to complete. As
1689 * tracking a wait even if we don't actually need to wait
1690 *
1691 * a) is not cheap, due to the timestamping overhead
1692 *
1693 * b) reports some time as waiting, even if we never waited
1694 *
1695 * we first check if we already know the IO is complete.
1696 */
1697 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1698 !pgaio_wref_check_done(&operation->io_wref))
1699 {
 1700 instr_time io_start = pgstat_prepare_io_time(track_io_timing);
 1701
1702 pgaio_wref_wait(&operation->io_wref);
1703
1704 /*
1705 * The IO operation itself was already counted earlier, in
1706 * AsyncReadBuffers(), this just accounts for the wait time.
1707 */
1708 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1709 io_start, 0, 0);
1710 }
1711 else
1712 {
1713 Assert(pgaio_wref_check_done(&operation->io_wref));
1714 }
1715
1716 /*
1717 * We now are sure the IO completed. Check the results. This
1718 * includes reporting on errors if there were any.
1719 */
1720 ProcessReadBuffersResult(operation);
1721 }
1722
1723 /*
1724 * Most of the time, the one IO we already started, will read in
1725 * everything. But we need to deal with partial reads and buffers not
1726 * needing IO anymore.
1727 */
1728 if (operation->nblocks_done == operation->nblocks)
1729 break;
1730
1732
1733 /*
1734 * This may only complete the IO partially, either because some
1735 * buffers were already valid, or because of a partial read.
1736 *
1737 * NB: In contrast to after the AsyncReadBuffers() call in
1738 * StartReadBuffers(), we do *not* reduce
1739 * ReadBuffersOperation->nblocks here, callers expect the full
1740 * operation to be completed at this point (as more operations may
1741 * have been queued).
1742 */
1743 AsyncReadBuffers(operation, &ignored_nblocks_progress);
1744 }
1745
1746 CheckReadBuffersOperation(operation, true);
1747
1748 /* NB: READ_DONE tracepoint was already executed in completion callback */
1749}
1750
1751/*
1752 * Initiate IO for the ReadBuffersOperation
1753 *
1754 * This function only starts a single IO at a time. The size of the IO may be
1755 * limited to below the to-be-read blocks, if one of the buffers has
1756 * concurrently been read in. If the first to-be-read buffer is already valid,
1757 * no IO will be issued.
1758 *
1759 * To support retries after partial reads, the first operation->nblocks_done
1760 * buffers are skipped.
1761 *
1762 * On return *nblocks_progress is updated to reflect the number of buffers
1763 * affected by the call. If the first buffer is valid, *nblocks_progress is
1764 * set to 1 and operation->nblocks_done is incremented.
1765 *
1766 * Returns true if IO was initiated, false if no IO was necessary.
1767 */
1768static bool
1769AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
1770{
1771 Buffer *buffers = &operation->buffers[0];
1772 int flags = operation->flags;
1773 BlockNumber blocknum = operation->blocknum;
1774 ForkNumber forknum = operation->forknum;
1775 char persistence = operation->persistence;
1776 int16 nblocks_done = operation->nblocks_done;
1777 Buffer *io_buffers = &operation->buffers[nblocks_done];
1778 int io_buffers_len = 0;
1779 PgAioHandle *ioh;
1780 uint32 ioh_flags = 0;
1781 void *io_pages[MAX_IO_COMBINE_LIMIT];
1782 IOContext io_context;
1783 IOObject io_object;
1784 bool did_start_io;
1785
1786 /*
1787 * When this IO is executed synchronously, either because the caller will
1788 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1789 * the AIO subsystem needs to know.
1790 */
1791 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1792 ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1793
1794 if (persistence == RELPERSISTENCE_TEMP)
1795 {
1796 io_context = IOCONTEXT_NORMAL;
1797 io_object = IOOBJECT_TEMP_RELATION;
1798 ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1799 }
1800 else
1801 {
1802 io_context = IOContextForStrategy(operation->strategy);
1803 io_object = IOOBJECT_RELATION;
1804 }
1805
1806 /*
1807 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1808 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1809 * set globally, but on a per-session basis. The completion callback,
1810 * which may be run in other processes, e.g. in IO workers, may have a
1811 * different value of the zero_damaged_pages GUC.
1812 *
1813 * XXX: We probably should eventually use a different flag for
1814 * zero_damaged_pages, so we can report different log levels / error codes
1815 * for zero_damaged_pages and ZERO_ON_ERROR.
1816 */
 1817 if (zero_damaged_pages || (flags & READ_BUFFERS_ZERO_ON_ERROR))
 1818 flags |= READ_BUFFERS_ZERO_ON_ERROR;
 1819
1820 /*
1821 * For the same reason as with zero_damaged_pages we need to use this
1822 * backend's ignore_checksum_failure value.
1823 */
 1824 if (ignore_checksum_failure)
 1825 flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
 1826
1827
1828 /*
1829 * To be allowed to report stats in the local completion callback we need
1830 * to prepare to report stats now. This ensures we can safely report the
1831 * checksum failure even in a critical section.
1832 */
1834
1835 /*
1836 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1837 * might block, which we don't want after setting IO_IN_PROGRESS.
1838 *
1839 * If we need to wait for IO before we can get a handle, submit
1840 * already-staged IO first, so that other backends don't need to wait.
1841 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1842 * wait for already submitted IO, which doesn't require additional locks,
1843 * but it could still cause undesirable waits.
1844 *
1845 * A secondary benefit is that this would allow us to measure the time in
1846 * pgaio_io_acquire() without causing undue timer overhead in the common,
1847 * non-blocking, case. However, currently the pgstats infrastructure
1848 * doesn't really allow that, as it a) asserts that an operation can't
1849 * have time without operations b) doesn't have an API to report
1850 * "accumulated" time.
1851 */
 1852 ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
 1853 if (unlikely(!ioh))
1854 {
 1855 pgaio_submit_staged();
 1856
 1857 ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
 1858 }
1859
1860 /*
1861 * Check if we can start IO on the first to-be-read buffer.
1862 *
1863 * If an I/O is already in progress in another backend, we want to wait
1864 * for the outcome: either done, or something went wrong and we will
1865 * retry.
1866 */
1867 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1868 {
1869 /*
1870 * Someone else has already completed this block, we're done.
1871 *
1872 * When IO is necessary, ->nblocks_done is updated in
1873 * ProcessReadBuffersResult(), but that is not called if no IO is
1874 * necessary. Thus update here.
1875 */
1876 operation->nblocks_done += 1;
1877 *nblocks_progress = 1;
1878
1879 pgaio_io_release(ioh);
1880 pgaio_wref_clear(&operation->io_wref);
1881 did_start_io = false;
1882
1883 /*
1884 * Report and track this as a 'hit' for this backend, even though it
1885 * must have started out as a miss in PinBufferForBlock(). The other
1886 * backend will track this as a 'read'.
1887 */
1888 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1889 operation->smgr->smgr_rlocator.locator.spcOid,
1890 operation->smgr->smgr_rlocator.locator.dbOid,
1891 operation->smgr->smgr_rlocator.locator.relNumber,
1892 operation->smgr->smgr_rlocator.backend,
1893 true);
1894
1895 if (persistence == RELPERSISTENCE_TEMP)
 1896 pgBufferUsage.local_blks_hit += 1;
 1897 else
 1898 pgBufferUsage.shared_blks_hit += 1;
 1899
1900 if (operation->rel)
1901 pgstat_count_buffer_hit(operation->rel);
1902
1903 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1904
1905 if (VacuumCostActive)
 1906 VacuumCostBalance += VacuumCostPageHit;
 1907 }
1908 else
1909 {
1910 instr_time io_start;
1911
1912 /* We found a buffer that we need to read in. */
1913 Assert(io_buffers[0] == buffers[nblocks_done]);
1914 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
1915 io_buffers_len = 1;
1916
1917 /*
1918 * How many neighboring-on-disk blocks can we scatter-read into other
1919 * buffers at the same time? In this case we don't wait if we see an
1920 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
1921 * head block, so we should get on with that I/O as soon as possible.
1922 */
1923 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
1924 {
1925 if (!ReadBuffersCanStartIO(buffers[i], true))
1926 break;
1927 /* Must be consecutive block numbers. */
1928 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
1929 BufferGetBlockNumber(buffers[i]) - 1);
1930 Assert(io_buffers[io_buffers_len] == buffers[i]);
1931
1932 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1933 }
1934
1935 /* get a reference to wait for in WaitReadBuffers() */
1936 pgaio_io_get_wref(ioh, &operation->io_wref);
1937
1938 /* provide the list of buffers to the completion callbacks */
1939 pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
1940
1942 persistence == RELPERSISTENCE_TEMP ?
1945 flags);
1946
1947 pgaio_io_set_flag(ioh, ioh_flags);
1948
1949 /* ---
1950 * Even though we're trying to issue IO asynchronously, track the time
1951 * in smgrstartreadv():
1952 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
1953 * immediately
1954 * - the io method might not support the IO (e.g. worker IO for a temp
1955 * table)
1956 * ---
1957 */
1959 smgrstartreadv(ioh, operation->smgr, forknum,
1960 blocknum + nblocks_done,
1961 io_pages, io_buffers_len);
1962 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1963 io_start, 1, io_buffers_len * BLCKSZ);
1964
1965 if (persistence == RELPERSISTENCE_TEMP)
1966 pgBufferUsage.local_blks_read += io_buffers_len;
1967 else
1968 pgBufferUsage.shared_blks_read += io_buffers_len;
1969
1970 /*
1971 * Track vacuum cost when issuing IO, not after waiting for it.
1972 * Otherwise we could end up issuing a lot of IO in a short timespan,
1973 * despite a low cost limit.
1974 */
1975 if (VacuumCostActive)
1976 VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1977
1978 *nblocks_progress = io_buffers_len;
1979 did_start_io = true;
1980 }
1981
1982 return did_start_io;
1983}
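/*
 * Rough sketch of the two-step read protocol driven by the routine above
 * (hypothetical caller, not part of this file; the ReadBuffersOperation is
 * assumed to have been initialized with the target smgr/relation, fork and
 * strategy as bufmgr.h requires, and flags are simply left at zero):
 */
static void
read_block_range(ReadBuffersOperation *op, BlockNumber start, int count)
{
	Buffer bufs[16];			/* illustrative fixed cap */
	int nblocks = count;

	Assert(count > 0 && count <= lengthof(bufs));

	/* Pin the buffers; a combined I/O may be started for consecutive blocks. */
	if (StartReadBuffers(op, bufs, start, &nblocks, 0))
		WaitReadBuffers(op);	/* only needed when I/O was actually started */

	/*
	 * bufs[0 .. nblocks - 1] are now pinned and valid; nblocks may come back
	 * smaller than count if the request was split.
	 */
}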
1984
1985/*
1986 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1987 * buffer. If no buffer exists already, selects a replacement victim and
1988 * evicts the old page, but does NOT read in new page.
1989 *
1990 * "strategy" can be a buffer replacement strategy object, or NULL for
1991 * the default strategy. The selected buffer's usage_count is advanced when
1992 * using the default strategy, but otherwise possibly not (see PinBuffer).
1993 *
1994 * The returned buffer is pinned and is already marked as holding the
1995 * desired page. If it already did have the desired page, *foundPtr is
1996 * set true. Otherwise, *foundPtr is set false.
1997 *
1998 * io_context is passed as an output parameter to avoid calling
1999 * IOContextForStrategy() when there is a shared buffers hit and no IO
2000 * statistics need be captured.
2001 *
2002 * No locks are held either at entry or exit.
2003 */
2005BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
2006 BlockNumber blockNum,
2007 BufferAccessStrategy strategy,
2008 bool *foundPtr, IOContext io_context)
2009{
2010 BufferTag newTag; /* identity of requested block */
2011 uint32 newHash; /* hash value for newTag */
2012 LWLock *newPartitionLock; /* buffer partition lock for it */
2013 int existing_buf_id;
2014 Buffer victim_buffer;
2015 BufferDesc *victim_buf_hdr;
2016 uint32 victim_buf_state;
2017
2018 /* Make sure we will have room to remember the buffer pin */
2021
2022 /* create a tag so we can lookup the buffer */
2023 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2024
2025 /* determine its hash code and partition lock ID */
2026 newHash = BufTableHashCode(&newTag);
2027 newPartitionLock = BufMappingPartitionLock(newHash);
2028
2029 /* see if the block is in the buffer pool already */
2030 LWLockAcquire(newPartitionLock, LW_SHARED);
2031 existing_buf_id = BufTableLookup(&newTag, newHash);
2032 if (existing_buf_id >= 0)
2033 {
2034 BufferDesc *buf;
2035 bool valid;
2036
2037 /*
2038 * Found it. Now, pin the buffer so no one can steal it from the
2039 * buffer pool, and check to see if the correct data has been loaded
2040 * into the buffer.
2041 */
2042 buf = GetBufferDescriptor(existing_buf_id);
2043
2044 valid = PinBuffer(buf, strategy);
2045
2046 /* Can release the mapping lock as soon as we've pinned it */
2047 LWLockRelease(newPartitionLock);
2048
2049 *foundPtr = true;
2050
2051 if (!valid)
2052 {
2053 /*
2054 * We can only get here if (a) someone else is still reading in
2055 * the page, (b) a previous read attempt failed, or (c) someone
2056 * called StartReadBuffers() but not yet WaitReadBuffers().
2057 */
2058 *foundPtr = false;
2059 }
2060
2061 return buf;
2062 }
2063
2064 /*
2065 * Didn't find it in the buffer pool. We'll have to initialize a new
2066 * buffer. Remember to unlock the mapping lock while doing the work.
2067 */
2068 LWLockRelease(newPartitionLock);
2069
2070 /*
2071 * Acquire a victim buffer. Somebody else might try to do the same, we
2072 * don't hold any conflicting locks. If so we'll have to undo our work
2073 * later.
2074 */
2075 victim_buffer = GetVictimBuffer(strategy, io_context);
2076 victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2077
2078 /*
2079 * Try to make a hashtable entry for the buffer under its new tag. If
2080 * somebody else inserted another buffer for the tag, we'll release the
2081 * victim buffer we acquired and use the already inserted one.
2082 */
2083 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2084 existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2085 if (existing_buf_id >= 0)
2086 {
2087 BufferDesc *existing_buf_hdr;
2088 bool valid;
2089
2090 /*
2091 * Got a collision. Someone has already done what we were about to do.
2092 * We'll just handle this as if it were found in the buffer pool in
2093 * the first place. First, give up the buffer we were planning to
2094 * use.
2095 *
2096 * We could do this after releasing the partition lock, but then we'd
2097 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2098 * before acquiring the lock, for the rare case of such a collision.
2099 */
2100 UnpinBuffer(victim_buf_hdr);
2101
2102 /*
2103 * The victim buffer we acquired previously is clean and unused, let
2104 * it be found again quickly
2105 */
2106 StrategyFreeBuffer(victim_buf_hdr);
2107
2108 /* remaining code should match code at top of routine */
2109
2110 existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2111
2112 valid = PinBuffer(existing_buf_hdr, strategy);
2113
2114 /* Can release the mapping lock as soon as we've pinned it */
2115 LWLockRelease(newPartitionLock);
2116
2117 *foundPtr = true;
2118
2119 if (!valid)
2120 {
2121 /*
2122 * We can only get here if (a) someone else is still reading in
2123 * the page, (b) a previous read attempt failed, or (c) someone
2124 * called StartReadBuffers() but not yet WaitReadBuffers().
2125 */
2126 *foundPtr = false;
2127 }
2128
2129 return existing_buf_hdr;
2130 }
2131
2132 /*
2133 * Need to lock the buffer header too in order to change its tag.
2134 */
2135 victim_buf_state = LockBufHdr(victim_buf_hdr);
2136
2137 /* some sanity checks while we hold the buffer header lock */
2138 Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2139 Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2140
2141 victim_buf_hdr->tag = newTag;
2142
2143 /*
2144 * Make sure BM_PERMANENT is set for buffers that must be written at every
2145 * checkpoint. Unlogged buffers only need to be written at shutdown
2146 * checkpoints, except for their "init" forks, which need to be treated
2147 * just like permanent relations.
2148 */
2149 victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2150 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2151 victim_buf_state |= BM_PERMANENT;
2152
2153 UnlockBufHdr(victim_buf_hdr, victim_buf_state);
2154
2155 LWLockRelease(newPartitionLock);
2156
2157 /*
2158 * Buffer contents are currently invalid.
2159 */
2160 *foundPtr = false;
2161
2162 return victim_buf_hdr;
2163}
2164
2165/*
2166 * InvalidateBuffer -- mark a shared buffer invalid and return it to the
2167 * freelist.
2168 *
2169 * The buffer header spinlock must be held at entry. We drop it before
2170 * returning. (This is sane because the caller must have locked the
2171 * buffer in order to be sure it should be dropped.)
2172 *
2173 * This is used only in contexts such as dropping a relation. We assume
2174 * that no other backend could possibly be interested in using the page,
2175 * so the only reason the buffer might be pinned is if someone else is
2176 * trying to write it out. We have to let them finish before we can
2177 * reclaim the buffer.
2178 *
2179 * The buffer could get reclaimed by someone else while we are waiting
2180 * to acquire the necessary locks; if so, don't mess it up.
2181 */
2182static void
2184{
2185 BufferTag oldTag;
2186 uint32 oldHash; /* hash value for oldTag */
2187 LWLock *oldPartitionLock; /* buffer partition lock for it */
2188 uint32 oldFlags;
2189 uint32 buf_state;
2190
2191 /* Save the original buffer tag before dropping the spinlock */
2192 oldTag = buf->tag;
2193
2194 buf_state = pg_atomic_read_u32(&buf->state);
2195 Assert(buf_state & BM_LOCKED);
2196 UnlockBufHdr(buf, buf_state);
2197
2198 /*
2199 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2200 * worth storing the hashcode in BufferDesc so we need not recompute it
2201 * here? Probably not.
2202 */
2203 oldHash = BufTableHashCode(&oldTag);
2204 oldPartitionLock = BufMappingPartitionLock(oldHash);
2205
2206retry:
2207
2208 /*
2209 * Acquire exclusive mapping lock in preparation for changing the buffer's
2210 * association.
2211 */
2212 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2213
2214 /* Re-lock the buffer header */
2215 buf_state = LockBufHdr(buf);
2216
2217 /* If it's changed while we were waiting for lock, do nothing */
2218 if (!BufferTagsEqual(&buf->tag, &oldTag))
2219 {
2220 UnlockBufHdr(buf, buf_state);
2221 LWLockRelease(oldPartitionLock);
2222 return;
2223 }
2224
2225 /*
2226 * We assume the reason for it to be pinned is that either we were
2227 * asynchronously reading the page in before erroring out or someone else
2228 * is flushing the page out. Wait for the IO to finish. (This could be
2229 * an infinite loop if the refcount is messed up... it would be nice to
2230 * time out after a while, but there seems no way to be sure how many loops
2231 * may be needed. Note that if the other guy has pinned the buffer but
2232 * not yet done StartBufferIO, WaitIO will fall through and we'll
2233 * effectively be busy-looping here.)
2234 */
2235 if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2236 {
2237 UnlockBufHdr(buf, buf_state);
2238 LWLockRelease(oldPartitionLock);
2239 /* safety check: should definitely not be our *own* pin */
2241 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2242 WaitIO(buf);
2243 goto retry;
2244 }
2245
2246 /*
2247 * Clear out the buffer's tag and flags. We must do this to ensure that
2248 * linear scans of the buffer array don't think the buffer is valid.
2249 */
2250 oldFlags = buf_state & BUF_FLAG_MASK;
2251 ClearBufferTag(&buf->tag);
2252 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2253 UnlockBufHdr(buf, buf_state);
2254
2255 /*
2256 * Remove the buffer from the lookup hashtable, if it was in there.
2257 */
2258 if (oldFlags & BM_TAG_VALID)
2259 BufTableDelete(&oldTag, oldHash);
2260
2261 /*
2262 * Done with mapping lock.
2263 */
2264 LWLockRelease(oldPartitionLock);
2265
2266 /*
2267 * Insert the buffer at the head of the list of free buffers.
2268 */
2270}
2271
2272/*
2273 * Helper routine for GetVictimBuffer()
2274 *
2275 * Needs to be called on a buffer with a valid tag, pinned, but without the
2276 * buffer header spinlock held.
2277 *
2278 * Returns true if the buffer can be reused, in which case the buffer is only
2279 * pinned by this backend and marked as invalid, false otherwise.
2280 */
2281static bool
2283{
2284 uint32 buf_state;
2285 uint32 hash;
2286 LWLock *partition_lock;
2287 BufferTag tag;
2288
2290
2291 /* have buffer pinned, so it's safe to read tag without lock */
2292 tag = buf_hdr->tag;
2293
2294 hash = BufTableHashCode(&tag);
2295 partition_lock = BufMappingPartitionLock(hash);
2296
2297 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2298
2299 /* lock the buffer header */
2300 buf_state = LockBufHdr(buf_hdr);
2301
2302 /*
2303 * We have the buffer pinned, so nobody else should have been able to
2304 * unset this concurrently.
2305 */
2306 Assert(buf_state & BM_TAG_VALID);
2307 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2308 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2309
2310 /*
2311 * If somebody else pinned the buffer since, or even worse, dirtied it,
2312 * give up on this buffer: It's clearly in use.
2313 */
2314 if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2315 {
2316 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2317
2318 UnlockBufHdr(buf_hdr, buf_state);
2319 LWLockRelease(partition_lock);
2320
2321 return false;
2322 }
2323
2324 /*
2325 * Clear out the buffer's tag and flags and usagecount. This is not
2326 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2327 * doing anything with the buffer. But currently it's beneficial, as the
2328 * cheaper pre-checks in several linear scans of shared buffers use the
2329 * tag (see e.g. FlushDatabaseBuffers()).
2330 */
2331 ClearBufferTag(&buf_hdr->tag);
2332 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2333 UnlockBufHdr(buf_hdr, buf_state);
2334
2335 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2336
2337 /* finally delete buffer from the buffer mapping table */
2338 BufTableDelete(&tag, hash);
2339
2340 LWLockRelease(partition_lock);
2341
2342 Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2343 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2345
2346 return true;
2347}
2348
2349static Buffer
2351{
2352 BufferDesc *buf_hdr;
2353 Buffer buf;
2354 uint32 buf_state;
2355 bool from_ring;
2356
2357 /*
2358 * Ensure, while the spinlock's not yet held, that there's a free refcount
2359 * entry, and a resource owner slot for the pin.
2360 */
2363
2364 /* we return here if a prospective victim buffer gets used concurrently */
2365again:
2366
2367 /*
2368 * Select a victim buffer. The buffer is returned with its header
2369 * spinlock still held!
2370 */
2371 buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2372 buf = BufferDescriptorGetBuffer(buf_hdr);
2373
2374 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
2375
2376 /* Pin the buffer and then release the buffer spinlock */
2377 PinBuffer_Locked(buf_hdr);
2378
2379 /*
2380 * We shouldn't have any other pins for this buffer.
2381 */
2383
2384 /*
2385 * If the buffer was dirty, try to write it out. There is a race
2386 * condition here, in that someone might dirty it after we released the
2387 * buffer header lock above, or even while we are writing it out (since
2388 * our share-lock won't prevent hint-bit updates). We will recheck the
2389 * dirty bit after re-locking the buffer header.
2390 */
2391 if (buf_state & BM_DIRTY)
2392 {
2393 LWLock *content_lock;
2394
2395 Assert(buf_state & BM_TAG_VALID);
2396 Assert(buf_state & BM_VALID);
2397
2398 /*
2399 * We need a share-lock on the buffer contents to write it out (else
2400 * we might write invalid data, eg because someone else is compacting
2401 * the page contents while we write). We must use a conditional lock
2402 * acquisition here to avoid deadlock. Even though the buffer was not
2403 * pinned (and therefore surely not locked) when StrategyGetBuffer
2404 * returned it, someone else could have pinned and exclusive-locked it
2405 * by the time we get here. If we try to get the lock unconditionally,
2406 * we'd block waiting for them; if they later block waiting for us,
2407 * deadlock ensues. (This has been observed to happen when two
2408 * backends are both trying to split btree index pages, and the second
2409 * one just happens to be trying to split the page the first one got
2410 * from StrategyGetBuffer.)
2411 */
2412 content_lock = BufferDescriptorGetContentLock(buf_hdr);
2413 if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2414 {
2415 /*
2416 * Someone else has locked the buffer, so give it up and loop back
2417 * to get another one.
2418 */
2419 UnpinBuffer(buf_hdr);
2420 goto again;
2421 }
2422
2423 /*
2424 * If using a nondefault strategy, and writing the buffer would
2425 * require a WAL flush, let the strategy decide whether to go ahead
2426 * and write/reuse the buffer or to choose another victim. We need a
2427 * lock to inspect the page LSN, so this can't be done inside
2428 * StrategyGetBuffer.
2429 */
2430 if (strategy != NULL)
2431 {
2432 XLogRecPtr lsn;
2433
2434 /* Read the LSN while holding buffer header lock */
2435 buf_state = LockBufHdr(buf_hdr);
2436 lsn = BufferGetLSN(buf_hdr);
2437 UnlockBufHdr(buf_hdr, buf_state);
2438
2439 if (XLogNeedsFlush(lsn)
2440 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2441 {
2442 LWLockRelease(content_lock);
2443 UnpinBuffer(buf_hdr);
2444 goto again;
2445 }
2446 }
2447
2448 /* OK, do the I/O */
2449 FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2450 LWLockRelease(content_lock);
2451
2453 &buf_hdr->tag);
2454 }
2455
2456
2457 if (buf_state & BM_VALID)
2458 {
2459 /*
2460 * When a BufferAccessStrategy is in use, blocks evicted from shared
2461 * buffers are counted as IOOP_EVICT in the corresponding context
2462 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2463 * strategy in two cases: 1) while initially claiming buffers for the
2464 * strategy ring, and 2) to replace an existing strategy ring buffer
2465 * because it is pinned or in use and cannot be reused.
2466 *
2467 * Blocks evicted from buffers already in the strategy ring are
2468 * counted as IOOP_REUSE in the corresponding strategy context.
2469 *
2470 * At this point, we can accurately count evictions and reuses,
2471 * because we have successfully claimed the valid buffer. Previously,
2472 * we may have been forced to release the buffer due to concurrent
2473 * pinners or erroring out.
2474 */
2476 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2477 }
2478
2479 /*
2480 * If the buffer has an entry in the buffer mapping table, delete it. This
2481 * can fail because another backend could have pinned or dirtied the
2482 * buffer.
2483 */
2484 if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2485 {
2486 UnpinBuffer(buf_hdr);
2487 goto again;
2488 }
2489
2490 /* a final set of sanity checks */
2491#ifdef USE_ASSERT_CHECKING
2492 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2493
2494 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2495 Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2496
2498#endif
2499
2500 return buf;
2501}
2502
2503/*
2504 * Return the maximum number of buffers that a backend should try to pin once,
2505 * to avoid exceeding its fair share. This is the highest value that
2506 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2507 * system with a very small buffer pool relative to max_connections.
2508 */
2509uint32
2511{
2512 return MaxProportionalPins;
2513}
2514
2515/*
2516 * Return the maximum number of additional buffers that this backend should
2517 * pin if it wants to stay under the per-backend limit, considering the number
2518 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2519 * return by this function can be zero.
2520 */
2521uint32
2523{
2524 uint32 estimated_pins_held;
2525
2526 /*
2527 * We get the number of "overflowed" pins for free, but don't know the
2528 * number of pins in PrivateRefCountArray. The cost of calculating that
2529 * exactly doesn't seem worth it, so just assume the max.
2530 */
2531 estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2532
2533 /* Is this backend already holding more than its fair share? */
2534 if (estimated_pins_held > MaxProportionalPins)
2535 return 0;
2536
2537 return MaxProportionalPins - estimated_pins_held;
2538}
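/*
 * Illustrative sketch (hypothetical helper, not part of this file): size a
 * batch of upcoming pins from the remaining per-backend allowance, falling
 * back to a single buffer so the caller can always make progress.
 */
static inline uint32
choose_batch_size(uint32 wanted)
{
	uint32 allowed = GetAdditionalPinLimit();

	return Max(1, Min(wanted, allowed));
}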
2539
2540/*
2541 * Limit the number of pins a batch operation may additionally acquire, to
2542 * avoid running out of pinnable buffers.
2543 *
2544 * One additional pin is always allowed, on the assumption that the operation
2545 * requires at least one to make progress.
2546 */
2547void
2549{
2550 uint32 limit;
2551
2552 if (*additional_pins <= 1)
2553 return;
2554
2555 limit = GetAdditionalPinLimit();
2556 limit = Max(limit, 1);
2557 if (limit < *additional_pins)
2558 *additional_pins = limit;
2559}
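/*
 * Sketch of typical use (hypothetical caller): clamp the number of extra
 * buffers a bulk operation is about to pin; as described above, the count
 * is never reduced below one.
 */
static uint32
clamp_extra_pins(uint32 desired)
{
	uint32 npins = desired;

	LimitAdditionalPins(&npins);
	return npins;
}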
2560
2561/*
2562 * Logic shared between ExtendBufferedRelBy() and ExtendBufferedRelTo(), just
2563 * to avoid duplicating the tracing and relpersistence-related logic.
2564 */
2565static BlockNumber
2567 ForkNumber fork,
2568 BufferAccessStrategy strategy,
2569 uint32 flags,
2570 uint32 extend_by,
2571 BlockNumber extend_upto,
2572 Buffer *buffers,
2573 uint32 *extended_by)
2574{
2575 BlockNumber first_block;
2576
2577 TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2582 extend_by);
2583
2584 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2585 first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2586 extend_by, extend_upto,
2587 buffers, &extend_by);
2588 else
2589 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2590 extend_by, extend_upto,
2591 buffers, &extend_by);
2592 *extended_by = extend_by;
2593
2594 TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2599 *extended_by,
2600 first_block);
2601
2602 return first_block;
2603}
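/*
 * Hypothetical caller sketch: extend a relation by one block through the
 * ExtendBufferedRelBy() entry point, which funnels into the function above.
 * BMR_REL() and EB_LOCK_FIRST come from bufmgr.h; the helper name is made up
 * for illustration.
 */
static Buffer
extend_by_one_block(Relation rel)
{
	Buffer buf;
	uint32 extended_by;

	(void) ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
							   NULL,	/* default strategy */
							   EB_LOCK_FIRST,
							   1, &buf, &extended_by);
	Assert(extended_by == 1);

	return buf;					/* pinned and exclusive-locked */
}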
2604
2605/*
2606 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2607 * shared buffers.
2608 */
2609static BlockNumber
2611 ForkNumber fork,
2612 BufferAccessStrategy strategy,
2613 uint32 flags,
2614 uint32 extend_by,
2615 BlockNumber extend_upto,
2616 Buffer *buffers,
2617 uint32 *extended_by)
2618{
2619 BlockNumber first_block;
2620 IOContext io_context = IOContextForStrategy(strategy);
2621 instr_time io_start;
2622
2623 LimitAdditionalPins(&extend_by);
2624
2625 /*
2626 * Acquire victim buffers for extension without holding extension lock.
2627 * Writing out victim buffers is the most expensive part of extending the
2628 * relation, particularly when doing so requires WAL flushes. Zeroing out
2629 * the buffers is also quite expensive, so do that before holding the
2630 * extension lock as well.
2631 *
2632 * These pages are pinned by us and not valid. While we hold the pin they
2633 * can't be acquired as victim buffers by another backend.
2634 */
2635 for (uint32 i = 0; i < extend_by; i++)
2636 {
2637 Block buf_block;
2638
2639 buffers[i] = GetVictimBuffer(strategy, io_context);
2640 buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2641
2642 /* new buffers are zero-filled */
2643 MemSet(buf_block, 0, BLCKSZ);
2644 }
2645
2646 /*
2647 * Lock relation against concurrent extensions, unless requested not to.
2648 *
2649 * We use the same extension lock for all forks. That's unnecessarily
2650 * restrictive, but currently extensions for forks don't happen often
2651 * enough to make it worth locking more granularly.
2652 *
2653 * Note that another backend might have extended the relation by the time
2654 * we get the lock.
2655 */
2656 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2658
2659 /*
2660 * If requested, invalidate size cache, so that smgrnblocks asks the
2661 * kernel.
2662 */
2663 if (flags & EB_CLEAR_SIZE_CACHE)
2665
2666 first_block = smgrnblocks(bmr.smgr, fork);
2667
2668 /*
2669 * Now that we have the accurate relation size, check if the caller wants
2670 * us to extend to only up to a specific size. If there were concurrent
2671 * extensions, we might have acquired too many buffers and need to release
2672 * them.
2673 */
2674 if (extend_upto != InvalidBlockNumber)
2675 {
2676 uint32 orig_extend_by = extend_by;
2677
2678 if (first_block > extend_upto)
2679 extend_by = 0;
2680 else if ((uint64) first_block + extend_by > extend_upto)
2681 extend_by = extend_upto - first_block;
2682
2683 for (uint32 i = extend_by; i < orig_extend_by; i++)
2684 {
2685 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2686
2687 /*
2688 * The victim buffer we acquired previously is clean and unused,
2689 * let it be found again quickly
2690 */
2691 StrategyFreeBuffer(buf_hdr);
2692 UnpinBuffer(buf_hdr);
2693 }
2694
2695 if (extend_by == 0)
2696 {
2697 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2699 *extended_by = extend_by;
2700 return first_block;
2701 }
2702 }
2703
2704 /* Fail if relation is already at maximum possible length */
2705 if ((uint64) first_block + extend_by >= MaxBlockNumber)
2706 ereport(ERROR,
2707 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2708 errmsg("cannot extend relation %s beyond %u blocks",
2709 relpath(bmr.smgr->smgr_rlocator, fork).str,
2710 MaxBlockNumber)));
2711
2712 /*
2713 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2714 *
2715 * This needs to happen before we extend the relation, because as soon as
2716 * we do, other backends can start to read in those pages.
2717 */
2718 for (uint32 i = 0; i < extend_by; i++)
2719 {
2720 Buffer victim_buf = buffers[i];
2721 BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2722 BufferTag tag;
2723 uint32 hash;
2724 LWLock *partition_lock;
2725 int existing_id;
2726
2727 /* in case we need to pin an existing buffer below */
2730
2731 InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2732 hash = BufTableHashCode(&tag);
2733 partition_lock = BufMappingPartitionLock(hash);
2734
2735 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2736
2737 existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2738
2739 /*
2740 * We get here only in the corner case where we are trying to extend
2741 * the relation but we found a pre-existing buffer. This can happen
2742 * because a prior attempt at extending the relation failed, and
2743 * because mdread doesn't complain about reads beyond EOF (when
2744 * zero_damaged_pages is ON) and so a previous attempt to read a block
2745 * beyond EOF could have left a "valid" zero-filled buffer.
2746 * Unfortunately, we have also seen this case occurring because of
2747 * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2748 * that doesn't account for a recent write. In that situation, the
2749 * pre-existing buffer would contain valid data that we don't want to
2750 * overwrite. Since the legitimate cases should always have left a
2751 * zero-filled buffer, complain if not PageIsNew.
2752 */
2753 if (existing_id >= 0)
2754 {
2755 BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2756 Block buf_block;
2757 bool valid;
2758
2759 /*
2760 * Pin the existing buffer before releasing the partition lock,
2761 * preventing it from being evicted.
2762 */
2763 valid = PinBuffer(existing_hdr, strategy);
2764
2765 LWLockRelease(partition_lock);
2766
2767 /*
2768 * The victim buffer we acquired previously is clean and unused,
2769 * let it be found again quickly
2770 */
2771 StrategyFreeBuffer(victim_buf_hdr);
2772 UnpinBuffer(victim_buf_hdr);
2773
2774 buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2775 buf_block = BufHdrGetBlock(existing_hdr);
2776
2777 if (valid && !PageIsNew((Page) buf_block))
2778 ereport(ERROR,
2779 (errmsg("unexpected data beyond EOF in block %u of relation %s",
2780 existing_hdr->tag.blockNum,
2781 relpath(bmr.smgr->smgr_rlocator, fork).str),
2782 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2783
2784 /*
2785 * We *must* do smgr[zero]extend before succeeding, else the page
2786 * will not be reserved by the kernel, and the next P_NEW call
2787 * will decide to return the same page. Clear the BM_VALID bit,
2788 * do StartBufferIO() and proceed.
2789 *
2790 * Loop to handle the very small possibility that someone re-sets
2791 * BM_VALID between our clearing it and StartBufferIO inspecting
2792 * it.
2793 */
2794 do
2795 {
2796 uint32 buf_state = LockBufHdr(existing_hdr);
2797
2798 buf_state &= ~BM_VALID;
2799 UnlockBufHdr(existing_hdr, buf_state);
2800 } while (!StartBufferIO(existing_hdr, true, false));
2801 }
2802 else
2803 {
2804 uint32 buf_state;
2805
2806 buf_state = LockBufHdr(victim_buf_hdr);
2807
2808 /* some sanity checks while we hold the buffer header lock */
2809 Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2810 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2811
2812 victim_buf_hdr->tag = tag;
2813
2814 buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2815 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2816 buf_state |= BM_PERMANENT;
2817
2818 UnlockBufHdr(victim_buf_hdr, buf_state);
2819
2820 LWLockRelease(partition_lock);
2821
2822 /* XXX: could combine the locked operations in it with the above */
2823 StartBufferIO(victim_buf_hdr, true, false);
2824 }
2825 }
2826
2828
2829 /*
2830 * Note: if smgrzeroextend fails, we will end up with buffers that are
2831 * allocated but not marked BM_VALID. The next relation extension will
2832 * still select the same block number (because the relation didn't get any
2833 * longer on disk) and so future attempts to extend the relation will find
2834 * the same buffers (if they have not been recycled) but come right back
2835 * here to try smgrzeroextend again.
2836 *
2837 * We don't need to set checksum for all-zero pages.
2838 */
2839 smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2840
2841 /*
2842 * Release the file-extension lock; it's now OK for someone else to extend
2843 * the relation some more.
2844 *
2845 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2846 * take noticeable time.
2847 */
2848 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2850
2852 io_start, 1, extend_by * BLCKSZ);
2853
2854 /* Set BM_VALID, terminate IO, and wake up any waiters */
2855 for (uint32 i = 0; i < extend_by; i++)
2856 {
2857 Buffer buf = buffers[i];
2858 BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2859 bool lock = false;
2860
2861 if (flags & EB_LOCK_FIRST && i == 0)
2862 lock = true;
2863 else if (flags & EB_LOCK_TARGET)
2864 {
2865 Assert(extend_upto != InvalidBlockNumber);
2866 if (first_block + i + 1 == extend_upto)
2867 lock = true;
2868 }
2869
2870 if (lock)
2872
2873 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2874 }
2875
2877
2878 *extended_by = extend_by;
2879
2880 return first_block;
2881}
2882
2883/*
2884 * BufferIsExclusiveLocked
2885 *
2886 * Checks if buffer is exclusive-locked.
2887 *
2888 * Buffer must be pinned.
2889 */
2890bool
2892{
2893 BufferDesc *bufHdr;
2894
2896
2897 if (BufferIsLocal(buffer))
2898 {
2899 /* Content locks are not maintained for local buffers. */
2900 return true;
2901 }
2902 else
2903 {
2904 bufHdr = GetBufferDescriptor(buffer - 1);
2906 LW_EXCLUSIVE);
2907 }
2908}
2909
2910/*
2911 * BufferIsDirty
2912 *
2913 * Checks if buffer is already dirty.
2914 *
2915 * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2916 * the result may be stale before it's returned.)
2917 */
2918bool
2920{
2921 BufferDesc *bufHdr;
2922
2924
2925 if (BufferIsLocal(buffer))
2926 {
2927 int bufid = -buffer - 1;
2928
2929 bufHdr = GetLocalBufferDescriptor(bufid);
2930 /* Content locks are not maintained for local buffers. */
2931 }
2932 else
2933 {
2934 bufHdr = GetBufferDescriptor(buffer - 1);
2936 LW_EXCLUSIVE));
2937 }
2938
2939 return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2940}
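/*
 * Minimal sketch (hypothetical helper): both checks above are intended
 * mainly for assertions in code that is about to modify a pinned,
 * exclusive-locked page.
 */
static void
mark_dirty_if_needed(Buffer buffer)
{
	Assert(BufferIsExclusiveLocked(buffer));
	if (!BufferIsDirty(buffer))
		MarkBufferDirty(buffer);
}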
2941
2942/*
2943 * MarkBufferDirty
2944 *
2945 * Marks buffer contents as dirty (actual write happens later).
2946 *
2947 * Buffer must be pinned and exclusive-locked. (If caller does not hold
2948 * exclusive lock, then somebody could be in process of writing the buffer,
2949 * leading to risk of bad data written to disk.)
2950 */
2951void
2953{
2954 BufferDesc *bufHdr;
2955 uint32 buf_state;
2956 uint32 old_buf_state;
2957
2958 if (!BufferIsValid(buffer))
2959 elog(ERROR, "bad buffer ID: %d", buffer);
2960
2961 if (BufferIsLocal(buffer))
2962 {
2964 return;
2965 }
2966
2967 bufHdr = GetBufferDescriptor(buffer - 1);
2968
2971 LW_EXCLUSIVE));
2972
2973 old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2974 for (;;)
2975 {
2976 if (old_buf_state & BM_LOCKED)
2977 old_buf_state = WaitBufHdrUnlocked(bufHdr);
2978
2979 buf_state = old_buf_state;
2980
2981 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2982 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2983
2984 if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2985 buf_state))
2986 break;
2987 }
2988
2989 /*
2990 * If the buffer was not dirty already, do vacuum accounting.
2991 */
2992 if (!(old_buf_state & BM_DIRTY))
2993 {
2995 if (VacuumCostActive)
2997 }
2998}
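/*
 * Sketch of the canonical call sequence around MarkBufferDirty()
 * (hypothetical caller; WAL logging and error handling are omitted): the
 * buffer is pinned by ReadBuffer() and must be exclusive-locked before the
 * page is modified and marked dirty.
 */
static void
touch_page(Relation rel, BlockNumber blkno)
{
	Buffer buf = ReadBuffer(rel, blkno);
	Page page;

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);

	/* ... modify "page" here and WAL-log the change as appropriate ... */

	MarkBufferDirty(buf);
	UnlockReleaseBuffer(buf);
}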
2999
3000/*
3001 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
3002 *
3003 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
3004 * compared to calling the two routines separately. Now it's mainly just
3005 * a convenience function. However, if the passed buffer is valid and
3006 * already contains the desired block, we just return it as-is; and that
3007 * does save considerable work compared to a full release and reacquire.
3008 *
3009 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3010 * buffer actually needs to be released. This case is the same as ReadBuffer,
3011 * but can save some tests in the caller.
3012 */
3013Buffer
3015 Relation relation,
3016 BlockNumber blockNum)
3017{
3018 ForkNumber forkNum = MAIN_FORKNUM;
3019 BufferDesc *bufHdr;
3020
3021 if (BufferIsValid(buffer))
3022 {
3024 if (BufferIsLocal(buffer))
3025 {
3026 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3027 if (bufHdr->tag.blockNum == blockNum &&
3028 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3029 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3030 return buffer;
3032 }
3033 else
3034 {
3035 bufHdr = GetBufferDescriptor(buffer - 1);
3036 /* we have pin, so it's ok to examine tag without spinlock */
3037 if (bufHdr->tag.blockNum == blockNum &&
3038 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3039 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3040 return buffer;
3041 UnpinBuffer(bufHdr);
3042 }
3043 }
3044
3045 return ReadBuffer(relation, blockNum);
3046}
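/*
 * Sketch of a typical scan loop (hypothetical caller): carrying the previous
 * buffer into ReleaseAndReadBuffer() avoids an unpin/re-pin whenever the loop
 * happens to request the block it already holds, and otherwise behaves like
 * ReleaseBuffer() followed by ReadBuffer().
 */
static void
scan_blocks(Relation rel, BlockNumber nblocks)
{
	Buffer buf = InvalidBuffer;

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		/* ... examine BufferGetPage(buf) under a content lock ... */
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}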
3047
3048/*
3049 * PinBuffer -- make buffer unavailable for replacement.
3050 *
3051 * For the default access strategy, the buffer's usage_count is incremented
3052 * when we first pin it; for other strategies we just make sure the usage_count
3053 * isn't zero. (The idea of the latter is that we don't want synchronized
3054 * heap scans to inflate the count, but we need it to not be zero to discourage
3055 * other backends from stealing buffers from our ring. As long as we cycle
3056 * through the ring faster than the global clock-sweep cycles, buffers in
3057 * our ring won't be chosen as victims for replacement by other backends.)
3058 *
3059 * This should be applied only to shared buffers, never local ones.
3060 *
3061 * Since buffers are pinned/unpinned very frequently, pin buffers without
3062 * taking the buffer header lock; instead update the state variable in loop of
3063 * CAS operations. Hopefully it's just a single CAS.
3064 *
3065 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3066 * must have been done already.
3067 *
3068 * Returns true if buffer is BM_VALID, else false. This provision allows
3069 * some callers to avoid an extra spinlock cycle.
3070 */
3071static bool
3073{
3075 bool result;
3077
3080
3081 ref = GetPrivateRefCountEntry(b, true);
3082
3083 if (ref == NULL)
3084 {
3085 uint32 buf_state;
3086 uint32 old_buf_state;
3087
3089
3090 old_buf_state = pg_atomic_read_u32(&buf->state);
3091 for (;;)
3092 {
3093 if (old_buf_state & BM_LOCKED)
3094 old_buf_state = WaitBufHdrUnlocked(buf);
3095
3096 buf_state = old_buf_state;
3097
3098 /* increase refcount */
3099 buf_state += BUF_REFCOUNT_ONE;
3100
3101 if (strategy == NULL)
3102 {
3103 /* Default case: increase usagecount unless already max. */
3105 buf_state += BUF_USAGECOUNT_ONE;
3106 }
3107 else
3108 {
3109 /*
3110 * Ring buffers shouldn't evict others from the pool. Thus we
3111 * don't make usagecount more than 1.
3112 */
3113 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3114 buf_state += BUF_USAGECOUNT_ONE;
3115 }
3116
3117 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3118 buf_state))
3119 {
3120 result = (buf_state & BM_VALID) != 0;
3121
3122 /*
3123 * Assume that we acquired a buffer pin for the purposes of
3124 * Valgrind buffer client checks (even in !result case) to
3125 * keep things simple. Buffers that are unsafe to access are
3126 * not generally guaranteed to be marked undefined or
3127 * non-accessible in any case.
3128 */
3130 break;
3131 }
3132 }
3133 }
3134 else
3135 {
3136 /*
3137 * If we previously pinned the buffer, it is likely to be valid, but
3138 * it may not be if StartReadBuffers() was called and
3139 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3140 * the flags without locking. This is racy, but it's OK to return
3141 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3142 * it'll see that it's now valid.
3143 *
3144 * Note: We deliberately avoid a Valgrind client request here.
3145 * Individual access methods can optionally superimpose buffer page
3146 * client requests on top of our client requests to enforce that
3147 * buffers are only accessed while locked (and pinned). It's possible
3148 * that the buffer page is legitimately non-accessible here. We
3149 * cannot meddle with that.
3150 */
3151 result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3152 }
3153
3154 ref->refcount++;
3155 Assert(ref->refcount > 0);
3157 return result;
3158}
3159
3160/*
3161 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3162 * The spinlock is released before return.
3163 *
3164 * As this function is called with the spinlock held, the caller has to
3165 * previously call ReservePrivateRefCountEntry() and
3166 * ResourceOwnerEnlarge(CurrentResourceOwner);
3167 *
3168 * Currently, no callers of this function want to modify the buffer's
3169 * usage_count at all, so there's no need for a strategy parameter.
3170 * Also we don't bother with a BM_VALID test (the caller could check that for
3171 * itself).
3172 *
3173 * Also all callers only ever use this function when it's known that the
3174 * buffer can't have a preexisting pin by this backend. That allows us to skip
3175 * searching the private refcount array & hash, which is a boon, because the
3176 * spinlock is still held.
3177 *
3178 * Note: use of this routine is frequently mandatory, not just an optimization
3179 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3180 * its state can change under us.
3181 */
3182static void
3184{
3185 Buffer b;
3187 uint32 buf_state;
3188
3189 /*
3190 * As explained, we don't expect any preexisting pins. That allows us to
3191 * manipulate the PrivateRefCount after releasing the spinlock.
3192 */
3194
3195 /*
3196 * Buffer can't have a preexisting pin, so mark its page as defined to
3197 * Valgrind (this is similar to the PinBuffer() case where the backend
3198 * doesn't already have a buffer pin)
3199 */
3201
3202 /*
3203 * Since we hold the buffer spinlock, we can update the buffer state and
3204 * release the lock in one operation.
3205 */
3206 buf_state = pg_atomic_read_u32(&buf->state);
3207 Assert(buf_state & BM_LOCKED);
3208 buf_state += BUF_REFCOUNT_ONE;
3209 UnlockBufHdr(buf, buf_state);
3210
3212
3214 ref->refcount++;
3215
3217}
3218
3219/*
3220 * Support for waking up another backend that is waiting for the cleanup lock
3221 * to be released using BM_PIN_COUNT_WAITER.
3222 *
3223 * See LockBufferForCleanup().
3224 *
3225 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3226 * not just reducing the backend-local pincount for the buffer).
3227 */
3228static void
3230{
3231 /*
3232 * Acquire the buffer header lock, re-check that there's a waiter. Another
3233 * backend could have unpinned this buffer, and already woken up the
3234 * waiter.
3235 *
3236 * There's no danger of the buffer being replaced after we unpinned it
3237 * above, as it's pinned by the waiter. The waiter removes
3238 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3239 * backend waking it up.
3240 */
3241 uint32 buf_state = LockBufHdr(buf);
3242
3243 if ((buf_state & BM_PIN_COUNT_WAITER) &&
3244 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3245 {
3246 /* we just released the last pin other than the waiter's */
3247 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3248
3249 buf_state &= ~BM_PIN_COUNT_WAITER;
3250 UnlockBufHdr(buf, buf_state);
3251 ProcSendSignal(wait_backend_pgprocno);
3252 }
3253 else
3254 UnlockBufHdr(buf, buf_state);
3255}
3256
3257/*
3258 * UnpinBuffer -- make buffer available for replacement.
3259 *
3260 * This should be applied only to shared buffers, never local ones. This
3261 * always adjusts CurrentResourceOwner.
3262 */
3263static void
3265{
3267
3270}
3271
3272static void
3274{
3277
3279
3280 /* not moving as we're likely deleting it soon anyway */
3281 ref = GetPrivateRefCountEntry(b, false);
3282 Assert(ref != NULL);
3283 Assert(ref->refcount > 0);
3284 ref->refcount--;
3285 if (ref->refcount == 0)
3286 {
3287 uint32 buf_state;
3288 uint32 old_buf_state;
3289
3290 /*
3291 * Mark buffer non-accessible to Valgrind.
3292 *
3293 * Note that the buffer may have already been marked non-accessible
3294 * within access method code that enforces that buffers are only
3295 * accessed while a buffer lock is held.
3296 */
3298
3299 /* I'd better not still hold the buffer content lock */
3301
3302 /*
3303 * Decrement the shared reference count.
3304 *
3305 * Since buffer spinlock holder can update status using just write,
3306 * it's not safe to use atomic decrement here; thus use a CAS loop.
3307 */
3308 old_buf_state = pg_atomic_read_u32(&buf->state);
3309 for (;;)
3310 {
3311 if (old_buf_state & BM_LOCKED)
3312 old_buf_state = WaitBufHdrUnlocked(buf);
3313
3314 buf_state = old_buf_state;
3315
3316 buf_state -= BUF_REFCOUNT_ONE;
3317
3318 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3319 buf_state))
3320 break;
3321 }
3322
3323 /* Support LockBufferForCleanup() */
3324 if (buf_state & BM_PIN_COUNT_WAITER)
3326
3328 }
3329}
3330
3331#define ST_SORT sort_checkpoint_bufferids
3332#define ST_ELEMENT_TYPE CkptSortItem
3333#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3334#define ST_SCOPE static
3335#define ST_DEFINE
3336#include "lib/sort_template.h"
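/*
 * Sketch of what the template include above is expected to generate (see
 * lib/sort_template.h for the exact expansion): a specialized sort routine
 * roughly of the form
 *
 *     static void sort_checkpoint_bufferids(CkptSortItem *begin, size_t n);
 *
 * which orders the checkpoint work items using ckpt_buforder_comparator().
 */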
3337
3338/*
3339 * BufferSync -- Write out all dirty buffers in the pool.
3340 *
3341 * This is called at checkpoint time to write out all dirty shared buffers.
3342 * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
3343 * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3344 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
3345 * unlogged buffers, which are otherwise skipped. The remaining flags
3346 * currently have no effect here.
3347 */
3348static void
3349BufferSync(int flags)
3350{
3351 uint32 buf_state;
3352 int buf_id;
3353 int num_to_scan;
3354 int num_spaces;
3355 int num_processed;
3356 int num_written;
3357 CkptTsStatus *per_ts_stat = NULL;
3358 Oid last_tsid;
3359 binaryheap *ts_heap;
3360 int i;
3361 int mask = BM_DIRTY;
3362 WritebackContext wb_context;
3363
3364 /*
3365 * Unless this is a shutdown checkpoint or we have been explicitly told,
3366 * we write only permanent, dirty buffers. But at shutdown or end of
3367 * recovery, we write all dirty buffers.
3368 */
3371 mask |= BM_PERMANENT;
3372
3373 /*
3374 * Loop over all buffers, and mark the ones that need to be written with
3375 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3376 * can estimate how much work needs to be done.
3377 *
3378 * This allows us to write only those pages that were dirty when the
3379 * checkpoint began, and not those that get dirtied while it proceeds.
3380 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3381 * later in this function, or by normal backends or the bgwriter cleaning
3382 * scan, the flag is cleared. Any buffer dirtied after this point won't
3383 * have the flag set.
3384 *
3385 * Note that if we fail to write some buffer, we may leave buffers with
3386 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3387 * certainly need to be written for the next checkpoint attempt, too.
3388 */
3389 num_to_scan = 0;
3390 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3391 {
3392 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3393
3394 /*
3395 * Header spinlock is enough to examine BM_DIRTY, see comment in
3396 * SyncOneBuffer.
3397 */
3398 buf_state = LockBufHdr(bufHdr);
3399
3400 if ((buf_state & mask) == mask)
3401 {
3402 CkptSortItem *item;
3403
3404 buf_state |= BM_CHECKPOINT_NEEDED;
3405
3406 item = &CkptBufferIds[num_to_scan++];
3407 item->buf_id = buf_id;
3408 item->tsId = bufHdr->tag.spcOid;
3409 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3410 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3411 item->blockNum = bufHdr->tag.blockNum;
3412 }
3413
3414 UnlockBufHdr(bufHdr, buf_state);
3415
3416 /* Check for barrier events in case NBuffers is large. */
3419 }
3420
3421 if (num_to_scan == 0)
3422 return; /* nothing to do */
3423
3425
3426 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3427
3428 /*
3429 * Sort buffers that need to be written to reduce the likelihood of random
3430 * IO. The sorting is also important for the implementation of balancing
3431 * writes between tablespaces. Without balancing writes we'd potentially
3432 * end up writing to the tablespaces one-by-one; possibly overloading the
3433 * underlying system.
3434 */
3435 sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3436
3437 num_spaces = 0;
3438
3439 /*
3440 * Allocate progress status for each tablespace with buffers that need to
3441 * be flushed. This requires the to-be-flushed array to be sorted.
3442 */
3443 last_tsid = InvalidOid;
3444 for (i = 0; i < num_to_scan; i++)
3445 {
3446 CkptTsStatus *s;
3447 Oid cur_tsid;
3448
3449 cur_tsid = CkptBufferIds[i].tsId;
3450
3451 /*
3452 * Grow array of per-tablespace status structs, every time a new
3453 * tablespace is found.
3454 */
3455 if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3456 {
3457 Size sz;
3458
3459 num_spaces++;
3460
3461 /*
3462 * Not worth adding grow-by-power-of-2 logic here - even with a
3463 * few hundred tablespaces this should be fine.
3464 */
3465 sz = sizeof(CkptTsStatus) * num_spaces;
3466
3467 if (per_ts_stat == NULL)
3468 per_ts_stat = (CkptTsStatus *) palloc(sz);
3469 else
3470 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3471
3472 s = &per_ts_stat[num_spaces - 1];
3473 memset(s, 0, sizeof(*s));
3474 s->tsId = cur_tsid;
3475
3476 /*
3477 * The first buffer in this tablespace. As CkptBufferIds is sorted
3478 * by tablespace all (s->num_to_scan) buffers in this tablespace
3479 * will follow afterwards.
3480 */
3481 s->index = i;
3482
3483 /*
3484 * progress_slice will be determined once we know how many buffers
3485 * are in each tablespace, i.e. after this loop.
3486 */
3487
3488 last_tsid = cur_tsid;
3489 }
3490 else
3491 {
3492 s = &per_ts_stat[num_spaces - 1];
3493 }
3494
3495 s->num_to_scan++;
3496
3497 /* Check for barrier events. */
3500 }
3501
3502 Assert(num_spaces > 0);
3503
3504 /*
3505 * Build a min-heap over the write-progress in the individual tablespaces,
3506 * and compute how large a portion of the total progress a single
3507 * processed buffer is.
3508 */
3509 ts_heap = binaryheap_allocate(num_spaces,
3511 NULL);
3512
3513 for (i = 0; i < num_spaces; i++)
3514 {
3515 CkptTsStatus *ts_stat = &per_ts_stat[i];
3516
3517 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3518
3519 binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3520 }
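 /*
  * Worked example (illustrative): with num_to_scan = 1000 buffers overall, a
  * tablespace holding 250 of them gets progress_slice = 1000 / 250 = 4.0, so
  * once all 250 of its buffers have been processed its progress reaches 1000,
  * the same endpoint as every other tablespace.
  */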
3521
3522 binaryheap_build(ts_heap);
3523
3524 /*
3525 * Iterate through to-be-checkpointed buffers and write the ones (still)
3526 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3527 * tablespaces; otherwise the sorting would lead to only one tablespace
3528 * receiving writes at a time, making inefficient use of the hardware.
3529 */
3530 num_processed = 0;
3531 num_written = 0;
3532 while (!binaryheap_empty(ts_heap))
3533 {
3534 BufferDesc *bufHdr = NULL;
3535 CkptTsStatus *ts_stat = (CkptTsStatus *)
3537
3538 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3539 Assert(buf_id != -1);
3540
3541 bufHdr = GetBufferDescriptor(buf_id);
3542
3543 num_processed++;
3544
3545 /*
3546 * We don't need to acquire the lock here, because we're only looking
3547 * at a single bit. It's possible that someone else writes the buffer
3548 * and clears the flag right after we check, but that doesn't matter
3549 * since SyncOneBuffer will then do nothing. However, there is a
3550 * further race condition: it's conceivable that between the time we
3551 * examine the bit here and the time SyncOneBuffer acquires the lock,
3552 * someone else not only wrote the buffer but replaced it with another
3553 * page and dirtied it. In that improbable case, SyncOneBuffer will
3554 * write the buffer though we didn't need to. It doesn't seem worth
3555 * guarding against this, though.
3556 */
3558 {
3559 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3560 {
3561 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3563 num_written++;
3564 }
3565 }
3566
3567 /*
3568 * Measure progress independently of actually having to flush the buffer;
3569 * otherwise the writes would become unbalanced.
3570 */
3571 ts_stat->progress += ts_stat->progress_slice;
3572 ts_stat->num_scanned++;
3573 ts_stat->index++;
3574
3575 /* Have all the buffers from the tablespace been processed? */
3576 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3577 {
3578 binaryheap_remove_first(ts_heap);
3579 }
3580 else
3581 {
3582 /* update heap with the new progress */
3583 binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3584 }
3585
3586 /*
3587 * Sleep to throttle our I/O rate.
3588 *
3589 * (This will check for barrier events even if it doesn't sleep.)
3590 */
3591 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3592 }
3593
3594 /*
3595 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3596 * IOContext will always be IOCONTEXT_NORMAL.
3597 */
3599
3600 pfree(per_ts_stat);
3601 per_ts_stat = NULL;
3602 binaryheap_free(ts_heap);
3603
3604 /*
3605 * Update checkpoint statistics. As noted above, this doesn't include
3606 * buffers written by other backends or bgwriter scan.
3607 */
3608 CheckpointStats.ckpt_bufs_written += num_written;
3609
3610 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3611}
3612
3613/*
3614 * BgBufferSync -- Write out some dirty buffers in the pool.
3615 *
3616 * This is called periodically by the background writer process.
3617 *
3618 * Returns true if it's appropriate for the bgwriter process to go into
3619 * low-power hibernation mode. (This happens if the strategy clock sweep
3620 * has been "lapped" and no buffer allocations have occurred recently,
3621 * or if the bgwriter has been effectively disabled by setting
3622 * bgwriter_lru_maxpages to 0.)
3623 */
3624bool
3626{
3627 /* info obtained from freelist.c */
3628 int strategy_buf_id;
3629 uint32 strategy_passes;
3630 uint32 recent_alloc;
3631
3632 /*
3633 * Information saved between calls so we can determine the strategy
3634 * point's advance rate and avoid scanning already-cleaned buffers.
3635 */
3636 static bool saved_info_valid = false;
3637 static int prev_strategy_buf_id;
3638 static uint32 prev_strategy_passes;
3639 static int next_to_clean;
3640 static uint32 next_passes;
3641
3642 /* Moving averages of allocation rate and clean-buffer density */
3643 static float smoothed_alloc = 0;
3644 static float smoothed_density = 10.0;
3645
3646 /* Potentially these could be tunables, but for now, not */
3647 float smoothing_samples = 16;
3648 float scan_whole_pool_milliseconds = 120000.0;
3649
3650 /* Used to compute how far we scan ahead */
3651 long strategy_delta;
3652 int bufs_to_lap;
3653 int bufs_ahead;
3654 float scans_per_alloc;
3655 int reusable_buffers_est;
3656 int upcoming_alloc_est;
3657 int min_scan_buffers;
3658
3659 /* Variables for the scanning loop proper */
3660 int num_to_scan;
3661 int num_written;
3662 int reusable_buffers;
3663
3664 /* Variables for final smoothed_density update */
3665 long new_strategy_delta;
3666 uint32 new_recent_alloc;
3667
3668 /*
3669 * Find out where the freelist clock sweep currently is, and how many
3670 * buffer allocations have happened since our last call.
3671 */
3672 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3673
3674 /* Report buffer alloc counts to pgstat */
3675 PendingBgWriterStats.buf_alloc += recent_alloc;
3676
3677 /*
3678 * If we're not running the LRU scan, just stop after doing the stats
3679 * stuff. We mark the saved state invalid so that we can recover sanely
3680 * if LRU scan is turned back on later.
3681 */
3682 if (bgwriter_lru_maxpages <= 0)
3683 {
3684 saved_info_valid = false;
3685 return true;
3686 }
3687
3688 /*
3689 * Compute strategy_delta = how many buffers have been scanned by the
3690 * clock sweep since last time. If first time through, assume none. Then
3691 * see if we are still ahead of the clock sweep, and if so, how many
3692 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3693 * weird-looking coding of xxx_passes comparisons are to avoid bogus
3694 * behavior when the passes counts wrap around.
3695 */
3696 if (saved_info_valid)
3697 {
3698 int32 passes_delta = strategy_passes - prev_strategy_passes;
3699
3700 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3701 strategy_delta += (long) passes_delta * NBuffers;
3702
3703 Assert(strategy_delta >= 0);
3704
3705 if ((int32) (next_passes - strategy_passes) > 0)
3706 {
3707 /* we're one pass ahead of the strategy point */
3708 bufs_to_lap = strategy_buf_id - next_to_clean;
3709#ifdef BGW_DEBUG
3710 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3711 next_passes, next_to_clean,
3712 strategy_passes, strategy_buf_id,
3713 strategy_delta, bufs_to_lap);
3714#endif
3715 }
3716 else if (next_passes == strategy_passes &&
3717 next_to_clean >= strategy_buf_id)
3718 {
3719 /* on same pass, but ahead or at least not behind */
3720 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3721#ifdef BGW_DEBUG
3722 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3723 next_passes, next_to_clean,
3724 strategy_passes, strategy_buf_id,
3725 strategy_delta, bufs_to_lap);
3726#endif
3727 }
3728 else
3729 {
3730 /*
3731 * We're behind, so skip forward to the strategy point and start
3732 * cleaning from there.
3733 */
3734#ifdef BGW_DEBUG
3735 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3736 next_passes, next_to_clean,
3737 strategy_passes, strategy_buf_id,
3738 strategy_delta);
3739#endif
3740 next_to_clean = strategy_buf_id;
3741 next_passes = strategy_passes;
3742 bufs_to_lap = NBuffers;
3743 }
3744 }
3745 else
3746 {
3747 /*
3748 * Initializing at startup or after LRU scanning had been off. Always
3749 * start at the strategy point.
3750 */
3751#ifdef BGW_DEBUG
3752 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3753 strategy_passes, strategy_buf_id);
3754#endif
3755 strategy_delta = 0;
3756 next_to_clean = strategy_buf_id;
3757 next_passes = strategy_passes;
3758 bufs_to_lap = NBuffers;
3759 }
3760
3761 /* Update saved info for next time */
3762 prev_strategy_buf_id = strategy_buf_id;
3763 prev_strategy_passes = strategy_passes;
3764 saved_info_valid = true;
3765
3766 /*
3767 * Compute how many buffers had to be scanned for each new allocation, ie,
3768 * 1/density of reusable buffers, and track a moving average of that.
3769 *
3770 * If the strategy point didn't move, we don't update the density estimate.
3771 */
3772 if (strategy_delta > 0 && recent_alloc > 0)
3773 {
3774 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3775 smoothed_density += (scans_per_alloc - smoothed_density) /
3776 smoothing_samples;
3777 }
3778
3779 /*
3780 * Estimate how many reusable buffers there are between the current
3781 * strategy point and where we've scanned ahead to, based on the smoothed
3782 * density estimate.
3783 */
3784 bufs_ahead = NBuffers - bufs_to_lap;
3785 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3786
3787 /*
3788 * Track a moving average of recent buffer allocations. Here, rather than
3789 * a true average we want a fast-attack, slow-decline behavior: we
3790 * immediately follow any increase, but decline only gradually afterwards.
3791 */
3792 if (smoothed_alloc <= (float) recent_alloc)
3793 smoothed_alloc = recent_alloc;
3794 else
3795 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3796 smoothing_samples;
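	/*
	 * Illustrative numbers (assumed): if smoothed_alloc is 50 and
	 * recent_alloc jumps to 200, smoothed_alloc snaps straight to 200; if
	 * allocations then stop, it decays by smoothed_alloc / smoothing_samples
	 * per cycle (12.5 on the first cycle for a smoothing_samples of 16), so
	 * the estimate stays conservative for a while after a burst.
	 */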
3797
3798 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3799 upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3800
3801 /*
3802 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3803 * eventually underflow to zero, and the underflows produce annoying
3804 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3805 * zero, there's no point in tracking smaller and smaller values of
3806 * smoothed_alloc, so just reset it to exactly zero to avoid this
3807 * syndrome. It will pop back up as soon as recent_alloc increases.
3808 */
3809 if (upcoming_alloc_est == 0)
3810 smoothed_alloc = 0;
3811
3812 /*
3813 * Even in cases where there's been little or no buffer allocation
3814 * activity, we want to make a small amount of progress through the buffer
3815 * cache so that as many reusable buffers as possible are clean after an
3816 * idle period.
3817 *
3818 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3819 * the BGW will be called during the scan_whole_pool time; slice the
3820 * buffer pool into that many sections.
3821 */
3822 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
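	/*
	 * Example with assumed settings: a 120000 ms whole-pool target and a
	 * 200 ms bgwriter delay give about 600 bgwriter rounds per sweep, so
	 * with NBuffers = 16384 (128MB of shared buffers at 8kB blocks)
	 * min_scan_buffers works out to roughly 27 buffers per round, even when
	 * nothing is being allocated.
	 */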
3823
3824 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3825 {
3826#ifdef BGW_DEBUG
3827 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3828 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3829#endif
3830 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3831 }
3832
3833 /*
3834 * Now write out dirty reusable buffers, working forward from the
3835 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3836 * enough buffers to match our estimate of the next cycle's allocation
3837 * requirements, or hit the bgwriter_lru_maxpages limit.
3838 */
3839
3840 num_to_scan = bufs_to_lap;
3841 num_written = 0;
3842 reusable_buffers = reusable_buffers_est;
3843
3844 /* Execute the LRU scan */
3845 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3846 {
3847 int sync_state = SyncOneBuffer(next_to_clean, true,
3848 wb_context);
3849
3850 if (++next_to_clean >= NBuffers)
3851 {
3852 next_to_clean = 0;
3853 next_passes++;
3854 }
3855 num_to_scan--;
3856
3857 if (sync_state & BUF_WRITTEN)
3858 {
3859 reusable_buffers++;
3860 if (++num_written >= bgwriter_lru_maxpages)
3861 {
3863 break;
3864 }
3865 }
3866 else if (sync_state & BUF_REUSABLE)
3867 reusable_buffers++;
3868 }
3869
3871
3872#ifdef BGW_DEBUG
3873 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3874 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3875 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3876 bufs_to_lap - num_to_scan,
3877 num_written,
3878 reusable_buffers - reusable_buffers_est);
3879#endif
3880
3881 /*
3882 * Consider the above scan as being like a new allocation scan.
3883 * Characterize its density and update the smoothed one based on it. This
3884 * effectively halves the moving average period in cases where both the
3885 * strategy and the background writer are doing some useful scanning,
3886 * which is helpful because a long memory isn't as desirable on the
3887 * density estimates.
3888 */
3889 new_strategy_delta = bufs_to_lap - num_to_scan;
3890 new_recent_alloc = reusable_buffers - reusable_buffers_est;
3891 if (new_strategy_delta > 0 && new_recent_alloc > 0)
3892 {
3893 scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3894 smoothed_density += (scans_per_alloc - smoothed_density) /
3895 smoothing_samples;
3896
3897#ifdef BGW_DEBUG
3898 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3899 new_recent_alloc, new_strategy_delta,
3900 scans_per_alloc, smoothed_density);
3901#endif
3902 }
3903
3904 /* Return true if OK to hibernate */
3905 return (bufs_to_lap == 0 && recent_alloc == 0);
3906}
3907
3908/*
3909 * SyncOneBuffer -- process a single buffer during syncing.
3910 *
3911 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3912 * buffers marked recently used, as these are not replacement candidates.
3913 *
3914 * Returns a bitmask containing the following flag bits:
3915 * BUF_WRITTEN: we wrote the buffer.
3916 * BUF_REUSABLE: buffer is available for replacement, ie, it has
3917 * pin count 0 and usage count 0.
3918 *
3919 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3920 * after locking it, but we don't care all that much.)
3921 */
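/*
 * The possible results, combining the two bits: 0 (skipped, or clean but in
 * use), BUF_REUSABLE alone, BUF_WRITTEN alone, or both together.  In the
 * bgwriter loop above, which passes skip_recently_used = true, a result
 * containing BUF_WRITTEN is counted toward reusable_buffers as well, since
 * in that mode a buffer is only written when it was also found reusable.
 */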
3922static int
3923SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3924{
3925 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3926 int result = 0;
3927 uint32 buf_state;
3928 BufferTag tag;
3929
3930 /* Make sure we can handle the pin */
3933
3934 /*
3935 * Check whether buffer needs writing.
3936 *
3937 * We can make this check without taking the buffer content lock so long
3938 * as we mark pages dirty in access methods *before* logging changes with
3939 * XLogInsert(): if someone marks the buffer dirty just after our check we
3940 * don't worry, because our checkpoint.redo points before the log record for
3941 * the upcoming changes, and so we are not required to write such a dirty buffer.
3942 */
3943 buf_state = LockBufHdr(bufHdr);
3944
3945 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3946 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3947 {
3948 result |= BUF_REUSABLE;
3949 }
3950 else if (skip_recently_used)
3951 {
3952 /* Caller told us not to write recently-used buffers */
3953 UnlockBufHdr(bufHdr, buf_state);
3954 return result;
3955 }
3956
3957 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3958 {
3959 /* It's clean, so nothing to do */
3960 UnlockBufHdr(bufHdr, buf_state);
3961 return result;
3962 }
3963
3964 /*
3965 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3966 * buffer is clean by the time we've locked it.)
3967 */
3968 PinBuffer_Locked(bufHdr);
3970
3972
3974
3975 tag = bufHdr->tag;
3976
3977 UnpinBuffer(bufHdr);
3978
3979 /*
3980 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3981 * IOContext will always be IOCONTEXT_NORMAL.
3982 */
3984
3985 return result | BUF_WRITTEN;
3986}
3987
3988/*
3989 * AtEOXact_Buffers - clean up at end of transaction.
3990 *
3991 * As of PostgreSQL 8.0, buffer pins should get released by the
3992 * ResourceOwner mechanism. This routine is just a debugging
3993 * cross-check that no pins remain.
3994 */
3995void
3996AtEOXact_Buffers(bool isCommit)
3997{
3999
4000 AtEOXact_LocalBuffers(isCommit);
4001
4003}
4004
4005/*
4006 * Initialize access to shared buffer pool
4007 *
4008 * This is called during backend startup (whether standalone or under the
4009 * postmaster). It sets up for this backend's access to the already-existing
4010 * buffer pool.
4011 */
4012void
4014{
4015 HASHCTL hash_ctl;
4016
4017 /*
4018 * An advisory limit on the number of pins each backend should hold, based
4019 * on shared_buffers and the maximum number of connections possible.
4020 * That's very pessimistic, but outside toy-sized shared_buffers it should
4021 * allow plenty of pins. LimitAdditionalPins() and
4022 * GetAdditionalPinLimit() can be used to check the remaining balance.
4023 */
4025
4026 memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
4027
4028 hash_ctl.keysize = sizeof(int32);
4029 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4030
4031 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4033
4034 /*
4035 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4036 * the corresponding phase of backend shutdown.
4037 */
4038 Assert(MyProc != NULL);
4040}
4041
4042/*
4043 * During backend exit, ensure that we released all shared-buffer locks and
4044 * assert that we have no remaining pins.
4045 */
4046static void
4048{
4049 UnlockBuffers();
4050
4052
4053 /* localbuf.c needs a chance too */
4055}
4056
4057/*
4058 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4059 *
4060 * As of PostgreSQL 8.0, buffer pins should get released by the
4061 * ResourceOwner mechanism. This routine is just a debugging
4062 * cross-check that no pins remain.
4063 */
4064static void
4066{
4067#ifdef USE_ASSERT_CHECKING
4068 int RefCountErrors = 0;
4070 int i;
4071 char *s;
4072
4073 /* check the array */
4074 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4075 {
4076 res = &PrivateRefCountArray[i];
4077
4078 if (res->buffer != InvalidBuffer)
4079 {
4081 elog(WARNING, "buffer refcount leak: %s", s);
4082 pfree(s);
4083
4084 RefCountErrors++;
4085 }
4086 }
4087
4088 /* if necessary search the hash */
4090 {
4091 HASH_SEQ_STATUS hstat;
4092
4094 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4095 {
4097 elog(WARNING, "buffer refcount leak: %s", s);
4098 pfree(s);
4099 RefCountErrors++;
4100 }
4101 }
4102
4103 Assert(RefCountErrors == 0);
4104#endif
4105}
4106
4107#ifdef USE_ASSERT_CHECKING
4108/*
4109 * Check for exclusive-locked catalog buffers. This is the core of
4110 * AssertCouldGetRelation().
4111 *
4112 * A backend would self-deadlock on LWLocks if the catalog scan read the
4113 * exclusive-locked buffer. The main threat is exclusive-locked buffers of
4114 * catalogs used in relcache, because a catcache search on any catalog may
4115 * build that catalog's relcache entry. We don't have an inventory of
4116 * catalogs relcache uses, so just check buffers of most catalogs.
4117 *
4118 * It's better to minimize waits while holding an exclusive buffer lock, so it
4119 * would be nice to broaden this check not to be catalog-specific. However,
4120 * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4121 * read tables. That is deadlock-free as long as there's no loop in the
4122 * dependency graph: modifying table A may cause an opclass to read table B,
4123 * but it must not cause a read of table A.
4124 */
4125void
4126AssertBufferLocksPermitCatalogRead(void)
4127{
4128 ForEachLWLockHeldByMe(AssertNotCatalogBufferLock, NULL);
4129}
4130
4131static void
4132AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
4133 void *unused_context)
4134{
4135 BufferDesc *bufHdr;
4136 BufferTag tag;
4137 Oid relid;
4138
4139 if (mode != LW_EXCLUSIVE)
4140 return;
4141
4142 if (!((BufferDescPadded *) lock > BufferDescriptors &&
4144 return; /* not a buffer lock */
4145
4146 bufHdr = (BufferDesc *)
4147 ((char *) lock - offsetof(BufferDesc, content_lock));
4148 tag = bufHdr->tag;
4149
4150 /*
4151 * This relNumber==relid assumption holds until a catalog experiences
4152 * VACUUM FULL or similar. After a command like that, relNumber will be
4153 * in the normal (non-catalog) range, and we lose the ability to detect
4154 * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4155 * close that gap, but RelidByRelfilenumber() might then deadlock with a
4156 * held lock.
4157 */
4158 relid = tag.relNumber;
4159
4160 if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4161 return;
4162
4164 /* Shared rels are always catalogs: detect even after VACUUM FULL. */
4165 Assert(tag.spcOid != GLOBALTABLESPACE_OID);
4166}
4167#endif
4168
4169
4170/*
4171 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4172 */
4173char *
4175{
4176 BufferDesc *buf;
4177 int32 loccount;
4178 char *result;
4179 ProcNumber backend;
4180 uint32 buf_state;
4181
4183 if (BufferIsLocal(buffer))
4184 {
4186 loccount = LocalRefCount[-buffer - 1];
4187 backend = MyProcNumber;
4188 }
4189 else
4190 {
4192 loccount = GetPrivateRefCount(buffer);
4193 backend = INVALID_PROC_NUMBER;
4194 }
4195
4196 /* theoretically we should lock the bufhdr here */
4197 buf_state = pg_atomic_read_u32(&buf->state);
4198
4199 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
4200 buffer,
4202 BufTagGetForkNum(&buf->tag)).str,
4203 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4204 BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4205 return result;
4206}
4207
4208/*
4209 * CheckPointBuffers
4210 *
4211 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4212 *
4213 * Note: temporary relations do not participate in checkpoints, so they don't
4214 * need to be flushed.
4215 */
4216void
4218{
4219 BufferSync(flags);
4220}
4221
4222/*
4223 * BufferGetBlockNumber
4224 * Returns the block number associated with a buffer.
4225 *
4226 * Note:
4227 * Assumes that the buffer is valid and pinned, else the
4228 * value may be obsolete immediately...
4229 */
4232{
4233 BufferDesc *bufHdr;
4234
4236
4237 if (BufferIsLocal(buffer))
4238 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4239 else
4240 bufHdr = GetBufferDescriptor(buffer - 1);
4241
4242 /* pinned, so OK to read tag without spinlock */
4243 return bufHdr->tag.blockNum;
4244}
4245
4246/*
4247 * BufferGetTag
4248 * Returns the relfilelocator, fork number and block number associated with
4249 * a buffer.
4250 */
4251void
4253 BlockNumber *blknum)
4254{
4255 BufferDesc *bufHdr;
4256
4257 /* Do the same checks as BufferGetBlockNumber. */
4259
4260 if (BufferIsLocal(buffer))
4261 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4262 else
4263 bufHdr = GetBufferDescriptor(buffer - 1);
4264
4265 /* pinned, so OK to read tag without spinlock */
4266 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4267 *forknum = BufTagGetForkNum(&bufHdr->tag);
4268 *blknum = bufHdr->tag.blockNum;
4269}
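/*
 * Minimal usage sketch (hypothetical caller; "buf" is assumed to be a valid,
 * pinned buffer):
 *
 *     RelFileLocator rlocator;
 *     ForkNumber     forknum;
 *     BlockNumber    blkno;
 *
 *     BufferGetTag(buf, &rlocator, &forknum, &blkno);
 *
 * All three outputs come from the buffer tag, so the caller must keep the
 * pin for them to remain meaningful.
 */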
4270
4271/*
4272 * FlushBuffer
4273 * Physically write out a shared buffer.
4274 *
4275 * NOTE: this actually just passes the buffer contents to the kernel; the
4276 * real write to disk won't happen until the kernel feels like it. This
4277 * is okay from our point of view since we can redo the changes from WAL.
4278 * However, we will need to force the changes to disk via fsync before
4279 * we can checkpoint WAL.
4280 *
4281 * The caller must hold a pin on the buffer and have share-locked the
4282 * buffer contents. (Note: a share-lock does not prevent updates of
4283 * hint bits in the buffer, so the page could change while the write
4284 * is in progress, but we assume that that will not invalidate the data
4285 * written.)
4286 *
4287 * If the caller has an smgr reference for the buffer's relation, pass it
4288 * as the second parameter. If not, pass NULL.
4289 */
4290static void
4292 IOContext io_context)
4293{
4294 XLogRecPtr recptr;
4295 ErrorContextCallback errcallback;
4296 instr_time io_start;
4297 Block bufBlock;
4298 char *bufToWrite;
4299 uint32 buf_state;
4300
4301 /*
4302 * Try to start an I/O operation. If StartBufferIO returns false, then
4303 * someone else flushed the buffer before we could, so we need not do
4304 * anything.
4305 */
4306 if (!StartBufferIO(buf, false, false))
4307 return;
4308
4309 /* Setup error traceback support for ereport() */
4311 errcallback.arg = buf;
4312 errcallback.previous = error_context_stack;
4313 error_context_stack = &errcallback;
4314
4315 /* Find smgr relation for buffer */
4316 if (reln == NULL)
4318
4319 TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4320 buf->tag.blockNum,
4324
4325 buf_state = LockBufHdr(buf);
4326
4327 /*
4328 * Run PageGetLSN while holding header lock, since we don't have the
4329 * buffer locked exclusively in all cases.
4330 */
4331 recptr = BufferGetLSN(buf);
4332
4333 /* To check if block content changes while flushing. - vadim 01/17/97 */
4334 buf_state &= ~BM_JUST_DIRTIED;
4335 UnlockBufHdr(buf, buf_state);
4336
4337 /*
4338 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4339 * rule that log updates must hit disk before any of the data-file changes
4340 * they describe do.
4341 *
4342 * However, this rule does not apply to unlogged relations, which will be
4343 * lost after a crash anyway. Most unlogged relation pages do not bear
4344 * LSNs since we never emit WAL records for them, and therefore flushing
4345 * up through the buffer LSN would be useless, but harmless. However,
4346 * GiST indexes use LSNs internally to track page-splits, and therefore
4347 * unlogged GiST pages bear "fake" LSNs generated by
4348 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4349 * LSN counter could advance past the WAL insertion point; and if it did
4350 * happen, attempting to flush WAL through that location would fail, with
4351 * disastrous system-wide consequences. To make sure that can't happen,
4352 * skip the flush if the buffer isn't permanent.
4353 */
4354 if (buf_state & BM_PERMANENT)
4355 XLogFlush(recptr);
4356
4357 /*
4358 * Now it's safe to write the buffer to disk. Note that no one else should
4359 * have been able to write it, while we were busy with log flushing,
4360 * because we got the exclusive right to perform I/O by setting the
4361 * BM_IO_IN_PROGRESS bit.
4362 */
4363 bufBlock = BufHdrGetBlock(buf);
4364
4365 /*
4366 * Update page checksum if desired. Since we have only shared lock on the
4367 * buffer, other processes might be updating hint bits in it, so we must
4368 * copy the page to private storage if we do checksumming.
4369 */
4370 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4371
4373
4374 /*
4375 * bufToWrite is either the shared buffer or a copy, as appropriate.
4376 */
4377 smgrwrite(reln,
4378 BufTagGetForkNum(&buf->tag),
4379 buf->tag.blockNum,
4380 bufToWrite,
4381 false);
4382
4383 /*
4384 * When a strategy is in use, only flushes of dirty buffers already in the
4385 * strategy ring are counted as strategy writes (IOCONTEXT
4386 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4387 * statistics tracking.
4388 *
4389 * If a shared buffer initially added to the ring must be flushed before
4390 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4391 *
4392 * If a shared buffer which was added to the ring later because the
4393 * current strategy buffer is pinned or in use or because all strategy
4394 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4395 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4396 * (from_ring will be false).
4397 *
4398 * When a strategy is not in use, the write can only be a "regular" write
4399 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4400 */
4402 IOOP_WRITE, io_start, 1, BLCKSZ);
4403
4405
4406 /*
4407 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4408 * end the BM_IO_IN_PROGRESS state.
4409 */
4410 TerminateBufferIO(buf, true, 0, true, false);
4411
4412 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4413 buf->tag.blockNum,
4417
4418 /* Pop the error context stack */
4419 error_context_stack = errcallback.previous;
4420}
4421
4422/*
4423 * RelationGetNumberOfBlocksInFork
4424 * Determines the current number of pages in the specified relation fork.
4425 *
4426 * Note that the accuracy of the result will depend on the details of the
4427 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4428 * it might not be.
4429 */
4432{
4433 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4434 {
4435 /*
4436 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4437 * tableam returns the size in bytes - but for the purpose of this
4438 * routine, we want the number of blocks. Therefore divide, rounding
4439 * up.
4440 */
4441 uint64 szbytes;
4442
4443 szbytes = table_relation_size(relation, forkNum);
4444
4445 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4446 }
4447 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4448 {
4449 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4450 }
4451 else
4452 Assert(false);
4453
4454 return 0; /* keep compiler quiet */
4455}
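/*
 * Worked example of the round-up division above, with assumed sizes: a
 * table AM reporting szbytes = 81920 with BLCKSZ = 8192 yields exactly
 * 10 blocks, while szbytes = 81921 yields (81921 + 8191) / 8192 = 11, so
 * a partially filled trailing block still counts as a block.
 */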
4456
4457/*
4458 * BufferIsPermanent
4459 * Determines whether a buffer will potentially still be around after
4460 * a crash. Caller must hold a buffer pin.
4461 */
4462bool
4464{
4465 BufferDesc *bufHdr;
4466
4467 /* Local buffers are used only for temp relations. */
4468 if (BufferIsLocal(buffer))
4469 return false;
4470
4471 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4474
4475 /*
4476 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4477 * need not bother with the buffer header spinlock. Even if someone else
4478 * changes the buffer header state while we're doing this, the state is
4479 * changed atomically, so we'll read the old value or the new value, but
4480 * not random garbage.
4481 */
4482 bufHdr = GetBufferDescriptor(buffer - 1);
4483 return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
4484}
4485
4486/*
4487 * BufferGetLSNAtomic
4488 * Retrieves the LSN of the buffer atomically using a buffer header lock.
4489 * This is necessary for some callers who may not have an exclusive lock
4490 * on the buffer.
4491 */
4494{
4495 char *page = BufferGetPage(buffer);
4496 BufferDesc *bufHdr;
4497 XLogRecPtr lsn;
4498 uint32 buf_state;
4499
4500 /*
4501 * If we don't need locking for correctness, fastpath out.
4502 */
4504 return PageGetLSN(page);
4505
4506 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4509
4510 bufHdr = GetBufferDescriptor(buffer - 1);
4511 buf_state = LockBufHdr(bufHdr);
4512 lsn = PageGetLSN(page);
4513 UnlockBufHdr(bufHdr, buf_state);
4514
4515 return lsn;
4516}
4517
4518/* ---------------------------------------------------------------------
4519 * DropRelationBuffers
4520 *
4521 * This function removes from the buffer pool all the pages of the
4522 * specified relation forks that have block numbers >= firstDelBlock.
4523 * (In particular, with firstDelBlock = 0, all pages are removed.)
4524 * Dirty pages are simply dropped, without bothering to write them
4525 * out first. Therefore, this is NOT rollback-able, and so should be
4526 * used only with extreme caution!
4527 *
4528 * Currently, this is called only from smgr.c when the underlying file
4529 * is about to be deleted or truncated (firstDelBlock is needed for
4530 * the truncation case). The data in the affected pages would therefore
4531 * be deleted momentarily anyway, and there is no point in writing it.
4532 * It is the responsibility of higher-level code to ensure that the
4533 * deletion or truncation does not lose any data that could be needed
4534 * later. It is also the responsibility of higher-level code to ensure
4535 * that no other process could be trying to load more pages of the
4536 * relation into buffers.
4537 * --------------------------------------------------------------------
4538 */
4539void
4541 int nforks, BlockNumber *firstDelBlock)
4542{
4543 int i;
4544 int j;
4545 RelFileLocatorBackend rlocator;
4546 BlockNumber nForkBlock[MAX_FORKNUM];
4547 uint64 nBlocksToInvalidate = 0;
4548
4549 rlocator = smgr_reln->smgr_rlocator;
4550
4551 /* If it's a local relation, it's localbuf.c's problem. */
4552 if (RelFileLocatorBackendIsTemp(rlocator))
4553 {
4554 if (rlocator.backend == MyProcNumber)
4555 {
4556 for (j = 0; j < nforks; j++)
4557 DropRelationLocalBuffers(rlocator.locator, forkNum[j],
4558 firstDelBlock[j]);
4559 }
4560 return;
4561 }
4562
4563 /*
4564 * To remove all the pages of the specified relation forks from the buffer
4565 * pool, we need to scan the entire buffer pool but we can optimize it by
4566 * finding the buffers from BufMapping table provided we know the exact
4567 * size of each fork of the relation. The exact size is required to ensure
4568 * that we don't leave any buffer for the relation being dropped as
4569 * otherwise the background writer or checkpointer can lead to a PANIC
4570 * error while flushing buffers corresponding to files that don't exist.
4571 *
4572 * To know the exact size, we rely on the size we cached for each fork
4573 * during recovery, which limits the optimization to recovery and to
4574 * standbys, but we can easily extend it once we have a shared cache for
4575 * relation sizes.
4576 *
4577 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4578 * and future writes keep the cached value up-to-date. See
4579 * smgrextend. It is possible that the value of the first lseek is smaller
4580 * than the actual number of existing blocks in the file due to buggy
4581 * Linux kernels that might not have accounted for the recent write. But
4582 * that should be fine because there must not be any buffers after that
4583 * file size.
4584 */
4585 for (i = 0; i < nforks; i++)
4586 {
4587 /* Get the number of blocks for a relation's fork */
4588 nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4589
4590 if (nForkBlock[i] == InvalidBlockNumber)
4591 {
4592 nBlocksToInvalidate = InvalidBlockNumber;
4593 break;
4594 }
4595
4596 /* calculate the number of blocks to be invalidated */
4597 nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4598 }
4599
4600 /*
4601 * We apply the optimization iff the total number of blocks to invalidate
4602 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4603 */
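	/*
	 * Rough cost comparison: the targeted path below does one BufMapping
	 * lookup per block to be invalidated, while the fallback scans all
	 * NBuffers headers regardless of relation size.  For a relation whose
	 * forks total, say, a few hundred blocks in a pool of tens of thousands
	 * of buffers, the lookup path therefore touches far fewer buffers.
	 */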
4604 if (BlockNumberIsValid(nBlocksToInvalidate) &&
4605 nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4606 {
4607 for (j = 0; j < nforks; j++)
4608 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4609 nForkBlock[j], firstDelBlock[j]);
4610 return;
4611 }
4612
4613 for (i = 0; i < NBuffers; i++)
4614 {
4615 BufferDesc *bufHdr = GetBufferDescriptor(i);
4616 uint32 buf_state;
4617
4618 /*
4619 * We can make this a tad faster by prechecking the buffer tag before
4620 * we attempt to lock the buffer; this saves a lot of lock
4621 * acquisitions in typical cases. It should be safe because the
4622 * caller must have AccessExclusiveLock on the relation, or some other
4623 * reason to be certain that no one is loading new pages of the rel
4624 * into the buffer pool. (Otherwise we might well miss such pages
4625 * entirely.) Therefore, while the tag might be changing while we
4626 * look at it, it can't be changing *to* a value we care about, only
4627 * *away* from such a value. So false negatives are impossible, and
4628 * false positives are safe because we'll recheck after getting the
4629 * buffer lock.
4630 *
4631 * We could check forkNum and blockNum as well as the rlocator, but
4632 * the incremental win from doing so seems small.
4633 */
4634 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4635 continue;
4636
4637 buf_state = LockBufHdr(bufHdr);
4638
4639 for (j = 0; j < nforks; j++)
4640 {
4641 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4642 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4643 bufHdr->tag.blockNum >= firstDelBlock[j])
4644 {
4645 InvalidateBuffer(bufHdr); /* releases spinlock */
4646 break;
4647 }
4648 }
4649 if (j >= nforks)
4650 UnlockBufHdr(bufHdr, buf_state);
4651 }
4652}
4653
4654/* ---------------------------------------------------------------------
4655 * DropRelationsAllBuffers
4656 *
4657 * This function removes from the buffer pool all the pages of all
4658 * forks of the specified relations. It's equivalent to calling
4659 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4660 * --------------------------------------------------------------------
4661 */
4662void
4663DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4664{
4665 int i;
4666 int n = 0;
4667 SMgrRelation *rels;
4668 BlockNumber (*block)[MAX_FORKNUM + 1];
4669 uint64 nBlocksToInvalidate = 0;
4670 RelFileLocator *locators;
4671 bool cached = true;
4672 bool use_bsearch;
4673
4674 if (nlocators == 0)
4675 return;
4676
4677 rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4678
4679 /* If it's a local relation, it's localbuf.c's problem. */
4680 for (i = 0; i < nlocators; i++)
4681 {
4682 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4683 {
4684 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4685 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4686 }
4687 else
4688 rels[n++] = smgr_reln[i];
4689 }
4690
4691 /*
4692 * If there are no non-local relations, then we're done. Release the
4693 * memory and return.
4694 */
4695 if (n == 0)
4696 {
4697 pfree(rels);
4698 return;
4699 }
4700
4701 /*
4702 * This is used to remember the number of blocks for all the relations
4703 * forks.
4704 */
4705 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4706 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4707
4708 /*
4709 * We can avoid scanning the entire buffer pool if we know the exact size
4710 * of each of the given relation forks. See DropRelationBuffers.
4711 */
4712 for (i = 0; i < n && cached; i++)
4713 {
4714 for (int j = 0; j <= MAX_FORKNUM; j++)
4715 {
4716 /* Get the number of blocks for a relation's fork. */
4717 block[i][j] = smgrnblocks_cached(rels[i], j);
4718
4719 /* We need to only consider the relation forks that exist. */
4720 if (block[i][j] == InvalidBlockNumber)
4721 {
4722 if (!smgrexists(rels[i], j))
4723 continue;
4724 cached = false;
4725 break;
4726 }
4727
4728 /* calculate the total number of blocks to be invalidated */
4729 nBlocksToInvalidate += block[i][j];
4730 }
4731 }
4732
4733 /*
4734 * We apply the optimization iff the total number of blocks to invalidate
4735 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4736 */
4737 if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4738 {
4739 for (i = 0; i < n; i++)
4740 {
4741 for (int j = 0; j <= MAX_FORKNUM; j++)
4742 {
4743 /* ignore relation forks that don't exist */
4744 if (!BlockNumberIsValid(block[i][j]))
4745 continue;
4746
4747 /* drop all the buffers for a particular relation fork */
4748 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4749 j, block[i][j], 0);
4750 }
4751 }
4752
4753 pfree(block);
4754 pfree(rels);
4755 return;
4756 }
4757
4758 pfree(block);
4759 locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4760 for (i = 0; i < n; i++)
4761 locators[i] = rels[i]->smgr_rlocator.locator;
4762
4763 /*
4764 * For a low number of relations to drop, just use a simple walk through to
4765 * save the bsearch overhead. The threshold to use is rather a guess than
4766 * an exactly determined value, as it depends on many factors (CPU and RAM
4767 * speeds, amount of shared buffers etc.).
4768 */
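	/*
	 * In rough complexity terms, for n relations: the linear path below does
	 * up to n tag comparisons per buffer, i.e. O(NBuffers * n), while sorting
	 * once and probing with bsearch costs O(n log n) plus O(NBuffers * log n)
	 * comparisons, which only pays off once n is large enough to amortize the
	 * sort and the extra per-probe overhead.
	 */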
4769 use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4770
4771 /* sort the list of rlocators if necessary */
4772 if (use_bsearch)
4773 qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4774
4775 for (i = 0; i < NBuffers; i++)
4776 {
4777 RelFileLocator *rlocator = NULL;
4778 BufferDesc *bufHdr = GetBufferDescriptor(i);
4779 uint32 buf_state;
4780
4781 /*
4782 * As in DropRelationBuffers, an unlocked precheck should be safe and
4783 * saves some cycles.
4784 */
4785
4786 if (!use_bsearch)
4787 {
4788 int j;
4789
4790 for (j = 0; j < n; j++)
4791 {
4792 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4793 {
4794 rlocator = &locators[j];
4795 break;
4796 }
4797 }
4798 }
4799 else
4800 {
4801 RelFileLocator locator;
4802
4803 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4804 rlocator = bsearch(&locator,
4805 locators, n, sizeof(RelFileLocator),
4807 }
4808
4809 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4810 if (rlocator == NULL)
4811 continue;
4812
4813 buf_state = LockBufHdr(bufHdr);
4814 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4815 InvalidateBuffer(bufHdr); /* releases spinlock */
4816 else
4817 UnlockBufHdr(bufHdr, buf_state);
4818 }
4819
4820 pfree(locators);
4821 pfree(rels);
4822}
4823
4824/* ---------------------------------------------------------------------
4825 * FindAndDropRelationBuffers
4826 *
4827 * This function performs a lookup in the BufMapping table and removes from
4828 * the buffer pool all the pages of the specified relation fork that have
4829 * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4830 * pages are removed.)
4831 * --------------------------------------------------------------------
4832 */
4833static void
4835 BlockNumber nForkBlock,
4836 BlockNumber firstDelBlock)
4837{
4838 BlockNumber curBlock;
4839
4840 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4841 {
4842 uint32 bufHash; /* hash value for tag */
4843 BufferTag bufTag; /* identity of requested block */
4844 LWLock *bufPartitionLock; /* buffer partition lock for it */
4845 int buf_id;
4846 BufferDesc *bufHdr;
4847 uint32 buf_state;
4848
4849 /* create a tag so we can lookup the buffer */
4850 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4851
4852 /* determine its hash code and partition lock ID */
4853 bufHash = BufTableHashCode(&bufTag);
4854 bufPartitionLock = BufMappingPartitionLock(bufHash);
4855
4856 /* Check that it is in the buffer pool. If not, do nothing. */
4857 LWLockAcquire(bufPartitionLock, LW_SHARED);
4858 buf_id = BufTableLookup(&bufTag, bufHash);
4859 LWLockRelease(bufPartitionLock);
4860
4861 if (buf_id < 0)
4862 continue;
4863
4864 bufHdr = GetBufferDescriptor(buf_id);
4865
4866 /*
4867 * We need to lock the buffer header and recheck if the buffer is
4868 * still associated with the same block because the buffer could be
4869 * evicted by some other backend loading blocks for a different
4870 * relation after we release the lock on the BufMapping table.
4871 */
4872 buf_state = LockBufHdr(bufHdr);
4873
4874 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4875 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4876 bufHdr->tag.blockNum >= firstDelBlock)
4877 InvalidateBuffer(bufHdr); /* releases spinlock */
4878 else
4879 UnlockBufHdr(bufHdr, buf_state);
4880 }
4881}
4882
4883/* ---------------------------------------------------------------------
4884 * DropDatabaseBuffers
4885 *
4886 * This function removes all the buffers in the buffer cache for a
4887 * particular database. Dirty pages are simply dropped, without
4888 * bothering to write them out first. This is used when we destroy a
4889 * database, to avoid trying to flush data to disk when the directory
4890 * tree no longer exists. Implementation is pretty similar to
4891 * DropRelationBuffers() which is for destroying just one relation.
4892 * --------------------------------------------------------------------
4893 */
4894void
4896{
4897 int i;
4898
4899 /*
4900 * We needn't consider local buffers, since by assumption the target
4901 * database isn't our own.
4902 */
4903
4904 for (i = 0; i < NBuffers; i++)
4905 {
4906 BufferDesc *bufHdr = GetBufferDescriptor(i);
4907 uint32 buf_state;
4908
4909 /*
4910 * As in DropRelationBuffers, an unlocked precheck should be safe and
4911 * saves some cycles.
4912 */
4913 if (bufHdr->tag.dbOid != dbid)
4914 continue;
4915
4916 buf_state = LockBufHdr(bufHdr);
4917 if (bufHdr->tag.dbOid == dbid)
4918 InvalidateBuffer(bufHdr); /* releases spinlock */
4919 else
4920 UnlockBufHdr(bufHdr, buf_state);
4921 }
4922}
4923
4924/* ---------------------------------------------------------------------
4925 * FlushRelationBuffers
4926 *
4927 * This function writes all dirty pages of a relation out to disk
4928 * (or more accurately, out to kernel disk buffers), ensuring that the
4929 * kernel has an up-to-date view of the relation.
4930 *
4931 * Generally, the caller should be holding AccessExclusiveLock on the
4932 * target relation to ensure that no other backend is busy dirtying
4933 * more blocks of the relation; the effects can't be expected to last
4934 * after the lock is released.
4935 *
4936 * XXX currently it sequentially searches the buffer pool, should be
4937 * changed to more clever ways of searching. This routine is not
4938 * used in any performance-critical code paths, so it's not worth
4939 * adding additional overhead to normal paths to make it go faster.
4940 * --------------------------------------------------------------------
4941 */
4942void
4944{
4945 int i;
4946 BufferDesc *bufHdr;
4947 SMgrRelation srel = RelationGetSmgr(rel);
4948
4949 if (RelationUsesLocalBuffers(rel))
4950 {
4951 for (i = 0; i < NLocBuffer; i++)
4952 {
4953 uint32 buf_state;
4954
4955 bufHdr = GetLocalBufferDescriptor(i);
4956 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4957 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4958 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4959 {
4960 ErrorContextCallback errcallback;
4961
4962 /* Setup error traceback support for ereport() */
4964 errcallback.arg = bufHdr;
4965 errcallback.previous = error_context_stack;
4966 error_context_stack = &errcallback;
4967
4968 /* Make sure we can handle the pin */
4971
4972 /*
4973 * Pin/unpin mostly to make valgrind work, but it also seems
4974 * like the right thing to do.
4975 */
4976 PinLocalBuffer(bufHdr, false);
4977
4978
4979 FlushLocalBuffer(bufHdr, srel);
4980
4982
4983 /* Pop the error context stack */
4984 error_context_stack = errcallback.previous;
4985 }
4986 }
4987
4988 return;
4989 }
4990
4991 for (i = 0; i < NBuffers; i++)
4992 {
4993 uint32 buf_state;
4994
4995 bufHdr = GetBufferDescriptor(i);
4996
4997 /*
4998 * As in DropRelationBuffers, an unlocked precheck should be safe and
4999 * saves some cycles.
5000 */
5001 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
5002 continue;
5003
5004 /* Make sure we can handle the pin */
5007
5008 buf_state = LockBufHdr(bufHdr);
5009 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5010 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5011 {
5012 PinBuffer_Locked(bufHdr);
5016 UnpinBuffer(bufHdr);
5017 }
5018 else
5019 UnlockBufHdr(bufHdr, buf_state);
5020 }
5021}
5022
5023/* ---------------------------------------------------------------------
5024 * FlushRelationsAllBuffers
5025 *
5026 * This function flushes out of the buffer pool all the pages of all
5027 * forks of the specified smgr relations. It's equivalent to calling
5028 * FlushRelationBuffers once per relation. The relations are assumed not
5029 * to use local buffers.
5030 * --------------------------------------------------------------------
5031 */
5032void
5034{
5035 int i;
5036 SMgrSortArray *srels;
5037 bool use_bsearch;
5038
5039 if (nrels == 0)
5040 return;
5041
5042 /* fill-in array for qsort */
5043 srels = palloc(sizeof(SMgrSortArray) * nrels);
5044
5045 for (i = 0; i < nrels; i++)
5046 {
5047 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5048
5049 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5050 srels[i].srel = smgrs[i];
5051 }
5052
5053 /*
5054 * Save the bsearch overhead for a low number of relations to sync. See
5055 * DropRelationsAllBuffers for details.
5056 */
5057 use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
5058
5059 /* sort the list of SMgrRelations if necessary */
5060 if (use_bsearch)
5061 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5062
5063 for (i = 0; i < NBuffers; i++)
5064 {
5065 SMgrSortArray *srelent = NULL;
5066 BufferDesc *bufHdr = GetBufferDescriptor(i);
5067 uint32 buf_state;
5068
5069 /*
5070 * As in DropRelationBuffers, an unlocked precheck should be safe and
5071 * saves some cycles.
5072 */
5073
5074 if (!use_bsearch)
5075 {
5076 int j;
5077
5078 for (j = 0; j < nrels; j++)
5079 {
5080 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5081 {
5082 srelent = &srels[j];
5083 break;
5084 }
5085 }
5086 }
5087 else
5088 {
5089 RelFileLocator rlocator;
5090
5091 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5092 srelent = bsearch(&rlocator,
5093 srels, nrels, sizeof(SMgrSortArray),
5095 }
5096
5097 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5098 if (srelent == NULL)
5099 continue;
5100
5101 /* Make sure we can handle the pin */
5104
5105 buf_state = LockBufHdr(bufHdr);
5106 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5107 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5108 {
5109 PinBuffer_Locked(bufHdr);
5113 UnpinBuffer(bufHdr);
5114 }
5115 else
5116 UnlockBufHdr(bufHdr, buf_state);
5117 }
5118
5119 pfree(srels);
5120}
5121
5122/* ---------------------------------------------------------------------
5123 * RelationCopyStorageUsingBuffer
5124 *
5125 * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
5126 * of using smgrread and smgrextend this will copy using bufmgr APIs.
5127 *
5128 * Refer comments atop CreateAndCopyRelationData() for details about
5129 * 'permanent' parameter.
5130 * --------------------------------------------------------------------
5131 */
5132static void
5134 RelFileLocator dstlocator,
5135 ForkNumber forkNum, bool permanent)
5136{
5137 Buffer srcBuf;
5138 Buffer dstBuf;
5139 Page srcPage;
5140 Page dstPage;
5141 bool use_wal;
5142 BlockNumber nblocks;
5143 BlockNumber blkno;
5145 BufferAccessStrategy bstrategy_src;
5146 BufferAccessStrategy bstrategy_dst;
5148 ReadStream *src_stream;
5149 SMgrRelation src_smgr;
5150
5151 /*
5152 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5153 * can skip it when copying any fork of an unlogged relation other than
5154 * the init fork.
5155 */
5156 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5157
5158 /* Get number of blocks in the source relation. */
5159 nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5160 forkNum);
5161
5162 /* Nothing to copy; just return. */
5163 if (nblocks == 0)
5164 return;
5165
5166 /*
5167 * Bulk extend the destination relation of the same size as the source
5168 * relation before starting to copy block by block.
5169 */
5170 memset(buf.data, 0, BLCKSZ);
5171 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5172 buf.data, true);
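	/*
	 * Writing one zeroed page at block nblocks - 1 is enough to make the
	 * destination fork nblocks blocks long, so the copy loop below overwrites
	 * existing blocks instead of extending the file one page at a time.
	 */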
5173
5174 /* This is a bulk operation, so use buffer access strategies. */
5175 bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5176 bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5177
5178 /* Initialize streaming read */
5179 p.current_blocknum = 0;
5180 p.last_exclusive = nblocks;
5181 src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5182
5183 /*
5184 * It is safe to use batchmode as block_range_read_stream_cb takes no
5185 * locks.
5186 */
5189 bstrategy_src,
5190 src_smgr,
5191 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5192 forkNum,
5194 &p,
5195 0);
5196
5197 /* Iterate over each block of the source relation file. */
5198 for (blkno = 0; blkno < nblocks; blkno++)
5199 {
5201
5202 /* Read block from source relation. */
5203 srcBuf = read_stream_next_buffer(src_stream, NULL);
5205 srcPage = BufferGetPage(srcBuf);
5206
5207 dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5208 BufferGetBlockNumber(srcBuf),
5209 RBM_ZERO_AND_LOCK, bstrategy_dst,
5210 permanent);
5211 dstPage = BufferGetPage(dstBuf);
5212
5214
5215 /* Copy page data from the source to the destination. */
5216 memcpy(dstPage, srcPage, BLCKSZ);
5217 MarkBufferDirty(dstBuf);
5218
5219 /* WAL-log the copied page. */
5220 if (use_wal)
5221 log_newpage_buffer(dstBuf, true);
5222
5224
5225 UnlockReleaseBuffer(dstBuf);
5226 UnlockReleaseBuffer(srcBuf);
5227 }
5228 Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5229 read_stream_end(src_stream);
5230
5231 FreeAccessStrategy(bstrategy_src);
5232 FreeAccessStrategy(bstrategy_dst);
5233}
5234
5235/* ---------------------------------------------------------------------
5236 * CreateAndCopyRelationData
5237 *
5238 * Create destination relation storage and copy all forks from the
5239 * source relation to the destination.
5240 *
5241 * Pass permanent as true for permanent relations and false for
5242 * unlogged relations. Currently this API is not supported for
5243 * temporary relations.
5244 * --------------------------------------------------------------------
5245 */
5246void
5248 RelFileLocator dst_rlocator, bool permanent)
5249{
5250 char relpersistence;
5251 SMgrRelation src_rel;
5252 SMgrRelation dst_rel;
5253
5254 /* Set the relpersistence. */
5255 relpersistence = permanent ?
5256 RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5257
5258 src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5259 dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5260
5261 /*
5262 * Create and copy all forks of the relation. During create database we
5263 * have a separate cleanup mechanism which deletes complete database
5264 * directory. Therefore, each individual relation doesn't need to be
5265 * registered for cleanup.
5266 */
5267 RelationCreateStorage(dst_rlocator, relpersistence, false);
5268
5269 /* copy main fork. */
5270 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5271 permanent);
5272
5273 /* copy those extra forks that exist */
5274 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5275 forkNum <= MAX_FORKNUM; forkNum++)
5276 {
5277 if (smgrexists(src_rel, forkNum))
5278 {
5279 smgrcreate(dst_rel, forkNum, false);
5280
5281 /*
5282 * WAL log creation if the relation is persistent, or this is the
5283 * init fork of an unlogged relation.
5284 */
5285 if (permanent || forkNum == INIT_FORKNUM)
5286 log_smgrcreate(&dst_rlocator, forkNum);
5287
5288 /* Copy a fork's data, block by block. */
5289 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5290 permanent);
5291 }
5292 }
5293}
5294
5295/* ---------------------------------------------------------------------
5296 * FlushDatabaseBuffers
5297 *
5298 * This function writes all dirty pages of a database out to disk
5299 * (or more accurately, out to kernel disk buffers), ensuring that the
5300 * kernel has an up-to-date view of the database.
5301 *
5302 * Generally, the caller should be holding an appropriate lock to ensure
5303 * no other backend is active in the target database; otherwise more
5304 * pages could get dirtied.
5305 *
5306 * Note we don't worry about flushing any pages of temporary relations.
5307 * It's assumed these wouldn't be interesting.
5308 * --------------------------------------------------------------------
5309 */
5310void
5312{
5313 int i;
5314 BufferDesc *bufHdr;
5315
5316 for (i = 0; i < NBuffers; i++)
5317 {
5318 uint32 buf_state;
5319
5320 bufHdr = GetBufferDescriptor(i);
5321
5322 /*
5323 * As in DropRelationBuffers, an unlocked precheck should be safe and
5324 * saves some cycles.
5325 */
5326 if (bufHdr->tag.dbOid != dbid)
5327 continue;
5328
5329 /* Make sure we can handle the pin */
5332
5333 buf_state = LockBufHdr(bufHdr);
5334 if (bufHdr->tag.dbOid == dbid &&
5335 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5336 {
5337 PinBuffer_Locked(bufHdr);
5341 UnpinBuffer(bufHdr);
5342 }
5343 else
5344 UnlockBufHdr(bufHdr, buf_state);
5345 }
5346}
5347
5348/*
5349 * Flush a previously locked (share or exclusive) and pinned buffer to the
5350 * OS.
5351 */
5352void
5354{
5355 BufferDesc *bufHdr;
5356
5357 /* currently not needed, but no fundamental reason not to support */
5359
5361
5362 bufHdr = GetBufferDescriptor(buffer - 1);
5363
5365
5367}
5368
5369/*
5370 * ReleaseBuffer -- release the pin on a buffer
5371 */
5372void
5374{
5375 if (!BufferIsValid(buffer))
5376 elog(ERROR, "bad buffer ID: %d", buffer);
5377
5378 if (BufferIsLocal(buffer))
5380 else
5382}
5383
5384/*
5385 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5386 *
5387 * This is just a shorthand for a common combination.
5388 */
5389void
5391{
5394}
5395
5396/*
5397 * IncrBufferRefCount
5398 * Increment the pin count on a buffer that we have *already* pinned
5399 * at least once.
5400 *
5401 * This function cannot be used on a buffer we do not have pinned,
5402 * because it doesn't change the shared buffer state.
5403 */
5404void
5406{
5409 if (BufferIsLocal(buffer))
5410 LocalRefCount[-buffer - 1]++;
5411 else
5412 {
5414
5415 ref = GetPrivateRefCountEntry(buffer, true);
5416 Assert(ref != NULL);
5417 ref->refcount++;
5418 }
5420}
5421
5422/*
5423 * MarkBufferDirtyHint
5424 *
5425 * Mark a buffer dirty for non-critical changes.
5426 *
5427 * This is essentially the same as MarkBufferDirty, except:
5428 *
5429 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5430 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5431 * 2. The caller might have only share-lock instead of exclusive-lock on the
5432 * buffer's content lock.
5433 * 3. This function does not guarantee that the buffer is always marked dirty
5434 * (due to a race condition), so it cannot be used for important changes.
5435 */
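/*
 * Typical call pattern (hypothetical caller; assumes "buffer" is pinned,
 * at least share-locked, and uses the standard page layout):
 *
 *     ItemIdMarkDead(itemid);             // cheap, non-critical change
 *     MarkBufferDirtyHint(buffer, true);  // buffer_std = true
 *
 * If the hint is lost because the page is never actually written out, the
 * only cost is redoing the same cheap work later.
 */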
5436void
5438{
5439 BufferDesc *bufHdr;
5440 Page page = BufferGetPage(buffer);
5441
5442 if (!BufferIsValid(buffer))
5443 elog(ERROR, "bad buffer ID: %d", buffer);
5444
5445 if (BufferIsLocal(buffer))
5446 {
5448 return;
5449 }
5450
5451 bufHdr = GetBufferDescriptor(buffer - 1);
5452
5454 /* here, either share or exclusive lock is OK */
5456
5457 /*
5458 * This routine might get called many times on the same page, if we are
5459 * making the first scan after commit of an xact that added/deleted many
5460 * tuples. So, be as quick as we can if the buffer is already dirty. We
5461 * do this by not acquiring spinlock if it looks like the status bits are
5462 * already set. Since we make this test unlocked, there's a chance we
5463 * might fail to notice that the flags have just been cleared, and fail
5464 * to reset them, due to memory-ordering issues. But since this function
5465 * is only intended to be used in cases where failing to write out the
5466 * data would be harmless anyway, it doesn't really matter.
5467 */
5468 if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5470 {
5472 bool dirtied = false;
5473 bool delayChkptFlags = false;
5474 uint32 buf_state;
5475
5476 /*
5477 * If we need to protect hint bit updates from torn writes, WAL-log a
5478 * full page image of the page. This full page image is only necessary
5479 * if the hint bit update is the first change to the page since the
5480 * last checkpoint.
5481 *
5482 * We don't check full_page_writes here because that logic is included
5483 * when we call XLogInsert() since the value changes dynamically.
5484 */
5485 if (XLogHintBitIsNeeded() &&
5487 {
5488 /*
5489 * If we must not write WAL, due to a relfilelocator-specific
5490 * condition or being in recovery, don't dirty the page. We can
5491 * set the hint, just not dirty the page as a result, so the hint
5492 * is lost when we evict the page or shut down.
5493 *
5494 * See src/backend/storage/page/README for longer discussion.
5495 */
5496 if (RecoveryInProgress() ||
5498 return;
5499
5500 /*
5501 * If the block is already dirty because we either made a change
5502 * or set a hint already, then we don't need to write a full page
5503 * image. Note that aggressive cleaning of blocks dirtied by hint
5504 * bit setting would increase the call rate. Bulk setting of hint
5505 * bits would reduce the call rate...
5506 *
5507 * We must issue the WAL record before we mark the buffer dirty.
5508 * Otherwise we might write the page before we write the WAL. That
5509 * causes a race condition, since a checkpoint might occur between
5510 * writing the WAL record and marking the buffer dirty. We solve
5511 * that with a kluge, but one that is already in use during
5512 * transaction commit to prevent race conditions. Basically, we
5513 * simply prevent the checkpoint WAL record from being written
5514 * until we have marked the buffer dirty. We don't start the
5515 * checkpoint flush until we have marked dirty, so our checkpoint
5516 * must flush the change to disk successfully or the checkpoint
5517 * never gets written, so crash recovery will fix things up.
5518 *
5519 * It's possible we may enter here without an xid, so it is
5520 * essential that CreateCheckPoint waits for virtual transactions
5521 * rather than full transactionids.
5522 */
5525 delayChkptFlags = true;
5526 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5527 }
5528
5529 buf_state = LockBufHdr(bufHdr);
5530
5531 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5532
5533 if (!(buf_state & BM_DIRTY))
5534 {
5535 dirtied = true; /* Means "will be dirtied by this action" */
5536
5537 /*
5538 * Set the page LSN if we wrote a backup block. We aren't supposed
5539 * to set this when only holding a share lock but as long as we
5540 * serialise it somehow we're OK. We choose to set LSN while
5541 * holding the buffer header lock, which causes any reader of an
5542 * LSN who holds only a share lock to also obtain a buffer header
5543 * lock before using PageGetLSN(), which is enforced in
5544 * BufferGetLSNAtomic().
5545 *
5546 * If checksums are enabled, you might think we should reset the
5547 * checksum here. That will happen when the page is written
5548 * sometime later in this checkpoint cycle.
5549 */
5550 if (!XLogRecPtrIsInvalid(lsn))
5551 PageSetLSN(page, lsn);
5552 }
5553
5554 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5555 UnlockBufHdr(bufHdr, buf_state);
5556
5557 if (delayChkptFlags)
5558 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5559
5560 if (dirtied)
5561 {
5563 if (VacuumCostActive)
5565 }
5566 }
5567}
5568
5569/*
5570 * Release buffer content locks for shared buffers.
5571 *
5572 * Used to clean up after errors.
5573 *
5574 * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5575 * of releasing buffer content locks per se; the only thing we need to deal
5576 * with here is clearing any PIN_COUNT request that was in progress.
5577 */
5578void
5580{
5582
5583 if (buf)
5584 {
5585 uint32 buf_state;
5586
5587 buf_state = LockBufHdr(buf);
5588
5589 /*
5590 * Don't complain if flag bit not set; it could have been reset but we
5591 * got a cancel/die interrupt before getting the signal.
5592 */
5593 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5594 buf->wait_backend_pgprocno == MyProcNumber)
5595 buf_state &= ~BM_PIN_COUNT_WAITER;
5596
5597 UnlockBufHdr(buf, buf_state);
5598
5599 PinCountWaitBuf = NULL;
5600 }
5601}
5602
5603/*
5604 * Acquire or release the content_lock for the buffer.
5605 */
5606void
5608{
5609 BufferDesc *buf;
5610
5612 if (BufferIsLocal(buffer))
5613 return; /* local buffers need no lock */
5614
5616
5617 if (mode == BUFFER_LOCK_UNLOCK)
5619 else if (mode == BUFFER_LOCK_SHARE)
5621 else if (mode == BUFFER_LOCK_EXCLUSIVE)
5623 else
5624 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5625}
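/*
 * Usage sketch (hypothetical caller; buffer already pinned):
 *
 *     LockBuffer(buffer, BUFFER_LOCK_SHARE);
 *     ... examine the page ...
 *     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 *
 * Local buffers fall through above without taking any lock, since only
 * the owning backend can access them.
 */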
5626
5627/*
5628 * Acquire the content_lock for the buffer, but only if we don't have to wait.
5629 *
5630 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5631 */
5632bool
5634{
5635 BufferDesc *buf;
5636
5638 if (BufferIsLocal(buffer))
5639 return true; /* act as though we got it */
5640
5642
5644 LW_EXCLUSIVE);
5645}
5646
5647/*
5648 * Verify that this backend is pinning the buffer exactly once.
5649 *
5650 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5651 * holds a pin on the buffer. We do not care whether some other backend does.
5652 */
5653void
5655{
5656 if (BufferIsLocal(buffer))
5657 {
5658 if (LocalRefCount[-buffer - 1] != 1)
5659 elog(ERROR, "incorrect local pin count: %d",
5660 LocalRefCount[-buffer - 1]);
5661 }
5662 else
5663 {
5664 if (GetPrivateRefCount(buffer) != 1)
5665 elog(ERROR, "incorrect local pin count: %d",
5667 }
5668}
5669
5670/*
5671 * LockBufferForCleanup - lock a buffer in preparation for deleting items
5672 *
5673 * Items may be deleted from a disk page only when the caller (a) holds an
5674 * exclusive lock on the buffer and (b) has observed that no other backend
5675 * holds a pin on the buffer. If there is a pin, then the other backend
5676 * might have a pointer into the buffer (for example, a heapscan reference
5677 * to an item --- see README for more details). It's OK if a pin is added
5678 * after the cleanup starts, however; the newly-arrived backend will be
5679 * unable to look at the page until we release the exclusive lock.
5680 *
5681 * To implement this protocol, a would-be deleter must pin the buffer and
5682 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5683 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5684 * it has successfully observed pin count = 1.
5685 */
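/*
 * Sketch of the protocol described above (hypothetical caller):
 *
 *     buf = ReadBuffer(rel, blkno);   // take a pin
 *     LockBufferForCleanup(buf);      // exclusive lock with sole pin
 *     ... delete items from the page ...
 *     UnlockReleaseBuffer(buf);
 *
 * The loop below re-checks the shared pin count each time it acquires the
 * exclusive lock, and sleeps as BM_PIN_COUNT_WAITER until other backends'
 * pins go away.
 */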
5686void
5688{
5689 BufferDesc *bufHdr;
5690 TimestampTz waitStart = 0;
5691 bool waiting = false;
5692 bool logged_recovery_conflict = false;
5693
5695 Assert(PinCountWaitBuf == NULL);
5696
5698
5699 /*
5700 * We do not yet need to worry about in-progress AIOs holding a pin,
5701 * as, so far, only reads are done via AIO and this function can
5702 * only be called once the buffer is valid (i.e. no read can be in
5703 * flight).
5704 */
5705
5706 /* Nobody else to wait for */
5707 if (BufferIsLocal(buffer))
5708 return;
5709
5710 bufHdr = GetBufferDescriptor(buffer - 1);
5711
5712 for (;;)
5713 {
5714 uint32 buf_state;
5715
5716 /* Try to acquire lock */
5718 buf_state = LockBufHdr(bufHdr);
5719
5720 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5721 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5722 {
5723 /* Successfully acquired exclusive lock with pincount 1 */
5724 UnlockBufHdr(bufHdr, buf_state);
5725
5726 /*
5727 * Emit the log message if recovery conflict on buffer pin was
5728 * resolved but the startup process waited longer than
5729 * deadlock_timeout for it.
5730 */
5731 if (logged_recovery_conflict)
5733 waitStart, GetCurrentTimestamp(),
5734 NULL, false);
5735
5736 if (waiting)
5737 {
5738 /* reset ps display to remove the suffix if we added one */
5740 waiting = false;
5741 }
5742 return;
5743 }
5744 /* Failed, so mark myself as waiting for pincount 1 */
5745 if (buf_state & BM_PIN_COUNT_WAITER)
5746 {
5747 UnlockBufHdr(bufHdr, buf_state);
5749 elog(ERROR, "multiple backends attempting to wait for pincount 1");
5750 }
5752 PinCountWaitBuf = bufHdr;
5753 buf_state |= BM_PIN_COUNT_WAITER;
5754 UnlockBufHdr(bufHdr, buf_state);
5756
5757 /* Wait to be signaled by UnpinBuffer() */
5758 if (InHotStandby)
5759 {
5760 if (!waiting)
5761 {
5762 /* adjust the process title to indicate that it's waiting */
5763 set_ps_display_suffix("waiting");
5764 waiting = true;
5765 }
5766
5767 /*
5768 * Emit the log message if the startup process is waiting longer
5769 * than deadlock_timeout for recovery conflict on buffer pin.
5770 *
5771 * Skip this on the first pass through the loop, because the startup
5772 * process has not started waiting yet at that point; the wait start
5773 * timestamp is only set after this check.
5774 */
5775 if (waitStart != 0 && !logged_recovery_conflict)
5776 {
5778
5779 if (TimestampDifferenceExceeds(waitStart, now,
5781 {
5783 waitStart, now, NULL, true);
5784 logged_recovery_conflict = true;
5785 }
5786 }
5787
5788 /*
5789 * Set the wait start timestamp if logging is enabled and first
5790 * time through.
5791 */
5792 if (log_recovery_conflict_waits && waitStart == 0)
5793 waitStart = GetCurrentTimestamp();
5794
5795 /* Publish the bufid that Startup process waits on */
5797 /* Set alarm and then wait to be signaled by UnpinBuffer() */
5799 /* Reset the published bufid */
5801 }
5802 else
5803 ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5804
5805 /*
5806 * Remove flag marking us as waiter. Normally this will not be set
5807 * anymore, but ProcWaitForSignal() can return for other signals as
5808 * well. We take care to only reset the flag if we're the waiter, as
5809 * theoretically another backend could have started waiting. That's
5810 * impossible with the current usages due to table level locking, but
5811 * better be safe.
5812 */
5813 buf_state = LockBufHdr(bufHdr);
5814 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5816 buf_state &= ~BM_PIN_COUNT_WAITER;
5817 UnlockBufHdr(bufHdr, buf_state);
5818
5819 PinCountWaitBuf = NULL;
5820 /* Loop back and try again */
5821 }
5822}
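
/*
 * Illustrative sketch (not part of bufmgr.c): the cleanup-lock protocol
 * described above, from a would-be deleter's point of view. The function
 * name "example_delete_items" is hypothetical; the page modification itself
 * is elided.
 */
static void
example_delete_items(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	/* blocks until we hold an exclusive lock and are the only pin holder */
	LockBufferForCleanup(buf);

	/* ... safe to delete items: no other backend can look at the page ... */

	UnlockReleaseBuffer(buf);	/* drop the content lock and the pin */
}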
5823
5824/*
5825 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5826 * requests cancellation of all pin holders that are blocking it.
5827 */
5828bool
5830{
5831 int bufid = GetStartupBufferPinWaitBufId();
5832
5833 /*
5834 * If we get woken slowly then it's possible that the Startup process was
5835 * already woken by other backends before we got here. It is also possible
5836 * that we get here via multiple interrupts or interrupts at inappropriate
5837 * times, so make sure we do nothing if the bufid is not set.
5838 */
5839 if (bufid < 0)
5840 return false;
5841
5842 if (GetPrivateRefCount(bufid + 1) > 0)
5843 return true;
5844
5845 return false;
5846}
5847
5848/*
5849 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5850 *
5851 * We won't loop, but just check once to see if the pin count is OK. If
5852 * not, return false with no lock held.
5853 */
5854bool
5856{
5857 BufferDesc *bufHdr;
5858 uint32 buf_state,
5859 refcount;
5860
5862
5863 /* see AIO related comment in LockBufferForCleanup() */
5864
5865 if (BufferIsLocal(buffer))
5866 {
5868 /* There should be exactly one pin */
5869 Assert(refcount > 0);
5870 if (refcount != 1)
5871 return false;
5872 /* Nobody else to wait for */
5873 return true;
5874 }
5875
5876 /* There should be exactly one local pin */
5879 if (refcount != 1)
5880 return false;
5881
5882 /* Try to acquire lock */
5884 return false;
5885
5886 bufHdr = GetBufferDescriptor(buffer - 1);
5887 buf_state = LockBufHdr(bufHdr);
5888 refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5889
5890 Assert(refcount > 0);
5891 if (refcount == 1)
5892 {
5893 /* Successfully acquired exclusive lock with pincount 1 */
5894 UnlockBufHdr(bufHdr, buf_state);
5895 return true;
5896 }
5897
5898 /* Failed, so release the lock */
5899 UnlockBufHdr(bufHdr, buf_state);
5901 return false;
5902}
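
/*
 * Illustrative sketch (not part of bufmgr.c): the opportunistic pattern used
 * by callers that can simply skip a page, such as vacuum's heap pass. "buf"
 * is assumed to be pinned by this backend; "example_try_cleanup" is
 * hypothetical.
 */
static bool
example_try_cleanup(Buffer buf)
{
	if (!ConditionalLockBufferForCleanup(buf))
		return false;			/* another backend holds a pin; skip page */

	/* ... perform cleanup, MarkBufferDirty(buf) if anything changed ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	return true;
}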
5903
5904/*
5905 * IsBufferCleanupOK - as above, but we already have the lock
5906 *
5907 * Check whether it's OK to perform cleanup on a buffer we've already
5908 * locked. If we observe that the pin count is 1, our exclusive lock
5909 * happens to be a cleanup lock, and we can proceed with anything that
5910 * would have been allowable had we sought a cleanup lock originally.
5911 */
5912bool
5914{
5915 BufferDesc *bufHdr;
5916 uint32 buf_state;
5917
5919
5920 /* see AIO related comment in LockBufferForCleanup() */
5921
5922 if (BufferIsLocal(buffer))
5923 {
5924 /* There should be exactly one pin */
5925 if (LocalRefCount[-buffer - 1] != 1)
5926 return false;
5927 /* Nobody else to wait for */
5928 return true;
5929 }
5930
5931 /* There should be exactly one local pin */
5932 if (GetPrivateRefCount(buffer) != 1)
5933 return false;
5934
5935 bufHdr = GetBufferDescriptor(buffer - 1);
5936
5937 /* caller must hold exclusive lock on buffer */
5939 LW_EXCLUSIVE));
5940
5941 buf_state = LockBufHdr(bufHdr);
5942
5943 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5944 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5945 {
5946 /* pincount is OK. */
5947 UnlockBufHdr(bufHdr, buf_state);
5948 return true;
5949 }
5950
5951 UnlockBufHdr(bufHdr, buf_state);
5952 return false;
5953}
5954
5955
5956/*
5957 * Functions for buffer I/O handling
5958 *
5959 * Also note that these are used only for shared buffers, not local ones.
5960 */
5961
5962/*
5963 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5964 */
5965static void
5967{
5969
5971 for (;;)
5972 {
5973 uint32 buf_state;
5974 PgAioWaitRef iow;
5975
5976 /*
5977 * It may not be necessary to acquire the spinlock to check the flag
5978 * here, but since this test is essential for correctness, we'd better
5979 * play it safe.
5980 */
5981 buf_state = LockBufHdr(buf);
5982
5983 /*
5984 * Copy the wait reference while holding the spinlock. This protects
5985 * against a concurrent TerminateBufferIO() in another backend
5986 * clearing the wref while it's being read.
5987 */
5988 iow = buf->io_wref;
5989 UnlockBufHdr(buf, buf_state);
5990
5991 /* no IO in progress, we don't need to wait */
5992 if (!(buf_state & BM_IO_IN_PROGRESS))
5993 break;
5994
5995 /*
5996 * The buffer has asynchronous IO in progress, wait for it to
5997 * complete.
5998 */
5999 if (pgaio_wref_valid(&iow))
6000 {
6001 pgaio_wref_wait(&iow);
6002
6003 /*
6004 * The AIO subsystem internally uses condition variables and thus
6005 * might remove this backend from the BufferDesc's CV. While that
6006 * wouldn't cause a correctness issue (the first CV sleep just
6007 * immediately returns if not already registered), it seems worth
6008 * avoiding unnecessary loop iterations, given that we take care
6009 * to do so at the start of the function.
6010 */
6012 continue;
6013 }
6014
6015 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
6016 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
6017 }
6019}
6020
6021/*
6022 * StartBufferIO: begin I/O on this buffer
6023 * (Assumptions)
6024 * My process is executing no IO on this buffer
6025 * The buffer is Pinned
6026 *
6027 * In some scenarios multiple backends could attempt the same I/O operation
6028 * concurrently. If someone else has already started I/O on this buffer then
6029 * we will wait for completion of the IO using WaitIO().
6030 *
6031 * Input operations are only attempted on buffers that are not BM_VALID,
6032 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
6033 * so we can always tell if the work is already done.
6034 *
6035 * Returns true if we successfully marked the buffer as I/O busy,
6036 * false if someone else already did the work.
6037 *
6038 * If nowait is true, then we don't wait for an I/O to be finished by another
6039 * backend. In that case, false indicates either that the I/O was already
6040 * finished, or is still in progress. This is useful for callers that want to
6041 * find out if they can perform the I/O as part of a larger operation, without
6042 * waiting for the answer or distinguishing the reasons why not.
6043 */
6044bool
6045StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
6046{
6047 uint32 buf_state;
6048
6050
6051 for (;;)
6052 {
6053 buf_state = LockBufHdr(buf);
6054
6055 if (!(buf_state & BM_IO_IN_PROGRESS))
6056 break;
6057 UnlockBufHdr(buf, buf_state);
6058 if (nowait)
6059 return false;
6060 WaitIO(buf);
6061 }
6062
6063 /* Once we get here, there is definitely no I/O active on this buffer */
6064
6065 /* Check if someone else already did the I/O */
6066 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6067 {
6068 UnlockBufHdr(buf, buf_state);
6069 return false;
6070 }
6071
6072 buf_state |= BM_IO_IN_PROGRESS;
6073 UnlockBufHdr(buf, buf_state);
6074
6077
6078 return true;
6079}
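
/*
 * Illustrative sketch (not part of bufmgr.c): the StartBufferIO() /
 * TerminateBufferIO() convention for a synchronous read of one shared
 * buffer. "buf_hdr" is assumed to be pinned; the actual smgr read is
 * elided; "example_sync_read" is hypothetical.
 */
static void
example_sync_read(BufferDesc *buf_hdr)
{
	if (!StartBufferIO(buf_hdr, true, false))
		return;					/* someone else already read the page in */

	/* ... read the block from disk into BufHdrGetBlock(buf_hdr) ... */

	/* mark the page valid and clear the I/O-in-progress state */
	TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
}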
6080
6081/*
6082 * TerminateBufferIO: release a buffer we were doing I/O on
6083 * (Assumptions)
6084 * My process is executing IO for the buffer
6085 * BM_IO_IN_PROGRESS bit is set for the buffer
6086 * The buffer is Pinned
6087 *
6088 * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
6089 * buffer's BM_DIRTY flag. This is appropriate when terminating a
6090 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
6091 * marking the buffer clean if it was re-dirtied while we were writing.
6092 *
6093 * set_flag_bits gets ORed into the buffer's flags. It must include
6094 * BM_IO_ERROR in a failure case. For successful completion it could
6095 * be 0, or BM_VALID if we just finished reading in the page.
6096 *
6097 * If forget_owner is true, we release the buffer I/O from the current
6098 * resource owner. (forget_owner=false is used when the resource owner itself
6099 * is being released)
6100 */
6101void
6102TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
6103 bool forget_owner, bool release_aio)
6104{
6105 uint32 buf_state;
6106
6107 buf_state = LockBufHdr(buf);
6108
6109 Assert(buf_state & BM_IO_IN_PROGRESS);
6110 buf_state &= ~BM_IO_IN_PROGRESS;
6111
6112 /* Clear earlier errors, if this IO failed, it'll be marked again */
6113 buf_state &= ~BM_IO_ERROR;
6114
6115 if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
6116 buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
6117
6118 if (release_aio)
6119 {
6120 /* release ownership by the AIO subsystem */
6121 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6122 buf_state -= BUF_REFCOUNT_ONE;
6123 pgaio_wref_clear(&buf->io_wref);
6124 }
6125
6126 buf_state |= set_flag_bits;
6127 UnlockBufHdr(buf, buf_state);
6128
6129 if (forget_owner)
6132
6134
6135 /*
6136 * Support LockBufferForCleanup()
6137 *
6138 * We may have just released the last pin other than the waiter's. In most
6139 * cases, this backend holds another pin on the buffer. But, if, for
6140 * example, this backend is completing an IO issued by another backend, it
6141 * may be time to wake the waiter.
6142 */
6143 if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
6145}
6146
6147/*
6148 * AbortBufferIO: Clean up active buffer I/O after an error.
6149 *
6150 * All LWLocks we might have held have been released,
6151 * but we haven't yet released buffer pins, so the buffer is still pinned.
6152 *
6153 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
6154 * possible the error condition wasn't related to the I/O.
6155 *
6156 * Note: this does not remove the buffer I/O from the resource owner.
6157 * That's correct when we're releasing the whole resource owner, but
6158 * beware if you use this in other contexts.
6159 */
6160static void
6162{
6163 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
6164 uint32 buf_state;
6165
6166 buf_state = LockBufHdr(buf_hdr);
6167 Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
6168
6169 if (!(buf_state & BM_VALID))
6170 {
6171 Assert(!(buf_state & BM_DIRTY));
6172 UnlockBufHdr(buf_hdr, buf_state);
6173 }
6174 else
6175 {
6176 Assert(buf_state & BM_DIRTY);
6177 UnlockBufHdr(buf_hdr, buf_state);
6178
6179 /* Issue notice if this is not the first failure... */
6180 if (buf_state & BM_IO_ERROR)
6181 {
6182 /* Buffer is pinned, so we can read tag without spinlock */
6184 (errcode(ERRCODE_IO_ERROR),
6185 errmsg("could not write block %u of %s",
6186 buf_hdr->tag.blockNum,
6188 BufTagGetForkNum(&buf_hdr->tag)).str),
6189 errdetail("Multiple failures --- write error might be permanent.")));
6190 }
6191 }
6192
6193 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
6194}
6195
6196/*
6197 * Error context callback for errors occurring during shared buffer writes.
6198 */
6199static void
6201{
6202 BufferDesc *bufHdr = (BufferDesc *) arg;
6203
6204 /* Buffer is pinned, so we can read the tag without locking the spinlock */
6205 if (bufHdr != NULL)
6206 errcontext("writing block %u of relation %s",
6207 bufHdr->tag.blockNum,
6209 BufTagGetForkNum(&bufHdr->tag)).str);
6210}
6211
6212/*
6213 * Error context callback for errors occurring during local buffer writes.
6214 */
6215static void
6217{
6218 BufferDesc *bufHdr = (BufferDesc *) arg;
6219
6220 if (bufHdr != NULL)
6221 errcontext("writing block %u of relation %s",
6222 bufHdr->tag.blockNum,
6225 BufTagGetForkNum(&bufHdr->tag)).str);
6226}
6227
6228/*
6229 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
6230 */
6231static int
6232rlocator_comparator(const void *p1, const void *p2)
6233{
6234 RelFileLocator n1 = *(const RelFileLocator *) p1;
6235 RelFileLocator n2 = *(const RelFileLocator *) p2;
6236
6237 if (n1.relNumber < n2.relNumber)
6238 return -1;
6239 else if (n1.relNumber > n2.relNumber)
6240 return 1;
6241
6242 if (n1.dbOid < n2.dbOid)
6243 return -1;
6244 else if (n1.dbOid > n2.dbOid)
6245 return 1;
6246
6247 if (n1.spcOid < n2.spcOid)
6248 return -1;
6249 else if (n1.spcOid > n2.spcOid)
6250 return 1;
6251 else
6252 return 0;
6253}
6254
6255/*
6256 * Lock buffer header - set BM_LOCKED in buffer state.
6257 */
6258uint32
6260{
6261 SpinDelayStatus delayStatus;
6262 uint32 old_buf_state;
6263
6265
6266 init_local_spin_delay(&delayStatus);
6267
6268 while (true)
6269 {
6270 /* set BM_LOCKED flag */
6271 old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
6272 /* if it wasn't set before we're OK */
6273 if (!(old_buf_state & BM_LOCKED))
6274 break;
6275 perform_spin_delay(&delayStatus);
6276 }
6277 finish_spin_delay(&delayStatus);
6278 return old_buf_state | BM_LOCKED;
6279}
6280
6281/*
6282 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
6283 * state at that point.
6284 *
6285 * Obviously the buffer could be locked by the time the value is returned, so
6286 * this is primarily useful in CAS style loops.
6287 */
6288static uint32
6290{
6291 SpinDelayStatus delayStatus;
6292 uint32 buf_state;
6293
6294 init_local_spin_delay(&delayStatus);
6295
6296 buf_state = pg_atomic_read_u32(&buf->state);
6297
6298 while (buf_state & BM_LOCKED)
6299 {
6300 perform_spin_delay(&delayStatus);
6301 buf_state = pg_atomic_read_u32(&buf->state);
6302 }
6303
6304 finish_spin_delay(&delayStatus);
6305
6306 return buf_state;
6307}
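
/*
 * Illustrative sketch (not part of bufmgr.c): the CAS-style loop mentioned
 * above, as used by the pin/unpin fast paths in this file. The computation
 * of the new state is elided; "example_cas_update" is hypothetical.
 */
static void
example_cas_update(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
	uint32		buf_state;

	for (;;)
	{
		/* wait, without taking the header lock, until BM_LOCKED clears */
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		buf_state = old_buf_state;

		/* ... derive the desired new state in buf_state ... */

		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   buf_state))
			break;				/* on failure, old_buf_state was refreshed */
	}
}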
6308
6309/*
6310 * BufferTag comparator.
6311 */
6312static inline int
6314{
6315 int ret;
6316 RelFileLocator rlocatora;
6317 RelFileLocator rlocatorb;
6318
6319 rlocatora = BufTagGetRelFileLocator(ba);
6320 rlocatorb = BufTagGetRelFileLocator(bb);
6321
6322 ret = rlocator_comparator(&rlocatora, &rlocatorb);
6323
6324 if (ret != 0)
6325 return ret;
6326
6327 if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
6328 return -1;
6329 if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
6330 return 1;
6331
6332 if (ba->blockNum < bb->blockNum)
6333 return -1;
6334 if (ba->blockNum > bb->blockNum)
6335 return 1;
6336
6337 return 0;
6338}
6339
6340/*
6341 * Comparator determining the writeout order in a checkpoint.
6342 *
6343 * It is important that tablespaces are compared first; the logic that
6344 * balances writes between tablespaces relies on it.
6345 */
6346static inline int
6348{
6349 /* compare tablespace */
6350 if (a->tsId < b->tsId)
6351 return -1;
6352 else if (a->tsId > b->tsId)
6353 return 1;
6354 /* compare relation */
6355 if (a->relNumber < b->relNumber)
6356 return -1;
6357 else if (a->relNumber > b->relNumber)
6358 return 1;
6359 /* compare fork */
6360 else if (a->forkNum < b->forkNum)
6361 return -1;
6362 else if (a->forkNum > b->forkNum)
6363 return 1;
6364 /* compare block number */
6365 else if (a->blockNum < b->blockNum)
6366 return -1;
6367 else if (a->blockNum > b->blockNum)
6368 return 1;
6369 /* equal page IDs are unlikely, but not impossible */
6370 return 0;
6371}
6372
6373/*
6374 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
6375 * progress.
6376 */
6377static int
6379{
6381 CkptTsStatus *sb = (CkptTsStatus *) b;
6382
6383 /* we want a min-heap, so return 1 when a < b */
6384 if (sa->progress < sb->progress)
6385 return 1;
6386 else if (sa->progress == sb->progress)
6387 return 0;
6388 else
6389 return -1;
6390}
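
/*
 * Illustrative sketch (not part of bufmgr.c): how BufferSync()-style code
 * builds a min-heap of per-tablespace progress with this comparator. The
 * "per_ts_stat" array and the name "example_ts_heap" are hypothetical.
 */
static void
example_ts_heap(CkptTsStatus *per_ts_stat, int num_spaces)
{
	binaryheap *ts_heap = binaryheap_allocate(num_spaces,
											  ts_ckpt_progress_comparator,
											  NULL);

	for (int i = 0; i < num_spaces; i++)
		binaryheap_add_unordered(ts_heap, PointerGetDatum(&per_ts_stat[i]));
	binaryheap_build(ts_heap);

	/* the heap's top is always the tablespace with the least progress */
	(void) DatumGetPointer(binaryheap_first(ts_heap));

	binaryheap_free(ts_heap);
}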
6391
6392/*
6393 * Initialize a writeback context, discarding potential previous state.
6394 *
6395 * *max_pending is a pointer instead of an immediate value, so the coalesce
6396 * limits can easily be changed by the GUC mechanism, and so calling code does
6397 * not have to check the current configuration. A value of 0 means that no
6398 * writeback control will be performed.
6399 */
6400void
6401WritebackContextInit(WritebackContext *context, int *max_pending)
6402{
6403 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6404
6405 context->max_pending = max_pending;
6406 context->nr_pending = 0;
6407}
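
/*
 * Illustrative sketch (not part of bufmgr.c): tying a writeback context to a
 * *_flush_after GUC and draining it, roughly as the checkpointer does. The
 * function name "example_writeback_usage" is hypothetical.
 */
static void
example_writeback_usage(BufferTag *tag)
{
	WritebackContext wb_context;

	WritebackContextInit(&wb_context, &checkpoint_flush_after);

	/* queue one buffer; flushes automatically once *max_pending is reached */
	ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, tag);

	/* hand any remaining requests to the kernel */
	IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
}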
6408
6409/*
6410 * Add buffer to list of pending writeback requests.
6411 */
6412void
6414 BufferTag *tag)
6415{
6416 PendingWriteback *pending;
6417
6418 /*
6419 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
6420 * point in tracking in that case.
6421 */
6423 !enableFsync)
6424 return;
6425
6426 /*
6427 * Add buffer to the pending writeback array, unless writeback control is
6428 * disabled.
6429 */
6430 if (*wb_context->max_pending > 0)
6431 {
6433
6434 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
6435
6436 pending->tag = *tag;
6437 }
6438
6439 /*
6440 * Perform pending flushes if the writeback limit is exceeded. This
6441 * includes the case where previously an item has been added, but control
6442 * is now disabled.
6443 */
6444 if (wb_context->nr_pending >= *wb_context->max_pending)
6445 IssuePendingWritebacks(wb_context, io_context);
6446}
6447
6448#define ST_SORT sort_pending_writebacks
6449#define ST_ELEMENT_TYPE PendingWriteback
6450#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
6451#define ST_SCOPE static
6452#define ST_DEFINE
6453#include "lib/sort_template.h"
6454
6455/*
6456 * Issue all pending writeback requests, previously scheduled with
6457 * ScheduleBufferTagForWriteback, to the OS.
6458 *
6459 * Because this is only used to improve the OS's I/O scheduling, we try
6460 * never to error out - it's just a hint.
6461 */
6462void
6464{
6465 instr_time io_start;
6466 int i;
6467
6468 if (wb_context->nr_pending == 0)
6469 return;
6470
6471 /*
6472 * Executing the writes in order can make them a lot faster, and allows
6473 * writeback requests for consecutive blocks to be merged into larger ones.
6474 */
6475 sort_pending_writebacks(wb_context->pending_writebacks,
6476 wb_context->nr_pending);
6477
6479
6480 /*
6481 * Coalesce neighbouring writes, but nothing else. For that we iterate
6482 * through the now-sorted array of pending flushes and look ahead to
6483 * find all neighbouring (or identical) writes.
6484 */
6485 for (i = 0; i < wb_context->nr_pending; i++)
6486 {
6489 SMgrRelation reln;
6490 int ahead;
6491 BufferTag tag;
6492 RelFileLocator currlocator;
6493 Size nblocks = 1;
6494
6495 cur = &wb_context->pending_writebacks[i];
6496 tag = cur->tag;
6497 currlocator = BufTagGetRelFileLocator(&tag);
6498
6499 /*
6500 * Peek ahead, into following writeback requests, to see if they can
6501 * be combined with the current one.
6502 */
6503 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6504 {
6505
6506 next = &wb_context->pending_writebacks[i + ahead + 1];
6507
6508 /* different file, stop */
6509 if (!RelFileLocatorEquals(currlocator,
6510 BufTagGetRelFileLocator(&next->tag)) ||
6511 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6512 break;
6513
6514 /* ok, block queued twice, skip */
6515 if (cur->tag.blockNum == next->tag.blockNum)
6516 continue;
6517
6518 /* only merge consecutive writes */
6519 if (cur->tag.blockNum + 1 != next->tag.blockNum)
6520 break;
6521
6522 nblocks++;
6523 cur = next;
6524 }
6525
6526 i += ahead;
6527
6528 /* and finally tell the kernel to write the data to storage */
6529 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6530 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6531 }
6532
6533 /*
6534 * Assume that writeback requests are only issued for buffers containing
6535 * blocks of permanent relations.
6536 */
6538 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
6539
6540 wb_context->nr_pending = 0;
6541}
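
/*
 * Illustrative sketch (not part of bufmgr.c): the coalescing rule above,
 * applied to an already-sorted list of block numbers within one fork.
 * Blocks 10, 11, 12 form one run and block 17 another, so two writeback
 * requests would be issued. "example_coalesce_runs" is hypothetical.
 */
static void
example_coalesce_runs(const BlockNumber *blocks, int nblocks)
{
	int			i = 0;

	while (i < nblocks)
	{
		BlockNumber start = blocks[i];
		Size		run = 1;

		/* extend the run over duplicate or directly adjacent blocks */
		while (i + 1 < nblocks &&
			   (blocks[i + 1] == blocks[i] ||
				blocks[i + 1] == blocks[i] + 1))
		{
			if (blocks[i + 1] == blocks[i] + 1)
				run++;
			i++;
		}
		i++;

		/* here IssuePendingWritebacks() issues smgrwriteback(..., start, run) */
		(void) start;
		(void) run;
	}
}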
6542
6543/* ResourceOwner callbacks */
6544
6545static void
6547{
6549
6551}
6552
6553static char *
6555{
6557
6558 return psprintf("lost track of buffer IO on buffer %d", buffer);
6559}
6560
6561static void
6563{
6565
6566 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6567 if (!BufferIsValid(buffer))
6568 elog(ERROR, "bad buffer ID: %d", buffer);
6569
6570 if (BufferIsLocal(buffer))
6572 else
6574}
6575
6576static char *
6578{
6580}
6581
6582/*
6583 * Helper function to evict unpinned buffer whose buffer header lock is
6584 * already acquired.
6585 */
6586static bool
6587EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
6588{
6589 uint32 buf_state;
6590 bool result;
6591
6592 *buffer_flushed = false;
6593
6594 buf_state = pg_atomic_read_u32(&(desc->state));
6595 Assert(buf_state & BM_LOCKED);
6596
6597 if ((buf_state & BM_VALID) == 0)
6598 {
6599 UnlockBufHdr(desc, buf_state);
6600 return false;
6601 }
6602
6603 /* Check that it's not pinned already. */
6604 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6605 {
6606 UnlockBufHdr(desc, buf_state);
6607 return false;
6608 }
6609
6610 PinBuffer_Locked(desc); /* releases spinlock */
6611
6612 /* If it was dirty, try to clean it once. */
6613 if (buf_state & BM_DIRTY)
6614 {
6617 *buffer_flushed = true;
6619 }
6620
6621 /* This will return false if it becomes dirty or someone else pins it. */
6622 result = InvalidateVictimBuffer(desc);
6623
6624 UnpinBuffer(desc);
6625
6626 return result;
6627}
6628
6629/*
6630 * Try to evict the current block in a shared buffer.
6631 *
6632 * This function is intended for testing/development use only!
6633 *
6634 * To succeed, the buffer must not be pinned on entry, so if the caller had a
6635 * particular block in mind, it might already have been replaced by some other
6636 * block by the time this function runs. It's also unpinned on return, so the
6637 * buffer might be occupied again by the time control is returned, potentially
6638 * even by the same block. This inherent raciness without other interlocking
6639 * makes the function unsuitable for non-testing usage.
6640 *
6641 * *buffer_flushed is set to true if the buffer was dirty and has been
6642 * flushed, false otherwise. However, *buffer_flushed=true does not
6643 * necessarily mean that we flushed the buffer, it could have been flushed by
6644 * someone else.
6645 *
6646 * Returns true if the buffer was valid and it has now been made invalid.
6647 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6648 * or if the buffer becomes dirty again while we're trying to write it out.
6649 */
6650bool
6651EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
6652{
6653 BufferDesc *desc;
6654
6656
6657 /* Make sure we can pin the buffer. */
6660
6661 desc = GetBufferDescriptor(buf - 1);
6662 LockBufHdr(desc);
6663
6664 return EvictUnpinnedBufferInternal(desc, buffer_flushed);
6665}
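
/*
 * Illustrative sketch (not part of bufmgr.c): how a testing helper (such as
 * pg_buffercache's eviction function) might call this. "buf" is assumed to
 * be a valid shared-buffer number; "example_evict" is hypothetical.
 */
static bool
example_evict(Buffer buf)
{
	bool		buffer_flushed;

	/* true only if the buffer was valid and is now invalidated */
	return EvictUnpinnedBuffer(buf, &buffer_flushed);
}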
6666
6667/*
6668 * Try to evict all the shared buffers.
6669 *
6670 * This function is intended for testing/development use only! See
6671 * EvictUnpinnedBuffer().
6672 *
6673 * The buffers_* parameters are mandatory and indicate the total count of
6674 * buffers that:
6675 * - buffers_evicted - were evicted
6676 * - buffers_flushed - were flushed
6677 * - buffers_skipped - could not be evicted
6678 */
6679void
6680EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
6681 int32 *buffers_skipped)
6682{
6683 *buffers_evicted = 0;
6684 *buffers_skipped = 0;
6685 *buffers_flushed = 0;
6686
6687 for (int buf = 1; buf <= NBuffers; buf++)
6688 {
6689 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6690 uint32 buf_state;
6691 bool buffer_flushed;
6692
6693 buf_state = pg_atomic_read_u32(&desc->state);
6694 if (!(buf_state & BM_VALID))
6695 continue;
6696
6699
6700 LockBufHdr(desc);
6701
6702 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6703 (*buffers_evicted)++;
6704 else
6705 (*buffers_skipped)++;
6706
6707 if (buffer_flushed)
6708 (*buffers_flushed)++;
6709 }
6710}
6711
6712/*
6713 * Try to evict all the shared buffers containing provided relation's pages.
6714 *
6715 * This function is intended for testing/development use only! See
6716 * EvictUnpinnedBuffer().
6717 *
6718 * The caller must hold at least AccessShareLock on the relation to prevent
6719 * the relation from being dropped.
6720 *
6721 * The buffers_* parameters are mandatory and indicate the total count of
6722 * buffers that:
6723 * - buffers_evicted - were evicted
6724 * - buffers_flushed - were flushed
6725 * - buffers_skipped - could not be evicted
6726 */
6727void
6729 int32 *buffers_flushed, int32 *buffers_skipped)
6730{
6732
6733 *buffers_skipped = 0;
6734 *buffers_evicted = 0;
6735 *buffers_flushed = 0;
6736
6737 for (int buf = 1; buf <= NBuffers; buf++)
6738 {
6739 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6740 uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6741 bool buffer_flushed;
6742
6743 /* An unlocked precheck should be safe and saves some cycles. */
6744 if ((buf_state & BM_VALID) == 0 ||
6746 continue;
6747
6748 /* Make sure we can pin the buffer. */
6751
6752 buf_state = LockBufHdr(desc);
6753
6754 /* recheck, could have changed without the lock */
6755 if ((buf_state & BM_VALID) == 0 ||
6757 {
6758 UnlockBufHdr(desc, buf_state);
6759 continue;
6760 }
6761
6762 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6763 (*buffers_evicted)++;
6764 else
6765 (*buffers_skipped)++;
6766
6767 if (buffer_flushed)
6768 (*buffers_flushed)++;
6769 }
6770}
6771
6772/*
6773 * Generic implementation of the AIO handle staging callback for readv/writev
6774 * on local/shared buffers.
6775 *
6776 * Each readv/writev can target multiple buffers. The buffers have already
6777 * been registered with the IO handle.
6778 *
6779 * To make the IO ready for execution ("staging"), we need to ensure that the
6780 * targeted buffers are in an appropriate state while the IO is ongoing. For
6781 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
6782 * in this backend could lead to this backend's buffer pin being released as
6783 * part of error handling, which in turn could lead to the buffer being
6784 * replaced while IO is ongoing.
6785 */
6787buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
6788{
6789 uint64 *io_data;
6790 uint8 handle_data_len;
6791 PgAioWaitRef io_ref;
6793
6794 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
6795
6796 pgaio_io_get_wref(ioh, &io_ref);
6797
6798 /* iterate over all buffers affected by the vectored readv/writev */
6799 for (int i = 0; i < handle_data_len; i++)
6800 {
6801 Buffer buffer = (Buffer) io_data[i];
6802 BufferDesc *buf_hdr = is_temp ?
6805 uint32 buf_state;
6806
6807 /*
6808 * Check that all the buffers are actually ones that could conceivably
6809 * be done in one IO, i.e. are sequential. This is the last
6810 * buffer-aware code before IO is actually executed and confusion
6811 * about which buffers are targeted by IO can be hard to debug, making
6812 * it worth doing extra-paranoid checks.
6813 */
6814 if (i == 0)
6815 first = buf_hdr->tag;
6816 else
6817 {
6818 Assert(buf_hdr->tag.relNumber == first.relNumber);
6819 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
6820 }
6821
6822 if (is_temp)
6823 buf_state = pg_atomic_read_u32(&buf_hdr->state);
6824 else
6825 buf_state = LockBufHdr(buf_hdr);
6826
6827 /* verify the buffer is in the expected state */
6828 Assert(buf_state & BM_TAG_VALID);
6829 if (is_write)
6830 {
6831 Assert(buf_state & BM_VALID);
6832 Assert(buf_state & BM_DIRTY);
6833 }
6834 else
6835 {
6836 Assert(!(buf_state & BM_VALID));
6837 Assert(!(buf_state & BM_DIRTY));
6838 }
6839
6840 /* temp buffers don't use BM_IO_IN_PROGRESS */
6841 if (!is_temp)
6842 Assert(buf_state & BM_IO_IN_PROGRESS);
6843
6844 Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
6845
6846 /*
6847 * Reflect that the buffer is now owned by the AIO subsystem.
6848 *
6849 * For local buffers: This can't be done just via LocalRefCount, as
6850 * one might initially think, as this backend could error out while
6851 * AIO is still in progress, releasing all the pins by the backend
6852 * itself.
6853 *
6854 * This pin is released again in TerminateBufferIO().
6855 */
6856 buf_state += BUF_REFCOUNT_ONE;
6857 buf_hdr->io_wref = io_ref;
6858
6859 if (is_temp)
6860 pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
6861 else
6862 UnlockBufHdr(buf_hdr, buf_state);
6863
6864 /*
6865 * Ensure the content lock that prevents buffer modifications while
6866 * the buffer is being written out is not released early due to an
6867 * error.
6868 */
6869 if (is_write && !is_temp)
6870 {
6871 LWLock *content_lock;
6872
6873 content_lock = BufferDescriptorGetContentLock(buf_hdr);
6874
6875 Assert(LWLockHeldByMe(content_lock));
6876
6877 /*
6878 * Lock is now owned by AIO subsystem.
6879 */
6880 LWLockDisown(content_lock);
6881 }
6882
6883 /*
6884 * Stop tracking this buffer via the resowner - the AIO system now
6885 * keeps track.
6886 */
6887 if (!is_temp)
6889 }
6890}
6891
6892/*
6893 * Decode readv errors as encoded by buffer_readv_encode_error().
6894 */
6895static inline void
6897 bool *zeroed_any,
6898 bool *ignored_any,
6899 uint8 *zeroed_or_error_count,
6900 uint8 *checkfail_count,
6901 uint8 *first_off)
6902{
6903 uint32 rem_error = result.error_data;
6904
6905 /* see static asserts in buffer_readv_encode_error */
6906#define READV_COUNT_BITS 7
6907#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
6908
6909 *zeroed_any = rem_error & 1;
6910 rem_error >>= 1;
6911
6912 *ignored_any = rem_error & 1;
6913 rem_error >>= 1;
6914
6915 *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
6916 rem_error >>= READV_COUNT_BITS;
6917
6918 *checkfail_count = rem_error & READV_COUNT_MASK;
6919 rem_error >>= READV_COUNT_BITS;
6920
6921 *first_off = rem_error & READV_COUNT_MASK;
6922 rem_error >>= READV_COUNT_BITS;
6923}
6924
6925/*
6926 * Helper to encode errors for buffer_readv_complete()
6927 *
6928 * Errors are encoded as follows:
6929 * - bit 0 indicates whether any page was zeroed (1) or not (0)
6930 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
6931 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
6932 * - next READV_COUNT_BITS bits indicate the number of checksum failures
6933 * - next READV_COUNT_BITS bits indicate the first offset of the first page
6934 * that was errored or zeroed or, if no errors/zeroes, the first ignored
6935 * checksum
6936 */
6937static inline void
6939 bool is_temp,
6940 bool zeroed_any,
6941 bool ignored_any,
6942 uint8 error_count,
6943 uint8 zeroed_count,
6944 uint8 checkfail_count,
6945 uint8 first_error_off,
6946 uint8 first_zeroed_off,
6947 uint8 first_ignored_off)
6948{
6949
6950 uint8 shift = 0;
6951 uint8 zeroed_or_error_count =
6952 error_count > 0 ? error_count : zeroed_count;
6953 uint8 first_off;
6954
6956 "PG_IOV_MAX is bigger than reserved space for error data");
6958 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
6959
6960 /*
6961 * We only have space to encode one offset - but luckily that's good
6962 * enough: if there is an error, the error's offset is the interesting one;
6963 * the same holds for a zeroed buffer vs. an ignored buffer.
6964 */
6965 if (error_count > 0)
6966 first_off = first_error_off;
6967 else if (zeroed_count > 0)
6968 first_off = first_zeroed_off;
6969 else
6970 first_off = first_ignored_off;
6971
6972 Assert(!zeroed_any || error_count == 0);
6973
6974 result->error_data = 0;
6975
6976 result->error_data |= zeroed_any << shift;
6977 shift += 1;
6978
6979 result->error_data |= ignored_any << shift;
6980 shift += 1;
6981
6982 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
6983 shift += READV_COUNT_BITS;
6984
6985 result->error_data |= ((uint32) checkfail_count) << shift;
6986 shift += READV_COUNT_BITS;
6987
6988 result->error_data |= ((uint32) first_off) << shift;
6989 shift += READV_COUNT_BITS;
6990
6991 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
6993
6994 if (error_count > 0)
6995 result->status = PGAIO_RS_ERROR;
6996 else
6997 result->status = PGAIO_RS_WARNING;
6998
6999 /*
7000 * The encoding is complicated enough to warrant cross-checking it against
7001 * the decode function.
7002 */
7003#ifdef USE_ASSERT_CHECKING
7004 {
7005 bool zeroed_any_2,
7006 ignored_any_2;
7007 uint8 zeroed_or_error_count_2,
7008 checkfail_count_2,
7009 first_off_2;
7010
7012 &zeroed_any_2, &ignored_any_2,
7013 &zeroed_or_error_count_2,
7014 &checkfail_count_2,
7015 &first_off_2);
7016 Assert(zeroed_any == zeroed_any_2);
7017 Assert(ignored_any == ignored_any_2);
7018 Assert(zeroed_or_error_count == zeroed_or_error_count_2);
7019 Assert(checkfail_count == checkfail_count_2);
7020 Assert(first_off == first_off_2);
7021 }
7022#endif
7023
7024#undef READV_COUNT_BITS
7025#undef READV_COUNT_MASK
7026}
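
/*
 * Illustrative worked example (not part of bufmgr.c): with READV_COUNT_BITS
 * being 7, a read that zeroed two pages, ignored no checksum failures, and
 * whose first zeroed page sat at offset 3 would be encoded as
 *
 *   bit 0        zeroed_any            = 1
 *   bit 1        ignored_any           = 0
 *   bits 2..8    zeroed_or_error_count = 2
 *   bits 9..15   checkfail_count       = 0
 *   bits 16..22  first_off             = 3
 *
 * i.e. error_data = 1 | (2 << 2) | (3 << 16) = 0x30009, which
 * buffer_readv_decode_error() unpacks back into the same five fields.
 */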
7027
7028/*
7029 * Helper for AIO readv completion callbacks, supporting both shared and temp
7030 * buffers. Gets called once for each buffer in a multi-page read.
7031 */
7034 uint8 flags, bool failed, bool is_temp,
7035 bool *buffer_invalid,
7036 bool *failed_checksum,
7037 bool *ignored_checksum,
7038 bool *zeroed_buffer)
7039{
7040 BufferDesc *buf_hdr = is_temp ?
7043 BufferTag tag = buf_hdr->tag;
7044 char *bufdata = BufferGetBlock(buffer);
7045 uint32 set_flag_bits;
7046 int piv_flags;
7047
7048 /* check that the buffer is in the expected state for a read */
7049#ifdef USE_ASSERT_CHECKING
7050 {
7051 uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
7052
7053 Assert(buf_state & BM_TAG_VALID);
7054 Assert(!(buf_state & BM_VALID));
7055 /* temp buffers don't use BM_IO_IN_PROGRESS */
7056 if (!is_temp)
7057 Assert(buf_state & BM_IO_IN_PROGRESS);
7058 Assert(!(buf_state & BM_DIRTY));
7059 }
7060#endif
7061
7062 *buffer_invalid = false;
7063 *failed_checksum = false;
7064 *ignored_checksum = false;
7065 *zeroed_buffer = false;
7066
7067 /*
7068 * We ask PageIsVerified() to only log the message about checksum errors,
7069 * as the completion might be run in any backend (or an IO worker). We will
7070 * report checksum errors in buffer_readv_report().
7071 */
7072 piv_flags = PIV_LOG_LOG;
7073
7074 /* the local zero_damaged_pages may differ from the definer's */
7076 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
7077
7078 /* Check for garbage data. */
7079 if (!failed)
7080 {
7081 /*
7082 * If the buffer is not currently pinned by this backend, e.g. because
7083 * we're completing this IO after an error, the buffer data will have
7084 * been marked as inaccessible when the buffer was unpinned. The AIO
7085 * subsystem holds a pin, but that doesn't prevent the buffer from
7086 * having been marked as inaccessible. The completion might also be
7087 * executed in a different process.
7088 */
7089#ifdef USE_VALGRIND
7090 if (!BufferIsPinned(buffer))
7091 VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
7092#endif
7093
7094 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
7095 failed_checksum))
7096 {
7097 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
7098 {
7099 memset(bufdata, 0, BLCKSZ);
7100 *zeroed_buffer = true;
7101 }
7102 else
7103 {
7104 *buffer_invalid = true;
7105 /* mark buffer as having failed */
7106 failed = true;
7107 }
7108 }
7109 else if (*failed_checksum)
7110 *ignored_checksum = true;
7111
7112 /* undo what we did above */
7113#ifdef USE_VALGRIND
7114 if (!BufferIsPinned(buffer))
7115 VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
7116#endif
7117
7118 /*
7119 * Immediately log a message about the invalid page, but only to the
7120 * server log. One reason to do so immediately is that this may be
7121 * executed in a different backend than the one that originated the
7122 * request. Another reason is that the originator might not process
7123 * the query result immediately (because it is busy doing another part
7124 * of query processing) or at all (e.g. if it was cancelled or errored
7125 * out due to another IO also failing). The definer of the IO will
7126 * emit an ERROR or WARNING when processing
7127 * the IO's results.
7128 *
7129 * To avoid duplicating the code to emit these log messages, we reuse
7130 * buffer_readv_report().
7131 */
7132 if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
7133 {
7134 PgAioResult result_one = {0};
7135
7136 buffer_readv_encode_error(&result_one, is_temp,
7137 *zeroed_buffer,
7138 *ignored_checksum,
7139 *buffer_invalid,
7140 *zeroed_buffer ? 1 : 0,
7141 *failed_checksum ? 1 : 0,
7142 buf_off, buf_off, buf_off);
7143 pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
7144 }
7145 }
7146
7147 /* Terminate I/O and set BM_VALID. */
7148 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
7149 if (is_temp)
7150 TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
7151 else
7152 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
7153
7154 /*
7155 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
7156 * callback may not be executed in the same backend that called
7157 * BUFFER_READ_START. The alternative would be to defer calling the
7158 * tracepoint to a later point (e.g. the local completion callback for
7159 * shared buffer reads), which seems even less helpful.
7160 */
7161 TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
7162 tag.blockNum,
7163 tag.spcOid,
7164 tag.dbOid,
7165 tag.relNumber,
7167 false);
7168}
7169
7170/*
7171 * Perform completion handling of a single AIO read. This read may cover
7172 * multiple blocks / buffers.
7173 *
7174 * Shared between shared and local buffers, to reduce code duplication.
7175 */
7178 uint8 cb_data, bool is_temp)
7179{
7180 PgAioResult result = prior_result;
7182 uint8 first_error_off = 0;
7183 uint8 first_zeroed_off = 0;
7184 uint8 first_ignored_off = 0;
7185 uint8 error_count = 0;
7186 uint8 zeroed_count = 0;
7187 uint8 ignored_count = 0;
7188 uint8 checkfail_count = 0;
7189 uint64 *io_data;
7190 uint8 handle_data_len;
7191
7192 if (is_temp)
7193 {
7194 Assert(td->smgr.is_temp);
7196 }
7197 else
7198 Assert(!td->smgr.is_temp);
7199
7200 /*
7201 * Iterate over all the buffers affected by this IO and call the
7202 * per-buffer completion function for each buffer.
7203 */
7204 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7205 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
7206 {
7207 Buffer buf = io_data[buf_off];
7208 bool failed;
7209 bool failed_verification = false;
7210 bool failed_checksum = false;
7211 bool zeroed_buffer = false;
7212 bool ignored_checksum = false;
7213
7215
7216 /*
7217 * If the entire I/O failed at a lower level, each buffer needs to be
7218 * marked as failed. In case of a partial read, the first few buffers
7219 * may be ok.
7220 */
7221 failed =
7222 prior_result.status == PGAIO_RS_ERROR
7223 || prior_result.result <= buf_off;
7224
7225 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
7226 &failed_verification,
7227 &failed_checksum,
7228 &ignored_checksum,
7229 &zeroed_buffer);
7230
7231 /*
7232 * Track information about the number of different kinds of error
7233 * conditions across all pages, as there can be multiple pages failing
7234 * verification as part of one IO.
7235 */
7236 if (failed_verification && !zeroed_buffer && error_count++ == 0)
7237 first_error_off = buf_off;
7238 if (zeroed_buffer && zeroed_count++ == 0)
7239 first_zeroed_off = buf_off;
7240 if (ignored_checksum && ignored_count++ == 0)
7241 first_ignored_off = buf_off;
7242 if (failed_checksum)
7243 checkfail_count++;
7244 }
7245
7246 /*
7247 * If the smgr read succeeded [partially] and page verification failed for
7248 * some of the pages, adjust the IO's result state appropriately.
7249 */
7250 if (prior_result.status != PGAIO_RS_ERROR &&
7251 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
7252 {
7253 buffer_readv_encode_error(&result, is_temp,
7254 zeroed_count > 0, ignored_count > 0,
7255 error_count, zeroed_count, checkfail_count,
7256 first_error_off, first_zeroed_off,
7257 first_ignored_off);
7258 pgaio_result_report(result, td, DEBUG1);
7259 }
7260
7261 /*
7262 * For shared relations this reporting is done in
7263 * shared_buffer_readv_complete_local().
7264 */
7265 if (is_temp && checkfail_count > 0)
7267 checkfail_count);
7268
7269 return result;
7270}
7271
7272/*
7273 * AIO error reporting callback for aio_shared_buffer_readv_cb and
7274 * aio_local_buffer_readv_cb.
7275 *
7276 * The error is encoded / decoded in buffer_readv_encode_error() /
7277 * buffer_readv_decode_error().
7278 */
7279static void
7281 int elevel)
7282{
7283 int nblocks = td->smgr.nblocks;
7284 BlockNumber first = td->smgr.blockNum;
7285 BlockNumber last = first + nblocks - 1;
7286 ProcNumber errProc =
7288 RelPathStr rpath =
7289 relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
7290 bool zeroed_any,
7291 ignored_any;
7292 uint8 zeroed_or_error_count,
7293 checkfail_count,
7294 first_off;
7295 uint8 affected_count;
7296 const char *msg_one,
7297 *msg_mult,
7298 *det_mult,
7299 *hint_mult;
7300
7301 buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
7302 &zeroed_or_error_count,
7303 &checkfail_count,
7304 &first_off);
7305
7306 /*
7307 * Treat a read that had both zeroed buffers *and* ignored checksums as a
7308 * special case; it's too irregular to be emitted the same way as the
7309 * other cases.
7310 */
7311 if (zeroed_any && ignored_any)
7312 {
7313 Assert(zeroed_any && ignored_any);
7314 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
7315 Assert(result.status != PGAIO_RS_ERROR);
7316 affected_count = zeroed_or_error_count;
7317
7318 ereport(elevel,
7320 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation %s",
7321 affected_count, checkfail_count, first, last, rpath.str),
7322 affected_count > 1 ?
7323 errdetail("Block %u held first zeroed page.",
7324 first + first_off) : 0,
7325 errhint("See server log for details about the other %u invalid block(s).",
7326 affected_count + checkfail_count - 1));
7327 return;
7328 }
7329
7330 /*
7331 * The other messages are highly repetitive. To avoid duplicating a long
7332 * and complicated ereport(), gather the translated format strings
7333 * separately and then do one common ereport.
7334 */
7335 if (result.status == PGAIO_RS_ERROR)
7336 {
7337 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
7338 affected_count = zeroed_or_error_count;
7339 msg_one = _("invalid page in block %u of relation %s");
7340 msg_mult = _("%u invalid pages among blocks %u..%u of relation %s");
7341 det_mult = _("Block %u held first invalid page.");
7342 hint_mult = _("See server log for the other %u invalid block(s).");
7343 }
7344 else if (zeroed_any && !ignored_any)
7345 {
7346 affected_count = zeroed_or_error_count;
7347 msg_one = _("invalid page in block %u of relation %s; zeroing out page");
7348 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation %s");
7349 det_mult = _("Block %u held first zeroed page.");
7350 hint_mult = _("See server log for the other %u zeroed block(s).");
7351 }
7352 else if (!zeroed_any && ignored_any)
7353 {
7354 affected_count = checkfail_count;
7355 msg_one = _("ignoring checksum failure in block %u of relation %s");
7356 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation %s");
7357 det_mult = _("Block %u held first ignored page.");
7358 hint_mult = _("See server log for the other %u ignored block(s).");
7359 }
7360 else
7362
7363 ereport(elevel,
7365 affected_count == 1 ?
7366 errmsg_internal(msg_one, first + first_off, rpath.str) :
7367 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
7368 affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
7369 affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
7370}
7371
7372static void
7374{
7375 buffer_stage_common(ioh, false, false);
7376}
7377
7378static PgAioResult
7380 uint8 cb_data)
7381{
7382 return buffer_readv_complete(ioh, prior_result, cb_data, false);
7383}
7384
7385/*
7386 * We need a backend-local completion callback for shared buffers, to be able
7387 * to report checksum errors correctly. Unfortunately that can only safely
7388 * happen if the reporting backend has previously called
7389 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
7390 * the backend that started the IO. Hence this callback.
7391 */
7392static PgAioResult
7394 uint8 cb_data)
7395{
7396 bool zeroed_any,
7397 ignored_any;
7398 uint8 zeroed_or_error_count,
7399 checkfail_count,
7400 first_off;
7401
7402 if (prior_result.status == PGAIO_RS_OK)
7403 return prior_result;
7404
7405 buffer_readv_decode_error(prior_result,
7406 &zeroed_any,
7407 &ignored_any,
7408 &zeroed_or_error_count,
7409 &checkfail_count,
7410 &first_off);
7411
7412 if (checkfail_count)
7413 {
7415
7417 checkfail_count);
7418 }
7419
7420 return prior_result;
7421}
7422
7423static void
7425{
7426 buffer_stage_common(ioh, false, true);
7427}
7428
7429static PgAioResult
7431 uint8 cb_data)
7432{
7433 return buffer_readv_complete(ioh, prior_result, cb_data, true);
7434}
7435
7436/* readv callback is passed READ_BUFFERS_* flags as callback data */
7439 .complete_shared = shared_buffer_readv_complete,
7440 /* need a local callback to report checksum failures */
7441 .complete_local = shared_buffer_readv_complete_local,
7442 .report = buffer_readv_report,
7443};
7444
7445/* readv callback is passed READ_BUFFERS_* flags as callback data */
7448
7449 /*
7450 * Note that this, in contrast to the shared_buffers case, uses
7451 * complete_local, as only the issuing backend has access to the required
7452 * data structures. This is important in case the IO completion is
7453 * consumed incidentally by another backend.
7454 */
7455 .complete_local = local_buffer_readv_complete,
7456 .report = buffer_readv_report,
7457};
int io_method
Definition: aio.c:77
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:889
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:173
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:882
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:354
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:318
bool pgaio_have_staged(void)
Definition: aio.c:1020
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:923
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:343
void pgaio_submit_staged(void)
Definition: aio.c:1036
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:909
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:242
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:199
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition: aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition: aio.h:198
@ IOMETHOD_SYNC
Definition: aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition: aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
Definition: aio_callback.c:140
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
Definition: aio_callback.c:156
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:173
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:73
#define PGAIO_RESULT_ERROR_BITS
Definition: aio_types.h:98
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_OK
Definition: aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:349
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:410
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:295
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:239
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
int BgWriterDelay
Definition: bgwriter.c:58
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
#define binaryheap_empty(h)
Definition: binaryheap.h:65
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
#define MaxBlockNumber
Definition: block.h:35
static int32 next
Definition: blutils.c:224
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
#define BufferIsLocal(buffer)
Definition: buf.h:37
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
BufferDescPadded * BufferDescriptors
Definition: buf_init.c:21
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:86
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
Definition: buf_internals.h:71
#define BM_PERMANENT
Definition: buf_internals.h:77
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:53
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:51
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
Definition: buf_internals.h:56
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:75
#define BM_DIRTY
Definition: buf_internals.h:69
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
Definition: buf_internals.h:68
#define BM_JUST_DIRTIED
Definition: buf_internals.h:74
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:60
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:72
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:54
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:59
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:70
#define BM_IO_ERROR
Definition: buf_internals.h:73
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:76
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
bool track_io_timing
Definition: bufmgr.c:147
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5654
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition: bufmgr.c:5033
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5405
void DropDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:4895
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition: bufmgr.c:6347
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition: bufmgr.c:7177
bool BufferIsExclusiveLocked(Buffer buffer)
Definition: bufmgr.c:2891
const ResourceOwnerDesc buffer_pin_resowner_desc
Definition: bufmgr.c:244
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4231
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:325
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition: bufmgr.c:1569
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: bufmgr.c:4540
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:3014
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7393
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition: bufmgr.c:1262
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition: bufmgr.c:1532
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:651
static uint32 PrivateRefCountClock
Definition: bufmgr.c:218
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4291
static void ResOwnerReleaseBufferIO(Datum res)
Definition: bufmgr.c:6546
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7430
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition: bufmgr.c:1494
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6680
int io_max_combine_limit
Definition: bufmgr.c:172
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:3072
const ResourceOwnerDesc buffer_io_resowner_desc
Definition: bufmgr.c:235
bool zero_damaged_pages
Definition: bufmgr.c:144
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:91
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:3183
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6728
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition: bufmgr.c:7033
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:6289
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition: bufmgr.c:6313
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:5913
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:73
static char * ResOwnerPrintBufferIO(Datum res)
Definition: bufmgr.c:6554
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:858
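
A minimal sketch of extending a relation by one page with the entry point above; append_new_page() is a hypothetical helper, and the caller is assumed to take care of WAL, MarkBufferDirty(), and releasing the buffer.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Hypothetical helper: add one zero-filled page to the main fork.
 * EB_LOCK_FIRST hands back the new buffer already exclusive-locked, so
 * it can be initialized before any other backend can see it.
 */
static Buffer
append_new_page(Relation rel)
{
    Buffer      buf;

    buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM,
                            NULL,       /* no access strategy */
                            EB_LOCK_FIRST);

    PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);

    /* caller handles MarkBufferDirty(), WAL, and UnlockReleaseBuffer() */
    return buf;
}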
void AtEOXact_Buffers(bool isCommit)
Definition: bufmgr.c:3996
static void AbortBufferIO(Buffer buffer)
Definition: bufmgr.c:6161
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition: bufmgr.c:7437
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:890
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1193
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition: bufmgr.c:1598
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition: bufmgr.c:1031
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:2005
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:4065
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition: bufmgr.c:1556
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition: bufmgr.c:5247
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition: bufmgr.c:4663
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:6232
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition: bufmgr.c:922
struct SMgrSortArray SMgrSortArray
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition: bufmgr.c:7446
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:2282
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:4047
int io_combine_limit_guc
Definition: bufmgr.c:171
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:6378
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: bufmgr.c:4252
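
As a small illustration of the tag accessor above, the sketch below logs which page a pinned buffer currently holds; log_buffer_identity() is a hypothetical helper, not part of this file.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/relfilelocator.h"

/*
 * Hypothetical helper: report the identity of the page held by a pinned
 * buffer.  BufferGetTag() fills in the relation locator, fork and block.
 */
static void
log_buffer_identity(Buffer buf)
{
    RelFileLocator rlocator;
    ForkNumber  forknum;
    BlockNumber blkno;

    BufferGetTag(buf, &rlocator, &forknum, &blkno);
    elog(DEBUG1, "buffer %d holds block %u of fork %d of relfilenumber %u",
         buf, blkno, forknum, rlocator.relNumber);
}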
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:72
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition: bufmgr.c:6787
#define BUF_REUSABLE
Definition: bufmgr.c:81
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6216
static void BufferSync(int flags)
Definition: bufmgr.c:3349
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition: bufmgr.c:1769
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7424
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:4174
static char * ResOwnerPrintBufferPin(Datum res)
Definition: bufmgr.c:6577
void CheckPointBuffers(int flags)
Definition: bufmgr.c:4217
bool BufferIsDirty(Buffer buffer)
Definition: bufmgr.c:2919
static uint32 MaxProportionalPins
Definition: bufmgr.c:221
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2610
bool BgBufferSync(WritebackContext *wb_context)
Definition: bufmgr.c:3625
static void WakePinCountWaiter(BufferDesc *buf)
Definition: bufmgr.c:3229
bool BufferIsPermanent(Buffer buffer)
Definition: bufmgr.c:4463
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:100
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7373
void UnlockBuffers(void)
Definition: bufmgr.c:5579
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:561
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7379
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:2350
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5633
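
A minimal sketch of the non-blocking locking pattern built on the entry point above; try_process_page() is a hypothetical helper, and the buffer is assumed to be already pinned by the caller.

#include "postgres.h"

#include "storage/bufmgr.h"

/*
 * Hypothetical helper: process a pinned page only if its content lock can
 * be taken without waiting; otherwise tell the caller to retry later.
 */
static bool
try_process_page(Buffer buf)
{
    if (!ConditionalLockBuffer(buf))
        return false;           /* contended; come back later */

    /* ... work on BufferGetPage(buf) would go here ... */

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);    /* keep the pin, drop the lock */
    return true;
}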
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition: bufmgr.c:4431
int bgwriter_flush_after
Definition: bufmgr.c:179
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5373
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4834
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4493
bool HoldingBufferPinThatDelaysRecovery(void)
Definition: bufmgr.c:5829
int checkpoint_flush_after
Definition: bufmgr.c:178
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5390
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1110
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
Definition: bufmgr.c:6102
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition: bufmgr.c:3273
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6200
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:6413
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition: bufmgr.c:1637
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:6401
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2952
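
A minimal sketch of the usual discipline around the entry point above: exclusive content lock, then a critical section covering the page change and MarkBufferDirty(). reinit_page() is a hypothetical helper; for a permanent (WAL-logged) relation the critical section would also have to emit a WAL record, which is omitted here.

#include "postgres.h"

#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Hypothetical helper: reinitialize one page of an unlogged or temp
 * relation.  A permanent relation would also need WAL between
 * MarkBufferDirty() and END_CRIT_SECTION().
 */
static void
reinit_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);     /* content changes need exclusive lock */

    START_CRIT_SECTION();
    PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
    MarkBufferDirty(buf);
    END_CRIT_SECTION();

    UnlockReleaseBuffer(buf);
}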
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:483
double bgwriter_lru_multiplier
Definition: bufmgr.c:146
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition: bufmgr.c:6587
int backend_flush_after
Definition: bufmgr.c:180
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2548
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition: bufmgr.c:7280
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:259
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:183
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:425
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2566
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5687
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5607
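
The lock modes taken by the entry point above are often switched while a pin is held. The sketch below shows a share-to-exclusive upgrade; upgrade_lock() is a hypothetical helper, and because the lock is briefly released the caller must recheck any page state it examined under the share lock.

#include "postgres.h"

#include "storage/bufmgr.h"

/*
 * Hypothetical helper: trade a share content lock for an exclusive one.
 * The pin is kept throughout, but the page may change in the window
 * between the two LockBuffer() calls.
 */
static void
upgrade_lock(Buffer buf)
{
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);        /* drop the share lock */
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);     /* reacquire exclusively */
    /* ... recheck whatever was decided under the share lock ... */
}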
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:219
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5437
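
A minimal sketch of recording a hint with the entry point above; remember_page_is_full() is a hypothetical helper that uses the PD_PAGE_FULL flag from bufpage.h as the example hint, and the caller is assumed to hold a pin plus a content lock.

#include "postgres.h"

#include "storage/bufmgr.h"
#include "storage/bufpage.h"

/*
 * Hypothetical helper: set an advisory page-header flag.  Hints are not
 * WAL-logged in their own right; MarkBufferDirtyHint() makes sure the
 * change reaches disk eventually (and, when checksums or wal_log_hints
 * require it, that a full-page image is logged first).
 */
static void
remember_page_is_full(Buffer buf)
{
    Page        page = BufferGetPage(buf);

    if (!PageIsFull(page))
    {
        PageSetFull(page);
        MarkBufferDirtyHint(buf, true);     /* true: standard page layout */
    }
}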
void FlushRelationBuffers(Relation rel)
Definition: bufmgr.c:4943
#define READV_COUNT_BITS
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:6463
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition: bufmgr.c:448
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition: bufmgr.c:6651
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:842
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition: bufmgr.c:682
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:83
int maintenance_io_concurrency
Definition: bufmgr.c:162
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:3264
void FlushDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:5311
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:2183
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:5133
int effective_io_concurrency
Definition: bufmgr.c:155
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:351
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:6045
struct PrivateRefCountEntry PrivateRefCountEntry
struct CkptTsStatus CkptTsStatus
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition: bufmgr.c:1513
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:805
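
A minimal sketch of a whole-fork scan using the entry point above together with a bulk-read strategy; count_new_pages() is a hypothetical helper, and checking PageIsNew() stands in for whatever per-page work a real caller would do.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Hypothetical helper: scan the main fork with a BAS_BULKREAD strategy so
 * the scan recycles a small ring of buffers instead of flooding the whole
 * buffer pool.
 */
static BlockNumber
count_new_pages(Relation rel)
{
    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM);
    BlockNumber nnew = 0;
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                             RBM_NORMAL, strategy);

        LockBuffer(buf, BUFFER_LOCK_SHARE);
        if (PageIsNew(BufferGetPage(buf)))
            nnew++;
        UnlockReleaseBuffer(buf);
    }

    FreeAccessStrategy(strategy);
    return nnew;
}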
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:6259
static void ResOwnerReleaseBufferPin(Datum res)
Definition: bufmgr.c:6562
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:215
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition: bufmgr.c:6896
#define READV_COUNT_MASK
int io_combine_limit
Definition: bufmgr.c:170
void InitBufferManagerAccess(void)
Definition: bufmgr.c:4013
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition: bufmgr.c:6938
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3923
uint32 GetAdditionalPinLimit(void)
Definition: bufmgr.c:2522
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:758
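
A minimal sketch of the pin/lock/inspect/release cycle around the entry point above; page_is_empty() is a hypothetical helper, "rel" and "blkno" are assumed caller-supplied, and PageIsNew() stands in for real page inspection.

#include "postgres.h"

#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Hypothetical helper: read one page, inspect it under a share lock,
 * then unlock and unpin in one step.
 */
static bool
page_is_empty(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);       /* pins the page */
    bool        result;

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    result = PageIsNew(BufferGetPage(buf));
    UnlockReleaseBuffer(buf);                       /* lock and pin released */

    return result;
}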
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:216
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:217
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5855
int bgwriter_lru_maxpages
Definition: bufmgr.c:145
uint32 GetPinLimit(void)
Definition: bufmgr.c:2510
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5966
#define BUF_WRITTEN
Definition: bufmgr.c:80
void FlushOneBuffer(Buffer buffer)
Definition: bufmgr.c:5353
@ BAS_BULKREAD
Definition: bufmgr.h:37
@ BAS_BULKWRITE
Definition: bufmgr.h:39
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:196
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:197
#define P_NEW
Definition: bufmgr.h:191
#define READ_BUFFERS_ZERO_ON_ERROR
Definition: bufmgr.h:112
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:417
#define DEFAULT_IO_COMBINE_LIMIT
Definition: bufmgr.h:167
static Block BufferGetBlock(Buffer buffer)
Definition: bufmgr.h:384
#define READ_BUFFERS_ISSUE_ADVICE
Definition: bufmgr.h:114
#define MAX_IO_COMBINE_LIMIT
Definition: bufmgr.h:166
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition: bufmgr.h:161
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition: bufmgr.h:116
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition: bufmgr.h:162
void * Block
Definition: bufmgr.h:26
@ EB_LOCK_TARGET
Definition: bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:75
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition: bufmgr.h:118
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:198
ReadBufferMode
Definition: bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:47
@ RBM_NORMAL
Definition: bufmgr.h:46
#define BMR_REL(p_rel)
Definition: bufmgr.h:108
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:368
bool ignore_checksum_failure
Definition: bufpage.c:27
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1509
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition: bufpage.c:94
#define PIV_LOG_LOG
Definition: bufpage.h:469
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
PageData * Page
Definition: bufpage.h:82
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition: bufpage.h:470
#define likely(x)
Definition: c.h:346
uint8_t uint8
Definition: c.h:500
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:224
#define Max(x, y)
Definition: c.h:969
double float8
Definition: c.h:601
#define pg_attribute_always_inline
Definition: c.h:270
int16_t int16
Definition: c.h:497
int32_t int32
Definition: c.h:498
uint64_t uint64
Definition: c.h:503
#define pg_unreachable()
Definition: c.h:332
#define unlikely(x)
Definition: c.h:347
uint32_t uint32
Definition: c.h:502
#define lengthof(array)
Definition: c.h:759
#define MemSet(start, val, len)
Definition: c.h:991
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:909
size_t Size
Definition: c.h:576
bool IsCatalogRelationOid(Oid relid)
Definition: catalog.c:121
bool IsCatalogTextUniqueIndexOid(Oid relid)
Definition: catalog.c:156
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:773
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int64 TimestampTz
Definition: timestamp.h:39
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:956
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1421
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1386
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1231
int errdetail(const char *fmt,...)
Definition: elog.c:1204
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint_internal(const char *fmt,...)
Definition: elog.c:1340
int errhint(const char *fmt,...)
Definition: elog.c:1318
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define _(x)
Definition: elog.c:91
#define errcontext
Definition: elog.h:197
#define DEBUG3
Definition: elog.h:28
#define LOG_SERVER_ONLY
Definition: elog.h:32
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:541
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:723
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:800
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:840
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:40
int NBuffers
Definition: globals.c:143
bool enableFsync
Definition: globals.c:130
ProcNumber MyProcNumber
Definition: globals.c:91
int VacuumCostPageMiss
Definition: globals.c:153
bool VacuumCostActive
Definition: globals.c:159
int VacuumCostBalance
Definition: globals.c:158
int MaxBackends
Definition: globals.c:147
int VacuumCostPageDirty
Definition: globals.c:154
int VacuumCostPageHit
Definition: globals.c:152
Assert(PointerIsAligned(start, uint64))
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
BufferUsage pgBufferUsage
Definition: instrument.c:20
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
int32 * LocalRefCount
Definition: localbuf.c:48
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition: localbuf.c:182
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:832
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition: localbuf.c:521
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:993
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:663
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:1004
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:796
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:489
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:693
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio)
Definition: localbuf.c:560
int NLocBuffer
Definition: localbuf.c:44
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:71
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:345
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:839
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:118
#define ExclusiveLock
Definition: lockdefs.h:42
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1985
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1182
void LWLockDisown(LWLock *lock)
Definition: lwlock.c:1891
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:2029
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1902
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1353
void ForEachLWLockHeldByMe(void(*callback)(LWLock *, LWLockMode, void *), void *context)
Definition: lwlock.c:1970
LWLockMode
Definition: lwlock.h:113
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:2170
void pfree(void *pointer)
Definition: mcxt.c:2150
void * palloc(Size size)
Definition: mcxt.c:1943
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
#define PG_IOV_MAX
Definition: pg_iovec.h:41
IOObject
Definition: pgstat.h:273
@ IOOBJECT_RELATION
Definition: pgstat.h:274
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:275
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:704
IOContext
Definition: pgstat.h:282
@ IOCONTEXT_NORMAL
Definition: pgstat.h:286
@ IOOP_EXTEND
Definition: pgstat.h:311
@ IOOP_READ
Definition: pgstat.h:312
@ IOOP_WRITEBACK
Definition: pgstat.h:308
@ IOOP_HIT
Definition: pgstat.h:306
@ IOOP_EVICT
Definition: pgstat.h:304
@ IOOP_REUSE
Definition: pgstat.h:307
@ IOOP_WRITE
Definition: pgstat.h:313
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:709
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:90
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:121
#define qsort(a, b, c, d)
Definition: port.h:479
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
uintptr_t Datum
Definition: postgres.h:69
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207
#define InvalidOid
Definition: postgres_ext.h:35
unsigned int Oid
Definition: postgres_ext.h:30
#define NUM_AUXILIARY_PROCS
Definition: proc.h:455
#define DELAY_CHKPT_START
Definition: proc.h:128
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:498
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:48
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:423
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:371
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:740
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:770
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1055
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
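
The read-stream entry points listed above are the usual way to drive StartReadBuffers()/WaitReadBuffers() for sequential block ranges. Below is a minimal sketch; read_stream_begin_relation() and the BlockRangeReadStreamPrivate callback state are assumed here to be declared in storage/read_stream.h alongside the symbols shown, and touch_all_blocks() is a hypothetical helper.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/read_stream.h"
#include "utils/rel.h"

/*
 * Hypothetical helper: pull every block of the main fork through the
 * buffer manager via a read stream, which combines and overlaps the
 * underlying reads, then immediately unpin each buffer.
 */
static void
touch_all_blocks(Relation rel)
{
    BlockRangeReadStreamPrivate p;
    ReadStream *stream;
    Buffer      buf;

    p.current_blocknum = 0;
    p.last_exclusive = RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM);

    stream = read_stream_begin_relation(READ_STREAM_FULL,
                                        NULL,       /* default strategy */
                                        rel,
                                        MAIN_FORKNUM,
                                        block_range_read_stream_cb,
                                        &p,
                                        0);         /* no per-buffer data */

    while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
        ReleaseBuffer(buf);

    read_stream_end(stream);
}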
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:578
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:648
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:669
#define RelationIsValid(relation)
Definition: rel.h:489
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
#define MAX_FORKNUM
Definition: relpath.h:70
#define relpath(rlocator, forknum)
Definition: relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:452
#define RELEASE_PRIO_BUFFER_IOS
Definition: resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition: resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition: resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:751
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:131
void ProcSendSignal(ProcNumber procNumber)
Definition: proc.c:1987
PGPROC * MyProc
Definition: proc.c:67
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:767
int DeadlockTimeout
Definition: proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:755
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1975
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:793
bool log_recovery_conflict_waits
Definition: standby.c:42
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:187
int wait_backend_pgprocno
BufferTag tag
pg_atomic_uint32 state
PgAioWaitRef io_wref
struct SMgrRelationData * smgr
Definition: bufmgr.h:104
int64 shared_blks_dirtied
Definition: instrument.h:28
int64 local_blks_hit
Definition: instrument.h:30
int64 shared_blks_read
Definition: instrument.h:27
int64 shared_blks_written
Definition: instrument.h:29
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:119
int index
Definition: bufmgr.c:127
int num_scanned
Definition: bufmgr.c:124
float8 progress
Definition: bufmgr.c:118
int num_to_scan
Definition: bufmgr.c:122
Oid tsId
Definition: bufmgr.c:109
struct ErrorContextCallback * previous
Definition: elog.h:296
void(* callback)(void *arg)
Definition: elog.h:297
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
Definition: dynahash.c:220
Definition: lwlock.h:42
int delayChkptFlags
Definition: proc.h:249
PgAioHandleCallbackStage stage
Definition: aio.h:219
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
int32 result
Definition: aio_types.h:113
uint32 id
Definition: aio_types.h:105
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133
PgStat_Counter buf_written_clean
Definition: pgstat.h:239
PgStat_Counter maxwritten_clean
Definition: pgstat.h:240
PgStat_Counter buf_alloc
Definition: pgstat.h:241
PgStat_Counter buffers_written
Definition: pgstat.h:263
Buffer recent_buffer
Definition: bufmgr.h:61
ForkNumber forknum
Definition: bufmgr.h:127
PgAioWaitRef io_wref
Definition: bufmgr.h:140
Buffer * buffers
Definition: bufmgr.h:135
BufferAccessStrategy strategy
Definition: bufmgr.h:128
BlockNumber blocknum
Definition: bufmgr.h:136
PgAioReturn io_return
Definition: bufmgr.h:141
struct SMgrRelationData * smgr
Definition: bufmgr.h:125
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
RelFileLocator rd_locator
Definition: rel.h:57
Form_pg_class rd_rel
Definition: rel.h:111
const char * name
Definition: resowner.h:93
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:47
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38
SMgrRelation srel
Definition: bufmgr.c:140
RelFileLocator rlocator
Definition: bufmgr.c:139
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
Oid spcOid
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1828
BlockNumber blockNum
Definition: aio_types.h:66
RelFileLocator rlocator
Definition: aio_types.h:65
BlockNumber nblocks
Definition: aio_types.h:67
struct PgAioTargetData::@124 smgr
ForkNumber forkNum
Definition: aio_types.h:68
static volatile sig_atomic_t waiting
Definition: waiteventset.c:170
bool RecoveryInProgress(void)
Definition: xlog.c:6522
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3254
CheckpointStatsData CheckpointStats
Definition: xlog.c:209
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2923
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:143
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139
#define XLogIsNeeded()
Definition: xlog.h:109
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1065
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1237
#define InHotStandby
Definition: xlogutils.h:60