#include "replication/walsender.h"
#include "storage/fd.h"
#include "storage/ipc.h"
+#include "storage/md.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/fd.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
+#include "storage/md.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/reinit.h"
#include "storage/smgr.h"
#include "storage/spin.h"
+#include "storage/sync.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/memutils.h"
if (ArchiveRecoveryRequested && IsUnderPostmaster)
{
PublishStartupProcessInformation();
- SetForwardFsyncRequests();
+ EnableSyncRequestForwarding();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
bgwriterLaunched = true;
}
- * the REDO pointer. Note that smgr must not do anything that'd have to
- * be undone if we decide no checkpoint is needed.
+ * the REDO pointer. Note that the sync machinery must not do anything
+ * that'd have to be undone if we decide no checkpoint is needed.
*/
- smgrpreckpt();
+ SyncPreCheckpoint();
/* Begin filling in the checkpoint WAL record */
MemSet(&checkPoint, 0, sizeof(checkPoint));
/*
- * Let smgr do post-checkpoint cleanup (eg, deleting old files).
+ * Let the sync machinery do post-checkpoint cleanup (eg, deleting old
+ * files).
*/
- smgrpostckpt();
+ SyncPostCheckpoint();
/*
* Update the average distance between checkpoints if the prior checkpoint
#include "storage/fd.h"
#include "storage/lmgr.h"
#include "storage/ipc.h"
+#include "storage/md.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "utils/acl.h"
* worse, it will delete files that belong to a newly created database
* with the same OID.
*/
- ForgetDatabaseFsyncRequests(db_id);
+ ForgetDatabaseSyncRequests(db_id);
/*
* Force a checkpoint to make sure the checkpointer has received the
- * message sent by ForgetDatabaseFsyncRequests. On Windows, this also
+ * message sent by ForgetDatabaseSyncRequests. On Windows, this also
* ensures that background procs don't hold any open files, which would
* cause rmdir() to fail.
*/
DropDatabaseBuffers(xlrec->db_id);
- /* Also, clean out any fsync requests that might be pending in md.c */
+ /* Also, clean out any sync requests that might be pending */
- ForgetDatabaseFsyncRequests(xlrec->db_id);
+ ForgetDatabaseSyncRequests(xlrec->db_id);
/* Clean out the xlog relcache too */
XLogDropDatabase(xlrec->db_id);
*/
typedef struct
{
- RelFileNode rnode;
- ForkNumber forknum;
- BlockNumber segno; /* see md.c for special values */
- /* might add a real request-type field later; not needed yet */
+ SyncRequestType type; /* request type */
+ FileTag ftag; /* file identifier */
} CheckpointerRequest;
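For reference, the FileTag and SyncRequestType types used in the new
CheckpointerRequest come from the new header src/include/storage/sync.h,
whose hunk is not shown in this excerpt. A minimal sketch of the
declarations it presumably provides, inferred from how they are used
elsewhere in this patch:

```c
/* Sketch of src/include/storage/sync.h (not shown in this excerpt);
 * inferred from usage in the patch, so details may differ. */
typedef enum SyncRequestType
{
	SYNC_REQUEST,				/* schedule a call of the sync function */
	SYNC_UNLINK_REQUEST,		/* schedule a call of the unlink function */
	SYNC_FORGET_REQUEST,		/* forget all requests for a given tag */
	SYNC_FILTER_REQUEST			/* forget all requests matching a filter */
} SyncRequestType;

typedef enum SyncRequestHandler
{
	SYNC_HANDLER_MD = 0			/* md.c, the only handler so far */
} SyncRequestHandler;

/*
 * FileTag identifies a file; it is used as a bytewise hash key, which is
 * why INIT_MD_FILETAG later in this patch zeroes the whole struct before
 * filling it in.
 */
typedef struct FileTag
{
	int16		handler;		/* SyncRequestHandler value */
	int16		forknum;		/* ForkNumber, in a narrower type */
	RelFileNode rnode;
	uint32		segno;
} FileTag;
```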
typedef struct
/*
* Process any requests or signals received recently.
*/
- AbsorbFsyncRequests();
+ AbsorbSyncRequests();
if (got_SIGHUP)
{
UpdateSharedMemoryConfig();
}
- AbsorbFsyncRequests();
+ AbsorbSyncRequests();
absorb_counter = WRITES_PER_ABSORB;
CheckArchiveTimeout();
* operations even when we don't sleep, to prevent overflow of the
* fsync request queue.
*/
- AbsorbFsyncRequests();
+ AbsorbSyncRequests();
absorb_counter = WRITES_PER_ABSORB;
}
}
}
/*
- * ForwardFsyncRequest
+ * ForwardSyncRequest
* Forward a file-fsync request from a backend to the checkpointer
*
* Whenever a backend is compelled to write directly to a relation
* is dirty and must be fsync'd before next checkpoint. We also use this
* opportunity to count such writes for statistical purposes.
*
- * This functionality is only supported for regular (not backend-local)
- * relations, so the rnode argument is intentionally RelFileNode not
- * RelFileNodeBackend.
- *
- * segno specifies which segment (not block!) of the relation needs to be
- * fsync'd. (Since the valid range is much less than BlockNumber, we can
- * use high values for special flags; that's all internal to md.c, which
- * see for details.)
- *
* To avoid holding the lock for longer than necessary, we normally write
* to the requests[] queue without checking for duplicates. The checkpointer
* will have to eliminate dups internally anyway. However, if we discover
* let the backend know by returning false.
*/
bool
-ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
+ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
{
CheckpointerRequest *request;
bool too_full;
return false; /* probably shouldn't even get here */
if (AmCheckpointerProcess())
- elog(ERROR, "ForwardFsyncRequest must not be called in checkpointer");
+ elog(ERROR, "ForwardSyncRequest must not be called in checkpointer");
LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
/* OK, insert request */
request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
- request->rnode = rnode;
- request->forknum = forknum;
- request->segno = segno;
+ request->ftag = *ftag;
+ request->type = type;
/* If queue is more than half full, nudge the checkpointer to empty it */
too_full = (CheckpointerShmem->num_requests >=
}
/*
- * AbsorbFsyncRequests
- * Retrieve queued fsync requests and pass them to local smgr.
+ * AbsorbSyncRequests
+ * Retrieve queued sync requests and pass them to the sync mechanism.
*
* This is exported because it must be called during CreateCheckPoint;
* we have to be sure we have accepted all pending requests just before
* non-checkpointer processes, do nothing if not checkpointer.
*/
void
-AbsorbFsyncRequests(void)
+AbsorbSyncRequests(void)
{
CheckpointerRequest *requests = NULL;
CheckpointerRequest *request;
LWLockRelease(CheckpointerCommLock);
for (request = requests; n > 0; request++, n--)
- RememberFsyncRequest(request->rnode, request->forknum, request->segno);
+ RememberSyncRequest(&request->ftag, request->type);
END_CRIT_SECTION();
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = buffer file freespace ipc large_object lmgr page smgr
+SUBDIRS = buffer file freespace ipc large_object lmgr page smgr sync
include $(top_srcdir)/src/backend/common.mk
BufferSync(flags);
CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
- smgrsync();
+ ProcessSyncRequests();
CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
}
#include "access/xlogutils.h"
#include "access/xlog.h"
#include "pgstat.h"
-#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/bufmgr.h"
+#include "storage/md.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
+#include "storage/sync.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "pg_trace.h"
-
-/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
-#define FSYNCS_PER_ABSORB 10
-#define UNLINKS_PER_ABSORB 10
-
-/*
- * Special values for the segno arg to RememberFsyncRequest.
- *
- * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
- * fsync request from the queue if an identical, subsequent request is found.
- * See comments there before making changes here.
- */
-#define FORGET_RELATION_FSYNC (InvalidBlockNumber)
-#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
-#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
-
-/*
- * On Windows, we have to interpret EACCES as possibly meaning the same as
- * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
- * that's what you get. Ugh. This code is designed so that we don't
- * actually believe these cases are okay without further evidence (namely,
- * a pending fsync request getting canceled ... see mdsync).
- */
-#ifndef WIN32
-#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
-#else
-#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
-#endif
-
/*
* The magnetic disk storage manager keeps track of open file
* descriptors in its own descriptor pool. This is done to make it
static MemoryContext MdCxt; /* context for all MdfdVec objects */
-/*
- * In some contexts (currently, standalone backends and the checkpointer)
- * we keep track of pending fsync operations: we need to remember all relation
- * segments that have been written since the last checkpoint, so that we can
- * fsync them down to disk before completing the next checkpoint. This hash
- * table remembers the pending operations. We use a hash table mostly as
- * a convenient way of merging duplicate requests.
- *
- * We use a similar mechanism to remember no-longer-needed files that can
- * be deleted after the next checkpoint, but we use a linked list instead of
- * a hash table, because we don't expect there to be any duplicate requests.
- *
- * These mechanisms are only used for non-temp relations; we never fsync
- * temp rels, nor do we need to postpone their deletion (see comments in
- * mdunlink).
- *
- * (Regular backends do not track pending operations locally, but forward
- * them to the checkpointer.)
- */
-typedef uint16 CycleCtr; /* can be any convenient integer size */
-
-typedef struct
-{
- RelFileNode rnode; /* hash table key (must be first!) */
- CycleCtr cycle_ctr; /* mdsync_cycle_ctr of oldest request */
- /* requests[f] has bit n set if we need to fsync segment n of fork f */
- Bitmapset *requests[MAX_FORKNUM + 1];
- /* canceled[f] is true if we canceled fsyncs for fork "recently" */
- bool canceled[MAX_FORKNUM + 1];
-} PendingOperationEntry;
-
-typedef struct
-{
- RelFileNode rnode; /* the dead relation to delete */
- CycleCtr cycle_ctr; /* mdckpt_cycle_ctr when request was made */
-} PendingUnlinkEntry;
-
-static HTAB *pendingOpsTable = NULL;
-static List *pendingUnlinks = NIL;
-static MemoryContext pendingOpsCxt; /* context for the above */
-
-static CycleCtr mdsync_cycle_ctr = 0;
-static CycleCtr mdckpt_cycle_ctr = 0;
+/* Populate a file tag describing an md.c segment file. */
+#define INIT_MD_FILETAG(a,xx_rnode,xx_forknum,xx_segno) \
+( \
+ memset(&(a), 0, sizeof(FileTag)), \
+ (a).handler = SYNC_HANDLER_MD, \
+ (a).rnode = (xx_rnode), \
+ (a).forknum = (xx_forknum), \
+ (a).segno = (xx_segno) \
+)
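A note on the macro above: FileTag is used as a hash-table key that is
hashed and compared bytewise (see the HASH_BLOBS table in sync.c below),
so the leading memset zeroes any padding bytes. A typical call, mirroring
register_dirty_segment later in this patch, shown for illustration only:

```c
FileTag		tag;

/* Tag one segment of this relation fork; the macro zeroes the struct
 * first so padding bytes don't perturb bytewise hashing/comparison. */
INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno);
```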
/*** behavior for mdopen & _mdfd_getseg ***/
static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
MdfdVec *seg);
-static void register_unlink(RelFileNodeBackend rnode);
+static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno);
+static void register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno);
static void _fdvec_resize(SMgrRelation reln,
ForkNumber forknum,
int nseg);
MdCxt = AllocSetContextCreate(TopMemoryContext,
"MdSmgr",
ALLOCSET_DEFAULT_SIZES);
-
- /*
- * Create pending-operations hashtable if we need it. Currently, we need
- * it if we are standalone (not under a postmaster) or if we are a startup
- * or checkpointer auxiliary process.
- */
- if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
- {
- HASHCTL hash_ctl;
-
- /*
- * XXX: The checkpointer needs to add entries to the pending ops table
- * when absorbing fsync requests. That is done within a critical
- * section, which isn't usually allowed, but we make an exception. It
- * means that there's a theoretical possibility that you run out of
- * memory while absorbing fsync requests, which leads to a PANIC.
- * Fortunately the hash table is small so that's unlikely to happen in
- * practice.
- */
- pendingOpsCxt = AllocSetContextCreate(MdCxt,
- "Pending ops context",
- ALLOCSET_DEFAULT_SIZES);
- MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
-
- MemSet(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(RelFileNode);
- hash_ctl.entrysize = sizeof(PendingOperationEntry);
- hash_ctl.hcxt = pendingOpsCxt;
- pendingOpsTable = hash_create("Pending Ops Table",
- 100L,
- &hash_ctl,
- HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
- pendingUnlinks = NIL;
- }
-}
-
-/*
- * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
- * already created the pendingOpsTable during initialization of the startup
- * process. Calling this function drops the local pendingOpsTable so that
- * subsequent requests will be forwarded to checkpointer.
- */
-void
-SetForwardFsyncRequests(void)
-{
- /* Perform any pending fsyncs we may have queued up, then drop table */
- if (pendingOpsTable)
- {
- mdsync();
- hash_destroy(pendingOpsTable);
- }
- pendingOpsTable = NULL;
-
- /*
- * We should not have any pending unlink requests, since mdunlink doesn't
- * queue unlink requests when isRedo.
- */
- Assert(pendingUnlinks == NIL);
}
/*
void
mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
- /*
- * We have to clean out any pending fsync requests for the doomed
- * relation, else the next mdsync() will fail. There can't be any such
- * requests for a temp relation, though. We can send just one request
- * even when deleting multiple forks, since the fsync queuing code accepts
- * the "InvalidForkNumber = all forks" convention.
- */
- if (!RelFileNodeBackendIsTemp(rnode))
- ForgetRelationFsyncRequests(rnode.node, forkNum);
-
/* Now do the per-fork work */
if (forkNum == InvalidForkNumber)
{
*/
if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
{
+ /* First, forget any pending sync requests for the first segment */
+ if (!RelFileNodeBackendIsTemp(rnode))
+ register_forget_request(rnode, forkNum, 0 /* first seg */ );
+
+ /* Next unlink the file */
ret = unlink(path);
if (ret < 0 && errno != ENOENT)
ereport(WARNING,
errmsg("could not truncate file \"%s\": %m", path)));
/* Register request to unlink first segment later */
- register_unlink(rnode);
+ register_unlink_segment(rnode, forkNum, 0 /* first seg */ );
}
/*
*/
for (segno = 1;; segno++)
{
+ /*
+ * Forget any pending sync requests for this segment before we try
+ * to unlink.
+ */
+ if (!RelFileNodeBackendIsTemp(rnode))
+ register_forget_request(rnode, forkNum, segno);
+
sprintf(segpath, "%s.%u", path, segno);
if (unlink(segpath) < 0)
{
}
}
-/*
- * mdsync() -- Sync previous writes to stable storage.
- */
-void
-mdsync(void)
-{
- static bool mdsync_in_progress = false;
-
- HASH_SEQ_STATUS hstat;
- PendingOperationEntry *entry;
- int absorb_counter;
-
- /* Statistics on sync times */
- int processed = 0;
- instr_time sync_start,
- sync_end,
- sync_diff;
- uint64 elapsed;
- uint64 longest = 0;
- uint64 total_elapsed = 0;
-
- /*
- * This is only called during checkpoints, and checkpoints should only
- * occur in processes that have created a pendingOpsTable.
- */
- if (!pendingOpsTable)
- elog(ERROR, "cannot sync without a pendingOpsTable");
-
- /*
- * If we are in the checkpointer, the sync had better include all fsync
- * requests that were queued by backends up to this point. The tightest
- * race condition that could occur is that a buffer that must be written
- * and fsync'd for the checkpoint could have been dumped by a backend just
- * before it was visited by BufferSync(). We know the backend will have
- * queued an fsync request before clearing the buffer's dirtybit, so we
- * are safe as long as we do an Absorb after completing BufferSync().
- */
- AbsorbFsyncRequests();
-
- /*
- * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
- * checkpoint), we want to ignore fsync requests that are entered into the
- * hashtable after this point --- they should be processed next time,
- * instead. We use mdsync_cycle_ctr to tell old entries apart from new
- * ones: new ones will have cycle_ctr equal to the incremented value of
- * mdsync_cycle_ctr.
- *
- * In normal circumstances, all entries present in the table at this point
- * will have cycle_ctr exactly equal to the current (about to be old)
- * value of mdsync_cycle_ctr. However, if we fail partway through the
- * fsync'ing loop, then older values of cycle_ctr might remain when we
- * come back here to try again. Repeated checkpoint failures would
- * eventually wrap the counter around to the point where an old entry
- * might appear new, causing us to skip it, possibly allowing a checkpoint
- * to succeed that should not have. To forestall wraparound, any time the
- * previous mdsync() failed to complete, run through the table and
- * forcibly set cycle_ctr = mdsync_cycle_ctr.
- *
- * Think not to merge this loop with the main loop, as the problem is
- * exactly that that loop may fail before having visited all the entries.
- * From a performance point of view it doesn't matter anyway, as this path
- * will never be taken in a system that's functioning normally.
- */
- if (mdsync_in_progress)
- {
- /* prior try failed, so update any stale cycle_ctr values */
- hash_seq_init(&hstat, pendingOpsTable);
- while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
- {
- entry->cycle_ctr = mdsync_cycle_ctr;
- }
- }
-
- /* Advance counter so that new hashtable entries are distinguishable */
- mdsync_cycle_ctr++;
-
- /* Set flag to detect failure if we don't reach the end of the loop */
- mdsync_in_progress = true;
-
- /* Now scan the hashtable for fsync requests to process */
- absorb_counter = FSYNCS_PER_ABSORB;
- hash_seq_init(&hstat, pendingOpsTable);
- while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
- {
- ForkNumber forknum;
-
- /*
- * If the entry is new then don't process it this time; it might
- * contain multiple fsync-request bits, but they are all new. Note
- * "continue" bypasses the hash-remove call at the bottom of the loop.
- */
- if (entry->cycle_ctr == mdsync_cycle_ctr)
- continue;
-
- /* Else assert we haven't missed it */
- Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
-
- /*
- * Scan over the forks and segments represented by the entry.
- *
- * The bitmap manipulations are slightly tricky, because we can call
- * AbsorbFsyncRequests() inside the loop and that could result in
- * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
- * So we detach it, but if we fail we'll merge it with any new
- * requests that have arrived in the meantime.
- */
- for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
- {
- Bitmapset *requests = entry->requests[forknum];
- int segno;
-
- entry->requests[forknum] = NULL;
- entry->canceled[forknum] = false;
-
- segno = -1;
- while ((segno = bms_next_member(requests, segno)) >= 0)
- {
- int failures;
-
- /*
- * If fsync is off then we don't have to bother opening the
- * file at all. (We delay checking until this point so that
- * changing fsync on the fly behaves sensibly.)
- */
- if (!enableFsync)
- continue;
-
- /*
- * If in checkpointer, we want to absorb pending requests
- * every so often to prevent overflow of the fsync request
- * queue. It is unspecified whether newly-added entries will
- * be visited by hash_seq_search, but we don't care since we
- * don't need to process them anyway.
- */
- if (--absorb_counter <= 0)
- {
- AbsorbFsyncRequests();
- absorb_counter = FSYNCS_PER_ABSORB;
- }
-
- /*
- * The fsync table could contain requests to fsync segments
- * that have been deleted (unlinked) by the time we get to
- * them. Rather than just hoping an ENOENT (or EACCES on
- * Windows) error can be ignored, what we do on error is
- * absorb pending requests and then retry. Since mdunlink()
- * queues a "cancel" message before actually unlinking, the
- * fsync request is guaranteed to be marked canceled after the
- * absorb if it really was this case. DROP DATABASE likewise
- * has to tell us to forget fsync requests before it starts
- * deletions.
- */
- for (failures = 0;; failures++) /* loop exits at "break" */
- {
- SMgrRelation reln;
- MdfdVec *seg;
- char *path;
- int save_errno;
-
- /*
- * Find or create an smgr hash entry for this relation.
- * This may seem a bit unclean -- md calling smgr? But
- * it's really the best solution. It ensures that the
- * open file reference isn't permanently leaked if we get
- * an error here. (You may say "but an unreferenced
- * SMgrRelation is still a leak!" Not really, because the
- * only case in which a checkpoint is done by a process
- * that isn't about to shut down is in the checkpointer,
- * and it will periodically do smgrcloseall(). This fact
- * justifies our not closing the reln in the success path
- * either, which is a good thing since in non-checkpointer
- * cases we couldn't safely do that.)
- */
- reln = smgropen(entry->rnode, InvalidBackendId);
-
- /* Attempt to open and fsync the target segment */
- seg = _mdfd_getseg(reln, forknum,
- (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
- false,
- EXTENSION_RETURN_NULL
- | EXTENSION_DONT_CHECK_SIZE);
-
- INSTR_TIME_SET_CURRENT(sync_start);
-
- if (seg != NULL &&
- FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
- {
- /* Success; update statistics about sync timing */
- INSTR_TIME_SET_CURRENT(sync_end);
- sync_diff = sync_end;
- INSTR_TIME_SUBTRACT(sync_diff, sync_start);
- elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
- if (elapsed > longest)
- longest = elapsed;
- total_elapsed += elapsed;
- processed++;
- requests = bms_del_member(requests, segno);
- if (log_checkpoints)
- elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
- processed,
- FilePathName(seg->mdfd_vfd),
- (double) elapsed / 1000);
-
- break; /* out of retry loop */
- }
-
- /* Compute file name for use in message */
- save_errno = errno;
- path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
- errno = save_errno;
-
- /*
- * It is possible that the relation has been dropped or
- * truncated since the fsync request was entered.
- * Therefore, allow ENOENT, but only if we didn't fail
- * already on this file. This applies both for
- * _mdfd_getseg() and for FileSync, since fd.c might have
- * closed the file behind our back.
- *
- * XXX is there any point in allowing more than one retry?
- * Don't see one at the moment, but easy to change the
- * test here if so.
- */
- if (!FILE_POSSIBLY_DELETED(errno) ||
- failures > 0)
- {
- Bitmapset *new_requests;
-
- /*
- * We need to merge these unsatisfied requests with
- * any others that have arrived since we started.
- */
- new_requests = entry->requests[forknum];
- entry->requests[forknum] =
- bms_join(new_requests, requests);
-
- errno = save_errno;
- ereport(data_sync_elevel(ERROR),
- (errcode_for_file_access(),
- errmsg("could not fsync file \"%s\": %m",
- path)));
- }
- else
- ereport(DEBUG1,
- (errcode_for_file_access(),
- errmsg("could not fsync file \"%s\" but retrying: %m",
- path)));
- pfree(path);
-
- /*
- * Absorb incoming requests and check to see if a cancel
- * arrived for this relation fork.
- */
- AbsorbFsyncRequests();
- absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
-
- if (entry->canceled[forknum])
- break;
- } /* end retry loop */
- }
- bms_free(requests);
- }
-
- /*
- * We've finished everything that was requested before we started to
- * scan the entry. If no new requests have been inserted meanwhile,
- * remove the entry. Otherwise, update its cycle counter, as all the
- * requests now in it must have arrived during this cycle.
- */
- for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
- {
- if (entry->requests[forknum] != NULL)
- break;
- }
- if (forknum <= MAX_FORKNUM)
- entry->cycle_ctr = mdsync_cycle_ctr;
- else
- {
- /* Okay to remove it */
- if (hash_search(pendingOpsTable, &entry->rnode,
- HASH_REMOVE, NULL) == NULL)
- elog(ERROR, "pendingOpsTable corrupted");
- }
- } /* end loop over hashtable entries */
-
- /* Return sync performance metrics for report at checkpoint end */
- CheckpointStats.ckpt_sync_rels = processed;
- CheckpointStats.ckpt_longest_sync = longest;
- CheckpointStats.ckpt_agg_sync_time = total_elapsed;
-
- /* Flag successful completion of mdsync */
- mdsync_in_progress = false;
-}
-
-/*
- * mdpreckpt() -- Do pre-checkpoint work
- *
- * To distinguish unlink requests that arrived before this checkpoint
- * started from those that arrived during the checkpoint, we use a cycle
- * counter similar to the one we use for fsync requests. That cycle
- * counter is incremented here.
- *
- * This must be called *before* the checkpoint REDO point is determined.
- * That ensures that we won't delete files too soon.
- *
- * Note that we can't do anything here that depends on the assumption
- * that the checkpoint will be completed.
- */
-void
-mdpreckpt(void)
-{
- /*
- * Any unlink requests arriving after this point will be assigned the next
- * cycle counter, and won't be unlinked until next checkpoint.
- */
- mdckpt_cycle_ctr++;
-}
-
-/*
- * mdpostckpt() -- Do post-checkpoint work
- *
- * Remove any lingering files that can now be safely removed.
- */
-void
-mdpostckpt(void)
-{
- int absorb_counter;
-
- absorb_counter = UNLINKS_PER_ABSORB;
- while (pendingUnlinks != NIL)
- {
- PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
- char *path;
-
- /*
- * New entries are appended to the end, so if the entry is new we've
- * reached the end of old entries.
- *
- * Note: if just the right number of consecutive checkpoints fail, we
- * could be fooled here by cycle_ctr wraparound. However, the only
- * consequence is that we'd delay unlinking for one more checkpoint,
- * which is perfectly tolerable.
- */
- if (entry->cycle_ctr == mdckpt_cycle_ctr)
- break;
-
- /* Unlink the file */
- path = relpathperm(entry->rnode, MAIN_FORKNUM);
- if (unlink(path) < 0)
- {
- /*
- * There's a race condition, when the database is dropped at the
- * same time that we process the pending unlink requests. If the
- * DROP DATABASE deletes the file before we do, we will get ENOENT
- * here. rmtree() also has to ignore ENOENT errors, to deal with
- * the possibility that we delete the file first.
- */
- if (errno != ENOENT)
- ereport(WARNING,
- (errcode_for_file_access(),
- errmsg("could not remove file \"%s\": %m", path)));
- }
- pfree(path);
-
- /* And remove the list entry */
- pendingUnlinks = list_delete_first(pendingUnlinks);
- pfree(entry);
-
- /*
- * As in mdsync, we don't want to stop absorbing fsync requests for a
- * long time when there are many deletions to be done. We can safely
- * call AbsorbFsyncRequests() at this point in the loop (note it might
- * try to delete list entries).
- */
- if (--absorb_counter <= 0)
- {
- AbsorbFsyncRequests();
- absorb_counter = UNLINKS_PER_ABSORB;
- }
- }
-}
-
/*
* register_dirty_segment() -- Mark a relation segment as needing fsync
*
* If there is a local pending-ops table, just make an entry in it for
- * mdsync to process later. Otherwise, try to pass off the fsync request
- * to the checkpointer process. If that fails, just do the fsync
- * locally before returning (we hope this will not happen often enough
- * to be a performance problem).
+ * ProcessSyncRequests to process later. Otherwise, try to pass off the
+ * fsync request to the checkpointer process. If that fails, just do the
+ * fsync locally before returning (we hope this will not happen often
+ * enough to be a performance problem).
*/
static void
register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
+ FileTag tag;
+
+ INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno);
+
/* Temp relations should never be fsync'd */
Assert(!SmgrIsTemp(reln));
- if (pendingOpsTable)
+ if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
{
- /* push it into local pending-ops table */
- RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
- }
- else
- {
- if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
- return; /* passed it off successfully */
-
ereport(DEBUG1,
(errmsg("could not forward fsync request because request queue is full")));
/*
- * register_unlink() -- Schedule a file to be deleted after next checkpoint
+ * register_unlink_segment() -- Schedule a file to be deleted after next
+ * checkpoint.
- *
- * We don't bother passing in the fork number, because this is only used
- * with main forks.
- *
- * As with register_dirty_segment, this could involve either a local or
- * a remote pending-ops table.
*/
static void
-register_unlink(RelFileNodeBackend rnode)
+register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno)
{
+ FileTag tag;
+
+ INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
+
/* Should never be used with temp relations */
Assert(!RelFileNodeBackendIsTemp(rnode));
- if (pendingOpsTable)
- {
- /* push it into local pending-ops table */
- RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
- UNLINK_RELATION_REQUEST);
- }
- else
- {
- /*
- * Notify the checkpointer about it. If we fail to queue the request
- * message, we have to sleep and try again, because we can't simply
- * delete the file now. Ugly, but hopefully won't happen often.
- *
- * XXX should we just leave the file orphaned instead?
- */
- Assert(IsUnderPostmaster);
- while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
- UNLINK_RELATION_REQUEST))
- pg_usleep(10000L); /* 10 msec seems a good number */
- }
+ RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
}
/*
- * RememberFsyncRequest() -- callback from checkpointer side of fsync request
- *
- * We stuff fsync requests into the local hash table for execution
- * during the checkpointer's next checkpoint. UNLINK requests go into a
- * separate linked list, however, because they get processed separately.
- *
- * The range of possible segment numbers is way less than the range of
- * BlockNumber, so we can reserve high values of segno for special purposes.
- * We define three:
- * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
- * either for one fork, or all forks if forknum is InvalidForkNumber
- * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
- * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
- * checkpoint.
- * Note also that we're assuming real segment numbers don't exceed INT_MAX.
- *
- * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
- * table has to be searched linearly, but dropping a database is a pretty
- * heavyweight operation anyhow, so we'll live with it.)
+ * register_forget_request() -- forget any fsyncs for a relation fork's segment
*/
-void
-RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
+static void
+register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno)
{
- Assert(pendingOpsTable);
-
- if (segno == FORGET_RELATION_FSYNC)
- {
- /* Remove any pending requests for the relation (one or all forks) */
- PendingOperationEntry *entry;
-
- entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
- &rnode,
- HASH_FIND,
- NULL);
- if (entry)
- {
- /*
- * We can't just delete the entry since mdsync could have an
- * active hashtable scan. Instead we delete the bitmapsets; this
- * is safe because of the way mdsync is coded. We also set the
- * "canceled" flags so that mdsync can tell that a cancel arrived
- * for the fork(s).
- */
- if (forknum == InvalidForkNumber)
- {
- /* remove requests for all forks */
- for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
- {
- bms_free(entry->requests[forknum]);
- entry->requests[forknum] = NULL;
- entry->canceled[forknum] = true;
- }
- }
- else
- {
- /* remove requests for single fork */
- bms_free(entry->requests[forknum]);
- entry->requests[forknum] = NULL;
- entry->canceled[forknum] = true;
- }
- }
- }
- else if (segno == FORGET_DATABASE_FSYNC)
- {
- /* Remove any pending requests for the entire database */
- HASH_SEQ_STATUS hstat;
- PendingOperationEntry *entry;
- ListCell *cell,
- *prev,
- *next;
-
- /* Remove fsync requests */
- hash_seq_init(&hstat, pendingOpsTable);
- while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
- {
- if (entry->rnode.dbNode == rnode.dbNode)
- {
- /* remove requests for all forks */
- for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
- {
- bms_free(entry->requests[forknum]);
- entry->requests[forknum] = NULL;
- entry->canceled[forknum] = true;
- }
- }
- }
-
- /* Remove unlink requests */
- prev = NULL;
- for (cell = list_head(pendingUnlinks); cell; cell = next)
- {
- PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
-
- next = lnext(cell);
- if (entry->rnode.dbNode == rnode.dbNode)
- {
- pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
- pfree(entry);
- }
- else
- prev = cell;
- }
- }
- else if (segno == UNLINK_RELATION_REQUEST)
- {
- /* Unlink request: put it in the linked list */
- MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
- PendingUnlinkEntry *entry;
-
- /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
- Assert(forknum == MAIN_FORKNUM);
-
- entry = palloc(sizeof(PendingUnlinkEntry));
- entry->rnode = rnode;
- entry->cycle_ctr = mdckpt_cycle_ctr;
-
- pendingUnlinks = lappend(pendingUnlinks, entry);
-
- MemoryContextSwitchTo(oldcxt);
- }
- else
- {
- /* Normal case: enter a request to fsync this segment */
- MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
- PendingOperationEntry *entry;
- bool found;
-
- entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
- &rnode,
- HASH_ENTER,
- &found);
- /* if new entry, initialize it */
- if (!found)
- {
- entry->cycle_ctr = mdsync_cycle_ctr;
- MemSet(entry->requests, 0, sizeof(entry->requests));
- MemSet(entry->canceled, 0, sizeof(entry->canceled));
- }
-
- /*
- * NB: it's intentional that we don't change cycle_ctr if the entry
- * already exists. The cycle_ctr must represent the oldest fsync
- * request that could be in the entry.
- */
-
- entry->requests[forknum] = bms_add_member(entry->requests[forknum],
- (int) segno);
-
- MemoryContextSwitchTo(oldcxt);
- }
-}
+ FileTag tag;
-/*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
- *
- * forknum == InvalidForkNumber means all forks, although this code doesn't
- * actually know that, since it's just forwarding the request elsewhere.
- */
-void
-ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
-{
- if (pendingOpsTable)
- {
- /* standalone backend or startup process: fsync state is local */
- RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
- }
- else if (IsUnderPostmaster)
- {
- /*
- * Notify the checkpointer about it. If we fail to queue the cancel
- * message, we have to sleep and try again ... ugly, but hopefully
- * won't happen often.
- *
- * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
- * error would leave the no-longer-used file still present on disk,
- * which would be bad, so I'm inclined to assume that the checkpointer
- * will always empty the queue soon.
- */
- while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
- pg_usleep(10000L); /* 10 msec seems a good number */
+ INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
- /*
- * Note we don't wait for the checkpointer to actually absorb the
- * cancel message; see mdsync() for the implications.
- */
- }
+ RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
}
/*
- * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
+ * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
*/
void
-ForgetDatabaseFsyncRequests(Oid dbid)
+ForgetDatabaseSyncRequests(Oid dbid)
{
+ FileTag tag;
RelFileNode rnode;
rnode.dbNode = dbid;
rnode.spcNode = 0;
rnode.relNode = 0;
- if (pendingOpsTable)
- {
- /* standalone backend or startup process: fsync state is local */
- RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
- }
- else if (IsUnderPostmaster)
- {
- /* see notes in ForgetRelationFsyncRequests */
- while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
- FORGET_DATABASE_FSYNC))
- pg_usleep(10000L); /* 10 msec seems a good number */
- }
+ INIT_MD_FILETAG(tag, rnode, InvalidForkNumber, InvalidBlockNumber);
+
+ RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
}
/*
/* note that this calculation will ignore any partial block at EOF */
return (BlockNumber) (len / BLCKSZ);
}
+
+/*
+ * Sync a file to disk, given a file tag. Write the path into an output
+ * buffer so the caller can use it in error messages.
+ *
+ * Return 0 on success, -1 on failure, with errno set.
+ */
+int
+mdsyncfiletag(const FileTag *ftag, char *path)
+{
+ SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId);
+ MdfdVec *v;
+ char *p;
+
+ /* Provide the path for informational messages. */
+ p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
+ strlcpy(path, p, MAXPGPATH);
+ pfree(p);
+
+	/* Try to open the requested segment. */
+	v = _mdfd_getseg(reln, ftag->forknum,
+					 ftag->segno * (BlockNumber) RELSEG_SIZE, false,
+					 EXTENSION_RETURN_NULL);
+ if (v == NULL)
+ {
+ errno = ENOENT;
+ return -1;
+ }
+
+ /* Try to fsync the file. */
+ return FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC);
+}
+
+/*
+ * Unlink a file, given a file tag. Write the path into an output
+ * buffer so the caller can use it in error messages.
+ *
+ * Return 0 on success, -1 on failure, with errno set.
+ */
+int
+mdunlinkfiletag(const FileTag *ftag, char *path)
+{
+ SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId);
+ char *p;
+
+ /* Compute the path. */
+ p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
+ strlcpy(path, p, MAXPGPATH);
+ pfree(p);
+
+ /* Try to unlink the file. */
+ return unlink(path);
+}
+
+/*
+ * Check if a given candidate request matches a given tag, when processing
+ * a SYNC_FILTER_REQUEST request. This will be called for all pending
+ * requests to find out whether to forget them.
+ */
+bool
+mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
+{
+ /*
+ * For now we only use filter requests as a way to drop all scheduled
+ * callbacks relating to a given database, when dropping the database.
+ * We'll return true for all candidates that have the same database OID as
+ * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
+ */
+ return ftag->rnode.dbNode == candidate->rnode.dbNode;
+}
#include "lib/ilist.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
+#include "storage/md.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
- void (*smgr_pre_ckpt) (void); /* may be NULL */
- void (*smgr_sync) (void); /* may be NULL */
- void (*smgr_post_ckpt) (void); /* may be NULL */
} f_smgr;
-
static const f_smgr smgrsw[] = {
/* magnetic disk */
{
.smgr_nblocks = mdnblocks,
.smgr_truncate = mdtruncate,
.smgr_immedsync = mdimmedsync,
- .smgr_pre_ckpt = mdpreckpt,
- .smgr_sync = mdsync,
- .smgr_post_ckpt = mdpostckpt
}
};
static const int NSmgr = lengthof(smgrsw);
-
/*
* Each backend has a hashtable that stores all extant SMgrRelation objects.
* In addition, "unowned" SMgrRelation objects are chained together in a list.
smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
}
-
-/*
- * smgrpreckpt() -- Prepare for checkpoint.
- */
-void
-smgrpreckpt(void)
-{
- int i;
-
- for (i = 0; i < NSmgr; i++)
- {
- if (smgrsw[i].smgr_pre_ckpt)
- smgrsw[i].smgr_pre_ckpt();
- }
-}
-
-/*
- * smgrsync() -- Sync files to disk during checkpoint.
- */
-void
-smgrsync(void)
-{
- int i;
-
- for (i = 0; i < NSmgr; i++)
- {
- if (smgrsw[i].smgr_sync)
- smgrsw[i].smgr_sync();
- }
-}
-
-/*
- * smgrpostckpt() -- Post-checkpoint cleanup.
- */
-void
-smgrpostckpt(void)
-{
- int i;
-
- for (i = 0; i < NSmgr; i++)
- {
- if (smgrsw[i].smgr_post_ckpt)
- smgrsw[i].smgr_post_ckpt();
- }
-}
-
/*
* AtEOXact_SMgr
*
--- /dev/null
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/sync
+#
+# IDENTIFICATION
+# src/backend/storage/sync/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/sync
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = sync.o
+
+include $(top_srcdir)/src/backend/common.mk
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * sync.c
+ * File synchronization management code.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/sync/sync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "access/xlogutils.h"
+#include "access/xlog.h"
+#include "commands/tablespace.h"
+#include "portability/instr_time.h"
+#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/md.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "utils/inval.h"
+
+/*
+ * In some contexts (currently, standalone backends and the checkpointer)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint. This hash
+ * table remembers the pending operations. We use a hash table mostly as
+ * a convenient way of merging duplicate requests.
+ *
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after the next checkpoint, but we use a linked list instead of
+ * a hash table, because we don't expect there to be any duplicate requests.
+ *
+ * These mechanisms are only used for non-temp relations; we never fsync
+ * temp rels, nor do we need to postpone their deletion (see comments in
+ * mdunlink).
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the checkpointer.)
+ */
+typedef uint16 CycleCtr; /* can be any convenient integer size */
+
+typedef struct
+{
+ FileTag tag; /* identifies handler and file */
+ CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */
+ bool canceled; /* canceled is true if we canceled "recently" */
+} PendingFsyncEntry;
+
+typedef struct
+{
+ FileTag tag; /* identifies handler and file */
+ CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */
+} PendingUnlinkEntry;
+
+static HTAB *pendingOps = NULL;
+static List *pendingUnlinks = NIL;
+static MemoryContext pendingOpsCxt; /* context for the above */
+
+static CycleCtr sync_cycle_ctr = 0;
+static CycleCtr checkpoint_cycle_ctr = 0;
+
+/* Intervals for calling AbsorbSyncRequests */
+#define FSYNCS_PER_ABSORB 10
+#define UNLINKS_PER_ABSORB 10
+
+/*
+ * Function pointers for handling sync and unlink requests.
+ */
+typedef struct SyncOps
+{
+ int (*sync_syncfiletag) (const FileTag *ftag, char *path);
+ int (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
+ bool (*sync_filetagmatches) (const FileTag *ftag,
+ const FileTag *candidate);
+} SyncOps;
+
+static const SyncOps syncsw[] = {
+ /* magnetic disk */
+ {
+ .sync_syncfiletag = mdsyncfiletag,
+ .sync_unlinkfiletag = mdunlinkfiletag,
+ .sync_filetagmatches = mdfiletagmatches
+ }
+};
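This handler table is what opens the sync machinery to components other
than md.c. A hypothetical second handler -- the SYNC_HANDLER_FOO and foo*
names are invented for illustration and are not part of this patch --
would plug in like this:

```c
/* Hypothetical sketch only: a new component would add SYNC_HANDLER_FOO
 * to SyncRequestHandler in sync.h, stamp tag.handler with that value
 * when registering requests, and supply its callbacks here so that
 * sync.c can route each request back to the right module. */
static const SyncOps syncsw[] = {
	[SYNC_HANDLER_MD] = {
		.sync_syncfiletag = mdsyncfiletag,
		.sync_unlinkfiletag = mdunlinkfiletag,
		.sync_filetagmatches = mdfiletagmatches
	},
	[SYNC_HANDLER_FOO] = {
		.sync_syncfiletag = foosyncfiletag,
		.sync_unlinkfiletag = foounlinkfiletag,
		.sync_filetagmatches = foofiletagmatches
	}
};
```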
+
+/*
+ * Initialize data structures for the file sync tracking.
+ */
+void
+InitSync(void)
+{
+ /*
+ * Create pending-operations hashtable if we need it. Currently, we need
+ * it if we are standalone (not under a postmaster) or if we are a startup
+ * or checkpointer auxiliary process.
+ */
+ if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
+ {
+ HASHCTL hash_ctl;
+
+ /*
+ * XXX: The checkpointer needs to add entries to the pending ops table
+ * when absorbing fsync requests. That is done within a critical
+ * section, which isn't usually allowed, but we make an exception. It
+ * means that there's a theoretical possibility that you run out of
+ * memory while absorbing fsync requests, which leads to a PANIC.
+ * Fortunately the hash table is small so that's unlikely to happen in
+ * practice.
+ */
+ pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
+ "Pending ops context",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
+
+ MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(FileTag);
+ hash_ctl.entrysize = sizeof(PendingFsyncEntry);
+ hash_ctl.hcxt = pendingOpsCxt;
+ pendingOps = hash_create("Pending Ops Table",
+ 100L,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ pendingUnlinks = NIL;
+ }
+}
+
+/*
+ * SyncPreCheckpoint() -- Do pre-checkpoint work
+ *
+ * To distinguish unlink requests that arrived before this checkpoint
+ * started from those that arrived during the checkpoint, we use a cycle
+ * counter similar to the one we use for fsync requests. That cycle
+ * counter is incremented here.
+ *
+ * This must be called *before* the checkpoint REDO point is determined.
+ * That ensures that we won't delete files too soon.
+ *
+ * Note that we can't do anything here that depends on the assumption
+ * that the checkpoint will be completed.
+ */
+void
+SyncPreCheckpoint(void)
+{
+ /*
+ * Any unlink requests arriving after this point will be assigned the next
+ * cycle counter, and won't be unlinked until next checkpoint.
+ */
+ checkpoint_cycle_ctr++;
+}
+
+/*
+ * SyncPostCheckpoint() -- Do post-checkpoint work
+ *
+ * Remove any lingering files that can now be safely removed.
+ */
+void
+SyncPostCheckpoint(void)
+{
+ int absorb_counter;
+
+ absorb_counter = UNLINKS_PER_ABSORB;
+ while (pendingUnlinks != NIL)
+ {
+ PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
+ char path[MAXPGPATH];
+
+ /*
+ * New entries are appended to the end, so if the entry is new we've
+ * reached the end of old entries.
+ *
+ * Note: if just the right number of consecutive checkpoints fail, we
+ * could be fooled here by cycle_ctr wraparound. However, the only
+ * consequence is that we'd delay unlinking for one more checkpoint,
+ * which is perfectly tolerable.
+ */
+ if (entry->cycle_ctr == checkpoint_cycle_ctr)
+ break;
+
+ /* Unlink the file */
+ if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
+ path) < 0)
+ {
+ /*
+ * There's a race condition, when the database is dropped at the
+ * same time that we process the pending unlink requests. If the
+ * DROP DATABASE deletes the file before we do, we will get ENOENT
+ * here. rmtree() also has to ignore ENOENT errors, to deal with
+ * the possibility that we delete the file first.
+ */
+ if (errno != ENOENT)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", path)));
+ }
+
+ /* And remove the list entry */
+ pendingUnlinks = list_delete_first(pendingUnlinks);
+ pfree(entry);
+
+ /*
+	 * As in ProcessSyncRequests, we don't want to stop absorbing fsync
+	 * requests for a long time when there are many deletions to be done.
+	 * We can safely call AbsorbSyncRequests() at this point in the loop
+ * (note it might try to delete list entries).
+ */
+ if (--absorb_counter <= 0)
+ {
+ AbsorbSyncRequests();
+ absorb_counter = UNLINKS_PER_ABSORB;
+ }
+ }
+}
+
+/*
+ * ProcessSyncRequests() -- Process queued fsync requests.
+ */
+void
+ProcessSyncRequests(void)
+{
+ static bool sync_in_progress = false;
+
+ HASH_SEQ_STATUS hstat;
+ PendingFsyncEntry *entry;
+ int absorb_counter;
+
+ /* Statistics on sync times */
+ int processed = 0;
+ instr_time sync_start,
+ sync_end,
+ sync_diff;
+ uint64 elapsed;
+ uint64 longest = 0;
+ uint64 total_elapsed = 0;
+
+ /*
+ * This is only called during checkpoints, and checkpoints should only
+	 * occur in processes that have created a pendingOps table.
+ */
+ if (!pendingOps)
+ elog(ERROR, "cannot sync without a pendingOps table");
+
+ /*
+ * If we are in the checkpointer, the sync had better include all fsync
+ * requests that were queued by backends up to this point. The tightest
+ * race condition that could occur is that a buffer that must be written
+ * and fsync'd for the checkpoint could have been dumped by a backend just
+ * before it was visited by BufferSync(). We know the backend will have
+ * queued an fsync request before clearing the buffer's dirtybit, so we
+ * are safe as long as we do an Absorb after completing BufferSync().
+ */
+ AbsorbSyncRequests();
+
+ /*
+ * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
+ * checkpoint), we want to ignore fsync requests that are entered into the
+ * hashtable after this point --- they should be processed next time,
+ * instead. We use sync_cycle_ctr to tell old entries apart from new
+ * ones: new ones will have cycle_ctr equal to the incremented value of
+ * sync_cycle_ctr.
+ *
+ * In normal circumstances, all entries present in the table at this point
+ * will have cycle_ctr exactly equal to the current (about to be old)
+ * value of sync_cycle_ctr. However, if we fail partway through the
+ * fsync'ing loop, then older values of cycle_ctr might remain when we
+ * come back here to try again. Repeated checkpoint failures would
+ * eventually wrap the counter around to the point where an old entry
+ * might appear new, causing us to skip it, possibly allowing a checkpoint
+ * to succeed that should not have. To forestall wraparound, any time the
+	 * previous ProcessSyncRequests() failed to complete, run through the
+ * table and forcibly set cycle_ctr = sync_cycle_ctr.
+ *
+ * Think not to merge this loop with the main loop, as the problem is
+ * exactly that that loop may fail before having visited all the entries.
+ * From a performance point of view it doesn't matter anyway, as this path
+ * will never be taken in a system that's functioning normally.
+ */
+ if (sync_in_progress)
+ {
+ /* prior try failed, so update any stale cycle_ctr values */
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ entry->cycle_ctr = sync_cycle_ctr;
+ }
+ }
+
+ /* Advance counter so that new hashtable entries are distinguishable */
+ sync_cycle_ctr++;
+
+ /* Set flag to detect failure if we don't reach the end of the loop */
+ sync_in_progress = true;
+
+ /* Now scan the hashtable for fsync requests to process */
+ absorb_counter = FSYNCS_PER_ABSORB;
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ int failures;
+
+ /*
+ * If fsync is off then we don't have to bother opening the file at
+ * all. (We delay checking until this point so that changing fsync on
+ * the fly behaves sensibly.)
+ */
+ if (!enableFsync)
+ continue;
+
+ /*
+		 * If the entry is new then don't process it this time; it was
+		 * entered after this sync cycle started. Note "continue" bypasses
+		 * the hash-remove call at the bottom of the loop.
+ */
+ if (entry->cycle_ctr == sync_cycle_ctr)
+ continue;
+
+ /* Else assert we haven't missed it */
+ Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
+
+ /*
+ * If in checkpointer, we want to absorb pending requests every so
+ * often to prevent overflow of the fsync request queue. It is
+ * unspecified whether newly-added entries will be visited by
+ * hash_seq_search, but we don't care since we don't need to process
+ * them anyway.
+ */
+ if (--absorb_counter <= 0)
+ {
+ AbsorbSyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB;
+ }
+
+ /*
+ * The fsync table could contain requests to fsync segments that have
+ * been deleted (unlinked) by the time we get to them. Rather than
+ * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
+ * what we do on error is absorb pending requests and then retry.
+ * Since mdunlink() queues a "cancel" message before actually
+ * unlinking, the fsync request is guaranteed to be marked canceled
+ * after the absorb if it really was this case. DROP DATABASE likewise
+ * has to tell us to forget fsync requests before it starts deletions.
+ */
+ for (failures = 0; !entry->canceled; failures++)
+ {
+ char path[MAXPGPATH];
+
+ INSTR_TIME_SET_CURRENT(sync_start);
+ if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
+ path) == 0)
+ {
+ /* Success; update statistics about sync timing */
+ INSTR_TIME_SET_CURRENT(sync_end);
+ sync_diff = sync_end;
+ INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+ elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+ if (elapsed > longest)
+ longest = elapsed;
+ total_elapsed += elapsed;
+ processed++;
+
+ if (log_checkpoints)
+ elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
+ processed,
+ path,
+ (double) elapsed / 1000);
+
+ break; /* out of retry loop */
+ }
+
+ /*
+ * It is possible that the relation has been dropped or truncated
+ * since the fsync request was entered. Therefore, allow ENOENT,
+ * but only if we didn't fail already on this file.
+ */
+ if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ path)));
+ else
+ ereport(DEBUG1,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\" but retrying: %m",
+ path)));
+
+ /*
+ * Absorb incoming requests and check to see if a cancel arrived
+ * for this relation fork.
+ */
+ AbsorbSyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
+ } /* end retry loop */
+
+ /* We are done with this entry, remove it */
+ if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "pendingOps corrupted");
+ } /* end loop over hashtable entries */
+
+ /* Return sync performance metrics for report at checkpoint end */
+ CheckpointStats.ckpt_sync_rels = processed;
+ CheckpointStats.ckpt_longest_sync = longest;
+ CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
+ /* Flag successful completion of ProcessSyncRequests */
+ sync_in_progress = false;
+}
+
+/*
+ * RememberSyncRequest() -- callback from checkpointer side of sync request
+ *
+ * We stuff fsync requests into the local hash table for execution
+ * during the checkpointer's next checkpoint. UNLINK requests go into a
+ * separate linked list, however, because they get processed separately.
+ *
+ * See sync.h for more information on the types of sync requests supported.
+ */
+void
+RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
+{
+ Assert(pendingOps);
+
+ if (type == SYNC_FORGET_REQUEST)
+ {
+ PendingFsyncEntry *entry;
+
+ /* Cancel previously entered request */
+ entry = (PendingFsyncEntry *) hash_search(pendingOps,
+ (void *) ftag,
+ HASH_FIND,
+ NULL);
+ if (entry != NULL)
+ entry->canceled = true;
+ }
+ else if (type == SYNC_FILTER_REQUEST)
+ {
+ HASH_SEQ_STATUS hstat;
+ PendingFsyncEntry *entry;
+ ListCell *cell,
+ *prev,
+ *next;
+
+ /* Cancel matching fsync requests */
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ if (entry->tag.handler == ftag->handler &&
+ syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
+ entry->canceled = true;
+ }
+
+ /* Remove matching unlink requests */
+ prev = NULL;
+ for (cell = list_head(pendingUnlinks); cell; cell = next)
+ {
+ PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
+
+ next = lnext(cell);
+ if (entry->tag.handler == ftag->handler &&
+ syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
+ {
+ pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
+ pfree(entry);
+ }
+ else
+ prev = cell;
+ }
+ }
+ else if (type == SYNC_UNLINK_REQUEST)
+ {
+ /* Unlink request: put it in the linked list */
+ MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+ PendingUnlinkEntry *entry;
+
+ entry = palloc(sizeof(PendingUnlinkEntry));
+ entry->tag = *ftag;
+ entry->cycle_ctr = checkpoint_cycle_ctr;
+
+ pendingUnlinks = lappend(pendingUnlinks, entry);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+ else
+ {
+ /* Normal case: enter a request to fsync this segment */
+ MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+ PendingFsyncEntry *entry;
+ bool found;
+
+ Assert(type == SYNC_REQUEST);
+
+ entry = (PendingFsyncEntry *) hash_search(pendingOps,
+ (void *) ftag,
+ HASH_ENTER,
+ &found);
+ /* if new entry, initialize it */
+ if (!found)
+ {
+ entry->cycle_ctr = sync_cycle_ctr;
+ entry->canceled = false;
+ }
+
+ /*
+ * NB: it's intentional that we don't change cycle_ctr if the entry
+ * already exists. The cycle_ctr must represent the oldest fsync
+ * request that could be in the entry.
+ */
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+}
+
+/*
+ * Register the sync request locally, or forward it to the checkpointer.
+ *
+ * If retryOnError is true, we'll keep trying if there is no space in the
+ * queue. Return true if we succeeded, or false if there wasn't space.
+ */
+bool
+RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
+ bool retryOnError)
+{
+ bool ret;
+
+ if (pendingOps != NULL)
+ {
+ /* standalone backend or startup process: fsync state is local */
+ RememberSyncRequest(ftag, type);
+ return true;
+ }
+
+ for (;;)
+ {
+ /*
+ * Notify the checkpointer about it. If we fail to queue a message
+ * in retryOnError mode, we have to sleep and try again ... ugly, but
+ * hopefully won't happen often.
+ *
+ * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
+ * error in the case of SYNC_UNLINK_REQUEST would leave the
+ * no-longer-used file still present on disk, which would be bad, so
+ * I'm inclined to assume that the checkpointer will always empty the
+ * queue soon.
+ */
+ ret = ForwardSyncRequest(ftag, type);
+
+ /*
+ * Break if we succeeded in queueing the request, or if we failed
+ * and were told not to retry.
+ */
+ if (ret || !retryOnError)
+ break;
+
+ pg_usleep(10000L);
+ }
+
+ return ret;
+}
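
To make the intended division of labor concrete, here is a hypothetical md.c caller (a sketch, not part of the patch; the helper name is invented, and it assumes md.c's internal MdfdVec type and the usual smgr/fd includes): it tries to hand the fsync off to the checkpointer without blocking, and falls back to syncing the file itself if the request queue is full.

/*
 * Sketch: register a dirty segment with the checkpointer, or fsync it
 * ourselves if the request queue has no room.
 */
static void
register_dirty_segment_sketch(SMgrRelation reln, ForkNumber forknum,
							  MdfdVec *seg)
{
	FileTag		tag;

	memset(&tag, 0, sizeof(tag));	/* tags are hashed; zero the padding */
	tag.handler = SYNC_HANDLER_MD;
	tag.forknum = forknum;
	tag.rnode = reln->smgr_rnode.node;
	tag.segno = seg->mdfd_segno;

	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
	{
		/* Queue full: do the fsync here rather than sleeping in a hot path */
		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(seg->mdfd_vfd))));
	}
}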
+
+/*
+ * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
+ * already created the pendingOps during initialization of the startup
+ * process. Calling this function drops the local pendingOps so that
+ * subsequent requests will be forwarded to checkpointer.
+ */
+void
+EnableSyncRequestForwarding(void)
+{
+ /* Perform any pending fsyncs we may have queued up, then drop the table */
+ if (pendingOps)
+ {
+ ProcessSyncRequests();
+ hash_destroy(pendingOps);
+ }
+ pendingOps = NULL;
+
+ /*
+ * We should not have any pending unlink requests, since mdunlink doesn't
+ * queue unlink requests when isRedo.
+ */
+ Assert(pendingUnlinks == NIL);
+}
#include "storage/proc.h"
#include "storage/sinvaladt.h"
#include "storage/smgr.h"
+#include "storage/sync.h"
#include "tcop/tcopprot.h"
#include "utils/acl.h"
#include "utils/fmgroids.h"
/* Do local initialization of file, storage and buffer managers */
InitFileAccess();
+ InitSync();
smgrinit();
InitBufferPoolAccess();
}
#include "storage/block.h"
#include "storage/relfilenode.h"
+#include "storage/smgr.h"
+#include "storage/sync.h"
/* GUC options */
extern void RequestCheckpoint(int flags);
extern void CheckpointWriteDelay(int flags, double progress);
-extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
- BlockNumber segno);
-extern void AbsorbFsyncRequests(void);
+extern bool ForwardSyncRequest(const FileTag *ftag, SyncRequestType type);
+
+extern void AbsorbSyncRequests(void);
extern Size CheckpointerShmemSize(void);
extern void CheckpointerShmemInit(void);
*/
extern int max_safe_fds;
+/*
+ * On Windows, we have to interpret EACCES as possibly meaning the same as
+ * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
+ * that's what you get. Ugh. This code is designed so that we don't
+ * actually believe these cases are okay without further evidence (namely,
+ * a pending fsync request getting canceled ... see ProcessSyncRequests).
+ */
+#ifndef WIN32
+#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
+#else
+#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
+#endif
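
As a sketch of the intended discipline (illustrative, not part of the patch; the helper name is invented, and pg_fsync is assumed from fd.c): a possibly-deleted file is forgiven only on the first attempt, so the checkpointer gets one chance to absorb a pending cancel request before the failure is treated as a real error.

/*
 * Sketch: sync one file descriptor, tolerating a possibly-deleted file on
 * the first attempt only.  Returns true on success; false means "retry
 * after absorbing more requests"; otherwise errors out.
 */
static bool
sync_file_once_sketch(const char *path, int fd, int prior_failures)
{
	if (pg_fsync(fd) == 0)
		return true;			/* synced successfully */

	if (FILE_POSSIBLY_DELETED(errno) && prior_failures == 0)
		return false;			/* may have been unlinked; check for cancel */

	ereport(ERROR,
			(errcode_for_file_access(),
			 errmsg("could not fsync file \"%s\": %m", path)));
	return false;				/* keep compiler quiet */
}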
/*
* prototypes for functions in fd.c
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * md.h
+ * magnetic disk storage manager public interface declarations.
+ *
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/md.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MD_H
+#define MD_H
+
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+#include "storage/smgr.h"
+#include "storage/sync.h"
+
+/* md storage manager functionality */
+extern void mdinit(void);
+extern void mdclose(SMgrRelation reln, ForkNumber forknum);
+extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
+extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
+extern void mdextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum);
+extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer);
+extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks);
+extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
+extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber nblocks);
+extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
+
+extern void ForgetDatabaseSyncRequests(Oid dbid);
+extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
+
+/* md sync callbacks */
+extern int mdsyncfiletag(const FileTag *ftag, char *path);
+extern int mdunlinkfiletag(const FileTag *ftag, char *path);
+extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate);
+
+#endif /* MD_H */
#include "storage/block.h"
#include "storage/relfilenode.h"
-
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
* cached file handles. An SMgrRelation is created (if not already present)
extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
-extern void smgrpreckpt(void);
-extern void smgrsync(void);
-extern void smgrpostckpt(void);
extern void AtEOXact_SMgr(void);
-
-/* internals: move me elsewhere -- ay 7/94 */
-
-/* in md.c */
-extern void mdinit(void);
-extern void mdclose(SMgrRelation reln, ForkNumber forknum);
-extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
-extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
-extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
-extern void mdextend(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, char *buffer, bool skipFsync);
-extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum);
-extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
- char *buffer);
-extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, char *buffer, bool skipFsync);
-extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, BlockNumber nblocks);
-extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
-extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
- BlockNumber nblocks);
-extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
-extern void mdpreckpt(void);
-extern void mdsync(void);
-extern void mdpostckpt(void);
-
-extern void SetForwardFsyncRequests(void);
-extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
- BlockNumber segno);
-extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
-extern void ForgetDatabaseFsyncRequests(Oid dbid);
-extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
-
#endif /* SMGR_H */
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * sync.h
+ * File synchronization management code.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/sync.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SYNC_H
+#define SYNC_H
+
+#include "storage/relfilenode.h"
+
+/*
+ * Type of sync request. These are used to manage the set of pending
+ * requests to call a sync handler's sync or unlink functions at the next
+ * checkpoint.
+ */
+typedef enum SyncRequestType
+{
+ SYNC_REQUEST, /* schedule a call of sync function */
+ SYNC_UNLINK_REQUEST, /* schedule a call of unlink function */
+ SYNC_FORGET_REQUEST, /* forget all calls for a tag */
+ SYNC_FILTER_REQUEST /* forget all calls satisfying match fn */
+} SyncRequestType;
+
+/*
+ * Which set of functions to use to handle a given request. See the function
+ * table in sync.c.
+ */
+typedef enum SyncRequestHandler
+{
+ SYNC_HANDLER_MD = 0 /* md smgr */
+} SyncRequestHandler;
+
+/*
+ * A tag identifying a file. Currently it has the members required for md.c's
+ * usage, but sync.c has no knowledge of the internal structure, and it is
+ * liable to change as required by future handlers.
+ */
+typedef struct FileTag
+{
+ int16 handler; /* SyncRequestHandler value, saving space */
+ int16 forknum; /* ForkNumber, saving space */
+ RelFileNode rnode;
+ uint32 segno;
+} FileTag;
+
+extern void InitSync(void);
+extern void SyncPreCheckpoint(void);
+extern void SyncPostCheckpoint(void);
+extern void ProcessSyncRequests(void);
+extern void RememberSyncRequest(const FileTag *ftag, SyncRequestType type);
+extern void EnableSyncRequestForwarding(void);
+extern bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
+ bool retryOnError);
+
+#endif /* SYNC_H */
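
The handler value indexes a function table in sync.c, so adding a handler means extending SyncRequestHandler and appending a row of callbacks. A sketch of the table's shape (illustrative; the sync_filetagmatches member name is taken from the filter code above, the other two member names are assumptions):

/*
 * Sketch of sync.c's handler table: one set of callbacks per
 * SyncRequestHandler value.
 */
typedef struct SyncOps
{
	int			(*sync_syncfiletag) (const FileTag *ftag, char *path);
	int			(*sync_unlinkfiletag) (const FileTag *ftag, char *path);
	bool		(*sync_filetagmatches) (const FileTag *ftag,
										const FileTag *candidate);
} SyncOps;

static const SyncOps syncsw[] = {
	/* SYNC_HANDLER_MD: the md storage manager's callbacks from md.h */
	{
		.sync_syncfiletag = mdsyncfiletag,
		.sync_unlinkfiletag = mdunlinkfiletag,
		.sync_filetagmatches = mdfiletagmatches
	}
};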
FileFdwExecutionState
FileFdwPlanState
FileNameMap
+FileTag
FindSplitData
FixedParallelExecutorState
FixedParallelState
PathTarget
Pattern_Prefix_Status
Pattern_Type
-PendingOperationEntry
+PendingFsyncEntry
PendingRelDelete
PendingUnlinkEntry
PendingWriteback
SubscriptionInfo
SubscriptionRelState
Syn
+SyncOps
SyncRepConfigData
+SyncRequestHandler
+SyncRequestType
SysScanDesc
SyscacheCallbackFunction
SystemRowsSamplerData