From f4fd26775b8b6e7090774bb3d794b8529771fce5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 23 Jan 2009 14:31:41 +0200 Subject: [PATCH] Import Simon's recovery infrastructure patch v9 --- src/backend/access/transam/clog.c | 3 + src/backend/access/transam/multixact.c | 14 +- src/backend/access/transam/subtrans.c | 3 + src/backend/access/transam/xact.c | 3 + src/backend/access/transam/xlog.c | 723 ++++++++++++++++++------ src/backend/postmaster/bgwriter.c | 420 ++++++++++---- src/backend/postmaster/postmaster.c | 62 +- src/backend/storage/buffer/README | 9 + src/bin/pg_controldata/pg_controldata.c | 3 + src/bin/pg_resetxlog/pg_resetxlog.c | 2 + src/include/access/xlog.h | 14 +- src/include/access/xlog_internal.h | 4 + src/include/catalog/pg_control.h | 5 +- src/include/postmaster/bgwriter.h | 6 + src/include/storage/pmsignal.h | 1 + 15 files changed, 959 insertions(+), 313 deletions(-) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 528a219db4..5bd72154c5 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -475,6 +475,9 @@ ZeroCLOGPage(int pageno, bool writeXlog) /* * This must be called ONCE during postmaster or standalone-backend startup, * after StartupXLOG has initialized ShmemVariableCache->nextXid. + * + * We access just a single clog page, so this action is atomic and safe + * for use if other processes are active during recovery. */ void StartupCLOG(void) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 7314341101..881a588d69 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -1413,8 +1413,11 @@ ZeroMultiXactMemberPage(int pageno, bool writeXlog) * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we * may already have replayed WAL data into the SLRU files. 
* - * We don't need any locks here, really; the SLRU locks are taken - * only because slru.c expects to be called with locks held. + * We want this operation to be atomic to ensure that other processes can + * use MultiXact while we complete recovery. We access one page only from the + * offset and members buffers, so once locks are acquired they will not be + * dropped and re-acquired by SLRU code. So we take both locks at start, then + * hold them all the way to the end. */ void StartupMultiXact(void) @@ -1426,6 +1429,7 @@ StartupMultiXact(void) /* Clean up offsets state */ LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); + LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); /* * Initialize our idea of the latest page number. @@ -1452,10 +1456,7 @@ StartupMultiXact(void) MultiXactOffsetCtl->shared->page_dirty[slotno] = true; } - LWLockRelease(MultiXactOffsetControlLock); - /* And the same for members */ - LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); /* * Initialize our idea of the latest page number. @@ -1483,6 +1484,7 @@ StartupMultiXact(void) } LWLockRelease(MultiXactMemberControlLock); + LWLockRelease(MultiXactOffsetControlLock); /* * Initialize lastTruncationPoint to invalid, ensuring that the first @@ -1543,7 +1545,7 @@ CheckPointMultiXact(void) * SimpleLruTruncate would get confused. It seems best not to risk * removing any data during recovery anyway, so don't truncate. */ - if (!InRecovery) + if (!IsRecoveryProcessingMode()) TruncateMultiXact(); TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 0dbd2166be..eaad23182a 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -226,6 +226,9 @@ ZeroSUBTRANSPage(int pageno) * * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid * if there are none. 
+ * + * Note that this is not atomic and is not yet safe to perform while other + * processes might access subtrans. */ void StartupSUBTRANS(TransactionId oldestActiveXID) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index c94e2a2251..d0ed3c0318 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -394,6 +394,9 @@ AssignTransactionId(TransactionState s) bool isSubXact = (s->parent != NULL); ResourceOwner currentOwner; + if (IsRecoveryProcessingMode()) + elog(FATAL, "cannot assign TransactionIds during recovery"); + /* Assert that caller didn't screw up */ Assert(!TransactionIdIsValid(s->transactionId)); Assert(s->state == TRANS_INPROGRESS); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index bd6035d4a6..7e480e2fb2 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -115,7 +115,8 @@ CheckpointStatsData CheckpointStats; /* * ThisTimeLineID will be same in all backends --- it identifies current - * WAL timeline for the database system. + * WAL timeline for the database system. Zero is always a bug, so we + * start with that to allow us to spot any errors. */ TimeLineID ThisTimeLineID = 0; @@ -125,6 +126,10 @@ bool InRecovery = false; /* Are we recovering using offline XLOG archives? */ static bool InArchiveRecovery = false; +/* Local copy of shared RecoveryProcessingMode state */ +static bool LocalRecoveryProcessingMode = true; +static bool knownProcessingMode = false; + /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; @@ -143,6 +148,9 @@ static TransactionId recoveryStopXid; static TimestampTz recoveryStopTime; static bool recoveryStopAfter; +/* is the database proven consistent yet? */ +bool reachedSafeStartPoint = false; + /* * During normal operation, the only timeline we care about is ThisTimeLineID. * During recovery, however, things are more complicated. 
To simplify life @@ -242,10 +250,30 @@ static XLogRecPtr RedoRecPtr; * ControlFileLock: must be held to read/update control file or create * new log file. * - * CheckpointLock: must be held to do a checkpoint (ensures only one - * checkpointer at a time; currently, with all checkpoints done by the - * bgwriter, this is just pro forma). + * CheckpointLock: must be held to do a checkpoint or restartpoint, ensuring + * we get just one of those at any time. In 8.4+ recovery, both startup and + * bgwriter processes may take restartpoints, so this locking must be strict + * to ensure there are no mistakes. + * + * In 8.4 we progress through a number of states at startup. Initially, the + * postmaster is in PM_STARTUP state and spawns the Startup process. We then + * progress until the database is in a consistent state, then if we are in + * InArchiveRecovery we go into PM_RECOVERY state. The bgwriter then starts + * up and takes over responsibility for performing restartpoints. We then + * progress until the end of recovery when we enter PM_RUN state upon + * termination of the Startup process. In summary: + * + * PM_STARTUP state: Startup process performs restartpoints + * PM_RECOVERY state: bgwriter process performs restartpoints + * PM_RUN state: bgwriter process performs checkpoints * + * These transitions are fairly delicate, with many things that need to + * happen at the same time in order to change state successfully throughout + * the system. Changing PM_STARTUP to PM_RECOVERY only occurs when we can + * prove the databases are in a consistent state. Changing from PM_RECOVERY + * to PM_RUN happens whenever recovery ends, which could be forced upon us + * externally or it can occur because of damage or termination of the WAL + * sequence. *---------- */ @@ -287,11 +315,18 @@ typedef struct XLogCtlWrite /* * Total shared-memory state for XLOG. 
+ * + * This small structure is accessed by many backends, so we take care to + * pad out the parts of the structure so they can be accessed by separate + * CPUs without causing false sharing cache flushes. Padding is generous + * to allow for a wide variety of CPU architectures. */ +#define XLOGCTL_BUFFER_SPACING 128 typedef struct XLogCtlData { /* Protected by WALInsertLock: */ XLogCtlInsert Insert; + char InsertPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlInsert)]; /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; @@ -299,9 +334,16 @@ typedef struct XLogCtlData uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */ TransactionId ckptXid; XLogRecPtr asyncCommitLSN; /* LSN of newest async commit */ + /* add data structure padding for above info_lck declarations */ + char InfoPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogwrtRqst) + - sizeof(XLogwrtResult) + - sizeof(uint32) + - sizeof(TransactionId) + - sizeof(XLogRecPtr)]; /* Protected by WALWriteLock: */ XLogCtlWrite Write; + char WritePadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlWrite)]; /* * These values do not change after startup, although the pointed-to pages @@ -313,6 +355,24 @@ typedef struct XLogCtlData int XLogCacheBlck; /* highest allocated xlog buffer index */ TimeLineID ThisTimeLineID; + /* + * IsRecoveryProcessingMode shows whether the postmaster is in a + * postmaster state earlier than PM_RUN, or not. This is a globally + * accessible state to allow EXEC_BACKEND case. + * + * We also retain a local state variable InRecovery. InRecovery=true + * means the code is being executed by Startup process and therefore + * always during Recovery Processing Mode. This allows us to identify + * code executed *during* Recovery Processing Mode but not necessarily + * by Startup process itself. 
+ * + * Protected by mode_lck + */ + bool SharedRecoveryProcessingMode; + slock_t mode_lck; + + char InfoLockPadding[XLOGCTL_BUFFER_SPACING]; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -399,8 +459,10 @@ static void XLogArchiveCleanup(const char *xlog); static void readRecoveryCommandFile(void); static void exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg); +static void exitRecovery(void); static bool recoveryStopsHere(XLogRecord *record, bool *includeThis); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); +static XLogRecPtr GetRedoLocationForCheckpoint(void); static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites, XLogRecPtr *lsn, BkpBlock *bkpb); @@ -483,6 +545,11 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) bool updrqst; bool doPageWrites; bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); + bool isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END); + + /* cross-check on whether we should be here or not */ + if (IsRecoveryProcessingMode() && !isRecoveryEnd) + elog(FATAL, "cannot make new WAL entries during recovery"); /* info's high bits are reserved for use by me */ if (info & XLR_INFO_MASK) @@ -1729,8 +1796,7 @@ XLogFlush(XLogRecPtr record) XLogRecPtr WriteRqstPtr; XLogwrtRqst WriteRqst; - /* Disabled during REDO */ - if (InRedo) + if (IsRecoveryProcessingMode()) return; /* Quick exit if already known flushed */ @@ -1818,9 +1884,9 @@ XLogFlush(XLogRecPtr record) * the bad page is encountered again during recovery then we would be * unable to restart the database at all! (This scenario has actually * happened in the field several times with 7.1 releases. Note that we - * cannot get here while InRedo is true, but if the bad page is brought in - * and marked dirty during recovery then CreateCheckPoint will try to - * flush it at the end of recovery.) 
+ * cannot get here while IsRecoveryProcessingMode(), but if the bad page is + * brought in and marked dirty during recovery then if a checkpoint were + * performed at the end of recovery it will try to flush it. * * The current approach is to ERROR under normal conditions, but only * WARNING during recovery, so that the system can be brought up even if @@ -1830,7 +1896,7 @@ XLogFlush(XLogRecPtr record) * and so we will not force a restart for a bad LSN on a data page. */ if (XLByteLT(LogwrtResult.Flush, record)) - elog(InRecovery ? WARNING : ERROR, + elog(ERROR, "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", record.xlogid, record.xrecoff, LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff); @@ -2103,7 +2169,8 @@ XLogFileInit(uint32 log, uint32 seg, unlink(tmppath); } - elog(DEBUG2, "done creating and filling new WAL file"); + XLogFileName(tmppath, ThisTimeLineID, log, seg); + elog(DEBUG2, "done creating and filling new WAL file %s", tmppath); /* Set flag to tell caller there was no existent file */ *use_existent = false; @@ -2409,6 +2476,28 @@ XLogFileRead(uint32 log, uint32 seg, int emode) xlogfname); set_ps_display(activitymsg, false); + /* + * Calculate and write out a new safeStartPoint. This defines + * the latest LSN that might appear on-disk while we apply + * the WAL records in this file. If we crash during recovery + * we must reach this point again before we can prove + * database consistency. Not a restartpoint! Restart points + * define where we should start recovery from, if we crash. + */ + if (InArchiveRecovery) + { + uint32 nextLog = log; + uint32 nextSeg = seg; + + NextLogSeg(nextLog, nextSeg); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->minSafeStartPoint.xlogid = nextLog; + ControlFile->minSafeStartPoint.xrecoff = nextSeg * XLogSegSize; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + } + return fd; } if (errno != ENOENT) /* unexpected failure? 
*/ @@ -4283,6 +4372,7 @@ XLOGShmemInit(void) XLogCtl->XLogCacheBlck = XLOGbuffers - 1; XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages); SpinLockInit(&XLogCtl->info_lck); + SpinLockInit(&XLogCtl->mode_lck); /* * If we are not in bootstrap mode, pg_control should already exist. Read @@ -4593,12 +4683,12 @@ readRecoveryCommandFile(void) * does nothing if a recovery_target is not also set */ if (!parse_bool(tok2, &recoveryLogRestartpoints)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("parameter \"log_restartpoints\" requires a Boolean value"))); + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("parameter \"log_restartpoints\" requires a Boolean value"))); ereport(LOG, - (errmsg("log_restartpoints = %s", tok2))); - } + (errmsg("log_restartpoints = %s", tok2))); + } else ereport(FATAL, (errmsg("unrecognized recovery parameter \"%s\"", @@ -4733,15 +4823,13 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg) unlink(recoveryPath); /* ignore any error */ /* - * Rename the config file out of the way, so that we don't accidentally - * re-enter archive recovery mode in a subsequent crash. + * As of 8.4 we no longer rename the recovery.conf file out of the + * way until after we have performed a full checkpoint. This ensures + * that any crash between now and the end of the checkpoint does not + * attempt to restart from a WAL file that is no longer available to us. + * As soon as we remove recovery.conf we lose our recovery_command and + * cannot reaccess WAL files from the archive. 
*/ - unlink(RECOVERY_COMMAND_DONE); - if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE))); ereport(LOG, (errmsg("archive recovery complete"))); @@ -4876,6 +4964,7 @@ StartupXLOG(void) CheckPoint checkPoint; bool wasShutdown; bool reachedStopPoint = false; + bool performedRecovery = false; bool haveBackupLabel = false; XLogRecPtr RecPtr, LastRec, @@ -4888,6 +4977,8 @@ StartupXLOG(void) uint32 freespace; TransactionId oldestActiveXID; + XLogCtl->SharedRecoveryProcessingMode = true; + /* * Read control file and check XLOG status looks valid. * @@ -5108,8 +5199,14 @@ StartupXLOG(void) if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0) ControlFile->minRecoveryPoint = minRecoveryLoc; ControlFile->time = (pg_time_t) time(NULL); + /* No need to hold ControlFileLock yet, we aren't up far enough */ UpdateControlFile(); + /* + * Reset pgstat data, because it may be invalid after recovery. + */ + pgstat_reset_all(); + /* * If there was a backup label file, it's done its job and the info * has now been propagated into pg_control. We must get rid of the @@ -5217,6 +5314,32 @@ StartupXLOG(void) LastRec = ReadRecPtr; + /* + * Have we reached our safe starting point? If so, we can + * signal Postmaster to enter consistent recovery mode. + * + * There are two points in the log we must pass. The first is + * the minRecoveryPoint, which is the LSN at the time the + * base backup was taken that we are about to roll forward from. + * If recovery has ever crashed or was stopped there is + * another point also: minSafeStartPoint, which is the + * latest LSN that recovery could have reached prior to crash. 
+ */ + if (!reachedSafeStartPoint && + XLByteLE(ControlFile->minSafeStartPoint, EndRecPtr) && + XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr)) + { + reachedSafeStartPoint = true; + if (InArchiveRecovery) + { + ereport(LOG, + (errmsg("consistent recovery state reached at %X/%X", + EndRecPtr.xlogid, EndRecPtr.xrecoff))); + if (IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_RECOVERY_START); + } + } + record = ReadRecord(NULL, LOG); } while (record != NULL && recoveryContinue); @@ -5238,6 +5361,7 @@ StartupXLOG(void) /* there are no WAL records following the checkpoint */ ereport(LOG, (errmsg("redo is not required"))); + reachedSafeStartPoint = true; } } @@ -5251,9 +5375,9 @@ StartupXLOG(void) /* * Complain if we did not roll forward far enough to render the backup - * dump consistent. + * dump consistent and start safely. */ - if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint)) + if (InRecovery && !reachedSafeStartPoint) { if (reachedStopPoint) /* stopped because of stop request */ ereport(FATAL, @@ -5375,39 +5499,14 @@ StartupXLOG(void) XLogCheckInvalidPages(); /* - * Reset pgstat data, because it may be invalid after recovery. + * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote + * a shutdown checkpoint here, but we ask bgwriter to do that now. */ - pgstat_reset_all(); + exitRecovery(); - /* - * Perform a checkpoint to update all our recovery activity to disk. - * - * Note that we write a shutdown checkpoint rather than an on-line - * one. This is not particularly critical, but since we may be - * assigning a new TLI, using a shutdown checkpoint allows us to have - * the rule that TLI only changes in shutdown checkpoints, which - * allows some extra error checking in xlog_redo. - */ - CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + performedRecovery = true; } - /* - * Preallocate additional log files, if wanted. - */ - PreallocXlogFiles(EndOfLog); - - /* - * Okay, we're officially UP. 
- */ - InRecovery = false; - - ControlFile->state = DB_IN_PRODUCTION; - ControlFile->time = (pg_time_t) time(NULL); - UpdateControlFile(); - - /* start the archive_timeout timer running */ - XLogCtl->Write.lastSegSwitchTime = ControlFile->time; - /* initialize shared-memory copy of latest checkpoint XID/epoch */ XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch; XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid; @@ -5441,6 +5540,89 @@ StartupXLOG(void) readRecordBuf = NULL; readRecordBufSize = 0; } + + /* + * Prior to 8.4 we wrote a Shutdown Checkpoint at the end of recovery. + * This could add minutes to the startup time, so we want bgwriter + * to perform it. This then frees the Startup process to complete so we can + * allow transactions and WAL inserts. We still write a checkpoint, but + * it will be an online checkpoint. Online checkpoints have a redo + * location that can be prior to the actual checkpoint record. So we want + * to derive that redo location *before* we let anybody else write WAL, + * otherwise we might miss some WAL records if we crash. + */ + if (performedRecovery) + { + XLogRecPtr redo; + + /* + * We must grab the pointer before anybody writes WAL + */ + redo = GetRedoLocationForCheckpoint(); + + /* + * Tell the bgwriter + */ + SetRedoLocationForArchiveCheckpoint(redo); + + /* + * Okay, we can come up now. Allow others to write WAL. + */ + XLogCtl->SharedRecoveryProcessingMode = false; + + /* + * Now request checkpoint + */ + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE); + } + else + { + /* + * No recovery, so let's just get on with it. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_IN_PRODUCTION; + ControlFile->time = (pg_time_t) time(NULL); + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* + * Okay, we're officially UP. 
+ */ + XLogCtl->SharedRecoveryProcessingMode = false; + } + + /* start the archive_timeout timer running */ + XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL); + +} + +/* + * IsRecoveryProcessingMode() + * + * Fast test for whether we're still in recovery or not. We test the shared + * state each time only until we leave recovery mode. After that we never + * look again, relying upon the settings of our local state variables. This + * is designed to avoid the need for a separate initialisation step. + */ +bool +IsRecoveryProcessingMode(void) +{ + if (knownProcessingMode && !LocalRecoveryProcessingMode) + return false; + + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->mode_lck); + LocalRecoveryProcessingMode = XLogCtl->SharedRecoveryProcessingMode; + SpinLockRelease(&xlogctl->mode_lck); + } + + knownProcessingMode = true; + + return LocalRecoveryProcessingMode; } /* @@ -5698,20 +5880,24 @@ ShutdownXLOG(int code, Datum arg) static void LogCheckpointStart(int flags) { - elog(LOG, "checkpoint starting:%s%s%s%s%s%s", - (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", - (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", - (flags & CHECKPOINT_FORCE) ? " force" : "", - (flags & CHECKPOINT_WAIT) ? " wait" : "", - (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "", - (flags & CHECKPOINT_CAUSE_TIME) ? " time" : ""); + if (flags & CHECKPOINT_RESTARTPOINT) + elog(LOG, "restartpoint starting:%s", + (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : ""); + else + elog(LOG, "checkpoint starting:%s%s%s%s%s%s", + (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", + (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FORCE) ? " force" : "", + (flags & CHECKPOINT_WAIT) ? " wait" : "", + (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "", + (flags & CHECKPOINT_CAUSE_TIME) ? " time" : ""); } /* * Log end of a checkpoint. 
*/ static void -LogCheckpointEnd(void) +LogCheckpointEnd(int flags) { long write_secs, sync_secs, @@ -5734,17 +5920,26 @@ LogCheckpointEnd(void) CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); - elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " - "%d transaction log file(s) added, %d removed, %d recycled; " - "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", - CheckpointStats.ckpt_bufs_written, - (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, - CheckpointStats.ckpt_segs_added, - CheckpointStats.ckpt_segs_removed, - CheckpointStats.ckpt_segs_recycled, - write_secs, write_usecs / 1000, - sync_secs, sync_usecs / 1000, - total_secs, total_usecs / 1000); + if (flags & CHECKPOINT_RESTARTPOINT) + elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", + CheckpointStats.ckpt_bufs_written, + (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + write_secs, write_usecs / 1000, + sync_secs, sync_usecs / 1000, + total_secs, total_usecs / 1000); + else + elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " + "%d transaction log file(s) added, %d removed, %d recycled; " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", + CheckpointStats.ckpt_bufs_written, + (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled, + write_secs, write_usecs / 1000, + sync_secs, sync_usecs / 1000, + total_secs, total_usecs / 1000); } /* @@ -5769,17 +5964,16 @@ CreateCheckPoint(int flags) XLogRecPtr recptr; XLogCtlInsert *Insert = &XLogCtl->Insert; XLogRecData rdata; - uint32 freespace; uint32 _logId; uint32 _logSeg; TransactionId *inCommitXids; int nInCommit; + bool leavingArchiveRecovery = false; /* * Acquire CheckpointLock to ensure only one checkpoint happens at a time. 
- * (This is just pro forma, since in the present system structure there is - * only one process that is allowed to issue checkpoints at any given - * time.) + * That shouldn't be happening, but checkpoints are an important aspect + * of our resilience, so we take no chances. */ LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); @@ -5793,6 +5987,13 @@ CreateCheckPoint(int flags) MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + /* + * Find out if this is the first checkpoint after archive recovery. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + leavingArchiveRecovery = (ControlFile->state == DB_IN_ARCHIVE_RECOVERY); + LWLockRelease(ControlFileLock); + /* * Use a critical section to force system panic if we have trouble. */ @@ -5800,9 +6001,11 @@ CreateCheckPoint(int flags) if (shutdown) { + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->state = DB_SHUTDOWNING; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); } /* @@ -5817,90 +6020,68 @@ CreateCheckPoint(int flags) checkPoint.ThisTimeLineID = ThisTimeLineID; checkPoint.time = (pg_time_t) time(NULL); - /* - * We must hold WALInsertLock while examining insert state to determine - * the checkpoint REDO pointer. - */ - LWLockAcquire(WALInsertLock, LW_EXCLUSIVE); + if (leavingArchiveRecovery) + checkPoint.redo = GetRedoLocationForArchiveCheckpoint(); + else + { + /* + * We must hold WALInsertLock while examining insert state to determine + * the checkpoint REDO pointer. + */ + LWLockAcquire(WALInsertLock, LW_EXCLUSIVE); - /* - * If this isn't a shutdown or forced checkpoint, and we have not inserted - * any XLOG records since the start of the last checkpoint, skip the - * checkpoint. The idea here is to avoid inserting duplicate checkpoints - * when the system is idle. 
That wastes log space, and more importantly it - * exposes us to possible loss of both current and previous checkpoint - * records if the machine crashes just as we're writing the update. - * (Perhaps it'd make even more sense to checkpoint only when the previous - * checkpoint record is in a different xlog page?) - * - * We have to make two tests to determine that nothing has happened since - * the start of the last checkpoint: current insertion point must match - * the end of the last checkpoint record, and its redo pointer must point - * to itself. - */ - if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0) - { - XLogRecPtr curInsert; - - INSERT_RECPTR(curInsert, Insert, Insert->curridx); - if (curInsert.xlogid == ControlFile->checkPoint.xlogid && - curInsert.xrecoff == ControlFile->checkPoint.xrecoff + - MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) && - ControlFile->checkPoint.xlogid == - ControlFile->checkPointCopy.redo.xlogid && - ControlFile->checkPoint.xrecoff == - ControlFile->checkPointCopy.redo.xrecoff) + /* + * If this isn't a shutdown or forced checkpoint, and we have not inserted + * any XLOG records since the start of the last checkpoint, skip the + * checkpoint. The idea here is to avoid inserting duplicate checkpoints + * when the system is idle. That wastes log space, and more importantly it + * exposes us to possible loss of both current and previous checkpoint + * records if the machine crashes just as we're writing the update. + * (Perhaps it'd make even more sense to checkpoint only when the previous + * checkpoint record is in a different xlog page?) + * + * We have to make two tests to determine that nothing has happened since + * the start of the last checkpoint: current insertion point must match + * the end of the last checkpoint record, and its redo pointer must point + * to itself. 
+ */ + if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0) { - LWLockRelease(WALInsertLock); - LWLockRelease(CheckpointLock); - END_CRIT_SECTION(); - return; + XLogRecPtr curInsert; + + INSERT_RECPTR(curInsert, Insert, Insert->curridx); + if (curInsert.xlogid == ControlFile->checkPoint.xlogid && + curInsert.xrecoff == ControlFile->checkPoint.xrecoff + + MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) && + ControlFile->checkPoint.xlogid == + ControlFile->checkPointCopy.redo.xlogid && + ControlFile->checkPoint.xrecoff == + ControlFile->checkPointCopy.redo.xrecoff) + { + LWLockRelease(WALInsertLock); + LWLockRelease(CheckpointLock); + END_CRIT_SECTION(); + return; + } } - } - - /* - * Compute new REDO record ptr = location of next XLOG record. - * - * NB: this is NOT necessarily where the checkpoint record itself will be, - * since other backends may insert more XLOG records while we're off doing - * the buffer flush work. Those XLOG records are logically after the - * checkpoint, even though physically before it. Got that? - */ - freespace = INSERT_FREESPACE(Insert); - if (freespace < SizeOfXLogRecord) - { - (void) AdvanceXLInsertBuffer(false); - /* OK to ignore update return flag, since we will do flush anyway */ - freespace = INSERT_FREESPACE(Insert); - } - INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx); - /* - * Here we update the shared RedoRecPtr for future XLogInsert calls; this - * must be done while holding the insert lock AND the info_lck. - * - * Note: if we fail to complete the checkpoint, RedoRecPtr will be left - * pointing past where it really needs to point. This is okay; the only - * consequence is that XLogInsert might back up whole buffers that it - * didn't really need to. We can't postpone advancing RedoRecPtr because - * XLogInserts that happen while we are dumping buffers must assume that - * their buffer changes are not included in the checkpoint. 
- */ - { - /* use volatile pointer to prevent code rearrangement */ - volatile XLogCtlData *xlogctl = XLogCtl; + /* + * Compute new REDO record ptr = location of next XLOG record. + * + * NB: this is NOT necessarily where the checkpoint record itself will be, + * since other backends may insert more XLOG records while we're off doing + * the buffer flush work. Those XLOG records are logically after the + * checkpoint, even though physically before it. Got that? + */ + checkPoint.redo = GetRedoLocationForCheckpoint(); - SpinLockAcquire(&xlogctl->info_lck); - RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo; - SpinLockRelease(&xlogctl->info_lck); + /* + * Now we can release WAL insert lock, allowing other xacts to proceed + * while we are flushing disk buffers. + */ + LWLockRelease(WALInsertLock); } - /* - * Now we can release WAL insert lock, allowing other xacts to proceed - * while we are flushing disk buffers. - */ - LWLockRelease(WALInsertLock); - /* * If enabled, log checkpoint start. We postpone this until now so as not * to log anything if we decided to skip the checkpoint. @@ -6010,18 +6191,43 @@ CreateCheckPoint(int flags) XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg); /* - * Update the control file. + * Update the control file. In 8.4, this routine becomes the primary + * point for recording changes of state in the control file at the + * end of recovery. Postmaster state already shows us being in + * normal running mode, but it is only after this point that we + * are completely free of reperforming a recovery if we crash. Note + * that this is executed by bgwriter after the death of Startup process. 
*/ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (shutdown) ControlFile->state = DB_SHUTDOWNED; + else + ControlFile->state = DB_IN_PRODUCTION; + ControlFile->prevCheckPoint = ControlFile->checkPoint; ControlFile->checkPoint = ProcLastRecPtr; ControlFile->checkPointCopy = checkPoint; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); + if (leavingArchiveRecovery) + { + /* + * Rename the config file out of the way, so that we don't accidentally + * re-enter archive recovery mode in a subsequent crash. Prior to + * 8.4 this step was performed at end of exitArchiveRecovery(). + */ + unlink(RECOVERY_COMMAND_DONE); + if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE))); + } + /* Update shared-memory copy of checkpoint XID/epoch */ { /* use volatile pointer to prevent code rearrangement */ @@ -6068,12 +6274,11 @@ CreateCheckPoint(int flags) * in subtrans.c). During recovery, though, we mustn't do this because * StartupSUBTRANS hasn't been called yet. */ - if (!InRecovery) - TruncateSUBTRANS(GetOldestXmin(true, false)); + TruncateSUBTRANS(GetOldestXmin(true, false)); /* All real work is done, but log before releasing lock. */ if (log_checkpoints) - LogCheckpointEnd(); + LogCheckpointEnd(flags); TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written, NBuffers, CheckpointStats.ckpt_segs_added, @@ -6083,6 +6288,51 @@ CreateCheckPoint(int flags) LWLockRelease(CheckpointLock); } +/* + * GetRedoLocationForCheckpoint() + * + * When !IsRecoveryProcessingMode() this must be called while holding + * WALInsertLock(). 
+ */ +static XLogRecPtr +GetRedoLocationForCheckpoint() +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint32 freespace; + XLogRecPtr redo; + + freespace = INSERT_FREESPACE(Insert); + if (freespace < SizeOfXLogRecord) + { + (void) AdvanceXLInsertBuffer(false); + /* OK to ignore update return flag, since we will do flush anyway */ + freespace = INSERT_FREESPACE(Insert); + } + INSERT_RECPTR(redo, Insert, Insert->curridx); + + /* + * Here we update the shared RedoRecPtr for future XLogInsert calls; this + * must be done while holding the insert lock AND the info_lck. + * + * Note: if we fail to complete the checkpoint, RedoRecPtr will be left + * pointing past where it really needs to point. This is okay; the only + * consequence is that XLogInsert might back up whole buffers that it + * didn't really need to. We can't postpone advancing RedoRecPtr because + * XLogInserts that happen while we are dumping buffers must assume that + * their buffer changes are not included in the checkpoint. + */ + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->info_lck); + RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo; + SpinLockRelease(&xlogctl->info_lck); + } + + return redo; +} + /* * Flush all data in shared memory to disk, and fsync * @@ -6147,29 +6397,69 @@ RecoveryRestartPoint(const CheckPoint *checkPoint) } } + RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStartPoint); +} + +/* + * As of 8.4, RestartPoints are always created by the bgwriter + * once we have reachedSafeStartPoint. We use bgwriter's shared memory + * area wherever we call it from, to keep better code structure. + */ +void +CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int flags) +{ + if (recoveryLogRestartpoints) + { + /* + * Prepare to accumulate statistics. 
+ */ + + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + LogCheckpointStart(CHECKPOINT_RESTARTPOINT | flags); + } + /* - * OK, force data out to disk + * Acquire CheckpointLock to ensure only one restartpoint happens at a time. + * We rely on this lock to ensure that the startup process doesn't exit + * Recovery while we are half way through a restartpoint. */ - CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE); + LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); + + CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT | flags); /* - * Update pg_control so that any subsequent crash will restart from this - * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint - * record itself. + * Update pg_control, using current time */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->prevCheckPoint = ControlFile->checkPoint; - ControlFile->checkPoint = ReadRecPtr; - ControlFile->checkPointCopy = *checkPoint; + ControlFile->checkPoint = ReadPtr; + ControlFile->checkPointCopy = *restartPoint; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* + * Currently, there is no need to truncate pg_subtrans during recovery. + * If we did do that, we will need to have called StartupSUBTRANS() + * already and then TruncateSUBTRANS() would go here. + */ + + /* All real work is done, but log before releasing lock. */ + if (recoveryLogRestartpoints) + LogCheckpointEnd(CHECKPOINT_RESTARTPOINT); ereport((recoveryLogRestartpoints ? LOG : DEBUG2), (errmsg("recovery restart point at %X/%X", - checkPoint->redo.xlogid, checkPoint->redo.xrecoff))); + restartPoint->redo.xlogid, restartPoint->redo.xrecoff))); + if (recoveryLastXTime) ereport((recoveryLogRestartpoints ? 
LOG : DEBUG2), - (errmsg("last completed transaction was at log time %s", - timestamptz_to_str(recoveryLastXTime)))); + (errmsg("last completed transaction was at log time %s", + timestamptz_to_str(recoveryLastXTime)))); + + LWLockRelease(CheckpointLock); } /* @@ -6234,7 +6524,63 @@ RequestXLogSwitch(void) } /* - * XLOG resource manager's routines + * exitRecovery() + * + * Exit recovery state and write an XLOG_RECOVERY_END record. This is the + * only record type that can record a change of timelineID. We assume + * caller has already set ThisTimeLineID, if appropriate. + */ +static void +exitRecovery(void) +{ + XLogRecData rdata; + + rdata.buffer = InvalidBuffer; + rdata.data = (char *) (&ThisTimeLineID); + rdata.len = sizeof(TimeLineID); + rdata.next = NULL; + + /* + * If a restartpoint is in progress, we will not be able to successfully + * acquire CheckpointLock. If bgwriter is still in progress then send + * a second signal to nudge bgwriter to go faster so we can avoid delay. + * Then wait for lock, so we know the restartpoint has completed. We do + * this because we don't want to interrupt the restartpoint half way + * through, which might leave us in a mess and we want to be robust. We're + * going to checkpoint soon anyway, so it's not wasted effort. + */ + if (LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE)) + LWLockRelease(CheckpointLock); + else + { + RequestRestartPointCompletion(); + ereport(LOG, + (errmsg("startup process waiting for restartpoint to complete"))); + LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); + LWLockRelease(CheckpointLock); + } + + /* + * This is the only type of WAL message that can be inserted during + * recovery. This ensures that we don't allow others to get access + * until after we have changed state. + */ + (void) XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata); + + /* + * We don't XLogFlush() here otherwise we'll end up zeroing the WAL + * file ourselves.
So just let bgwriter's forthcoming checkpoint do + * that for us. + */ + + InRecovery = false; +} + +/* + * XLOG resource manager's routines. + * + * Definitions of message info are in include/catalog/pg_control.h, + * though not all messages relate to control file processing. */ void xlog_redo(XLogRecPtr lsn, XLogRecord *record) @@ -6272,21 +6618,38 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; /* - * TLI may change in a shutdown checkpoint, but it shouldn't decrease + * TLI no longer changes at shutdown checkpoint, since as of 8.4, + * shutdown checkpoints only occur at shutdown. Much less confusing. */ - if (checkPoint.ThisTimeLineID != ThisTimeLineID) + + RecoveryRestartPoint(&checkPoint); + } + else if (info == XLOG_RECOVERY_END) + { + TimeLineID tli; + + memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID)); + + /* + * TLI may change when recovery ends, but it shouldn't decrease. + * + * This is the only WAL record that can tell us to change timelineID + * while we process WAL records. + * + * We can *choose* to stop recovery at any point, generating a + * new timelineID which is recorded using this record type. 
+ */ + if (tli != ThisTimeLineID) { - if (checkPoint.ThisTimeLineID < ThisTimeLineID || + if (tli < ThisTimeLineID || !list_member_int(expectedTLIs, - (int) checkPoint.ThisTimeLineID)) + (int) tli)) ereport(PANIC, - (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", - checkPoint.ThisTimeLineID, ThisTimeLineID))); + (errmsg("unexpected timeline ID %u (after %u) at recovery end record", + tli, ThisTimeLineID))); /* Following WAL records should be run with new TLI */ - ThisTimeLineID = checkPoint.ThisTimeLineID; + ThisTimeLineID = tli; } - - RecoveryRestartPoint(&checkPoint); } else if (info == XLOG_CHECKPOINT_ONLINE) { @@ -6309,7 +6672,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch; ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; - /* TLI should not change in an on-line checkpoint */ + /* TLI must not change at a checkpoint */ if (checkPoint.ThisTimeLineID != ThisTimeLineID) ereport(PANIC, (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record", diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 6a0cd4eebf..3163fd3c1b 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -49,6 +49,7 @@ #include #include "access/xlog_internal.h" +#include "catalog/pg_control.h" #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" @@ -129,6 +130,13 @@ typedef struct int ckpt_flags; /* checkpoint flags, as defined in xlog.h */ + /* + * When the Startup process wants bgwriter to perform a restartpoint, it + * sets these fields so that we can update the control file afterwards. 
+ */ + XLogRecPtr ReadPtr; /* Requested log pointer */ + CheckPoint restartPoint; /* restartPoint data for ControlFile */ + uint32 num_backend_writes; /* counts non-bgwriter buffer writes */ int num_requests; /* current # of requests */ @@ -165,7 +173,7 @@ static bool ckpt_active = false; /* these values are valid when ckpt_active is true: */ static pg_time_t ckpt_start_time; -static XLogRecPtr ckpt_start_recptr; +static XLogRecPtr ckpt_start_recptr; /* not used if IsRecoveryProcessingMode */ static double ckpt_cached_elapsed; static pg_time_t last_checkpoint_time; @@ -197,6 +205,7 @@ BackgroundWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; + bool BgWriterRecoveryMode; BgWriterShmem->bgwriter_pid = MyProcPid; am_bg_writer = true; @@ -355,16 +364,17 @@ BackgroundWriterMain(void) */ PG_SETMASK(&UnBlockSig); + BgWriterRecoveryMode = IsRecoveryProcessingMode(); + + if (BgWriterRecoveryMode) + elog(DEBUG1, "bgwriter starting during recovery, pid = %u", + BgWriterShmem->bgwriter_pid); + /* * Loop forever */ for (;;) { - bool do_checkpoint = false; - int flags = 0; - pg_time_t now; - int elapsed_secs; - /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. @@ -382,118 +392,204 @@ BackgroundWriterMain(void) got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } - if (checkpoint_requested) - { - checkpoint_requested = false; - do_checkpoint = true; - BgWriterStats.m_requested_checkpoints++; - } - if (shutdown_requested) - { - /* - * From here on, elog(ERROR) should end with exit(1), not send - * control back to the sigsetjmp block above - */ - ExitOnAnyError = true; - /* Close down the database */ - ShutdownXLOG(0, 0); - /* Normal exit from the bgwriter is here */ - proc_exit(0); /* done */ - } - /* - * Force a checkpoint if too much time has elapsed since the last one. 
- * Note that we count a timed checkpoint in stats only when this - * occurs without an external request, but we set the CAUSE_TIME flag - * bit even if there is also an external request. - */ - now = (pg_time_t) time(NULL); - elapsed_secs = now - last_checkpoint_time; - if (elapsed_secs >= CheckPointTimeout) - { - if (!do_checkpoint) - BgWriterStats.m_timed_checkpoints++; - do_checkpoint = true; - flags |= CHECKPOINT_CAUSE_TIME; - } - - /* - * Do a checkpoint if requested, otherwise do one cycle of - * dirty-buffer writing. - */ - if (do_checkpoint) - { - /* use volatile pointer to prevent code rearrangement */ - volatile BgWriterShmemStruct *bgs = BgWriterShmem; - - /* - * Atomically fetch the request flags to figure out what kind of a - * checkpoint we should perform, and increase the started-counter - * to acknowledge that we've started a new checkpoint. - */ - SpinLockAcquire(&bgs->ckpt_lck); - flags |= bgs->ckpt_flags; - bgs->ckpt_flags = 0; - bgs->ckpt_started++; - SpinLockRelease(&bgs->ckpt_lck); - - /* - * We will warn if (a) too soon since last checkpoint (whatever - * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag - * since the last checkpoint start. Note in particular that this - * implementation will not generate warnings caused by - * CheckPointTimeout < CheckPointWarning. - */ - if ((flags & CHECKPOINT_CAUSE_XLOG) && - elapsed_secs < CheckPointWarning) - ereport(LOG, - (errmsg("checkpoints are occurring too frequently (%d seconds apart)", - elapsed_secs), - errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); - - /* - * Initialize bgwriter-private variables used during checkpoint. - */ - ckpt_active = true; - ckpt_start_recptr = GetInsertRecPtr(); - ckpt_start_time = now; - ckpt_cached_elapsed = 0; - - /* - * Do the checkpoint. 
- */ - CreateCheckPoint(flags); + if (BgWriterRecoveryMode) + { + if (shutdown_requested) + { + /* + * From here on, elog(ERROR) should end with exit(1), not send + * control back to the sigsetjmp block above + */ + ExitOnAnyError = true; + /* Normal exit from the bgwriter is here */ + proc_exit(0); /* done */ + } + + if (!IsRecoveryProcessingMode()) + { + elog(DEBUG2, "bgwriter changing from recovery to normal mode"); + + InitXLOGAccess(); + BgWriterRecoveryMode = false; + + /* + * Start time-driven events from now + */ + last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); + + /* + * Notice that we do *not* act on a checkpoint_requested + * state at this point. We have changed mode, so we wish to + * perform a checkpoint not a restartpoint. + */ + continue; + } + + if (checkpoint_requested) + { + XLogRecPtr ReadPtr; + CheckPoint restartPoint; + + checkpoint_requested = false; + + /* + * Initialize bgwriter-private variables used during checkpoint. + */ + ckpt_active = true; + ckpt_start_time = (pg_time_t) time(NULL); + ckpt_cached_elapsed = 0; + + /* + * Get the requested values from shared memory that the + * Startup process has put there for us. + */ + SpinLockAcquire(&BgWriterShmem->ckpt_lck); + ReadPtr = BgWriterShmem->ReadPtr; + memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint)); + SpinLockRelease(&BgWriterShmem->ckpt_lck); + + /* Use smoothed writes, until interrupted if ever */ + CreateRestartPoint(ReadPtr, &restartPoint, 0); + + /* + * After any checkpoint, close all smgr files. This is so we + * won't hang onto smgr references to deleted files indefinitely. + */ + smgrcloseall(); + + ckpt_active = false; + checkpoint_requested = false; + } + else + { + /* Clean buffers dirtied by recovery */ + BgBufferSync(); + + /* Nap for the configured time. 
*/ + BgWriterNap(); + } + } + else /* Normal processing */ + { + bool do_checkpoint = false; + int flags = 0; + pg_time_t now; + int elapsed_secs; + + if (checkpoint_requested) + { + checkpoint_requested = false; + do_checkpoint = true; + BgWriterStats.m_requested_checkpoints++; + } + if (shutdown_requested) + { + /* + * From here on, elog(ERROR) should end with exit(1), not send + * control back to the sigsetjmp block above + */ + ExitOnAnyError = true; + /* Close down the database */ + ShutdownXLOG(0, 0); + /* Normal exit from the bgwriter is here */ + proc_exit(0); /* done */ + } /* - * After any checkpoint, close all smgr files. This is so we - * won't hang onto smgr references to deleted files indefinitely. + * Force a checkpoint if too much time has elapsed since the last one. + * Note that we count a timed checkpoint in stats only when this + * occurs without an external request, but we set the CAUSE_TIME flag + * bit even if there is also an external request. */ - smgrcloseall(); + now = (pg_time_t) time(NULL); + elapsed_secs = now - last_checkpoint_time; + if (elapsed_secs >= CheckPointTimeout) + { + if (!do_checkpoint) + BgWriterStats.m_timed_checkpoints++; + do_checkpoint = true; + flags |= CHECKPOINT_CAUSE_TIME; + } /* - * Indicate checkpoint completion to any waiting backends. + * Do a checkpoint if requested, otherwise do one cycle of + * dirty-buffer writing. */ - SpinLockAcquire(&bgs->ckpt_lck); - bgs->ckpt_done = bgs->ckpt_started; - SpinLockRelease(&bgs->ckpt_lck); + if (do_checkpoint) + { + /* use volatile pointer to prevent code rearrangement */ + volatile BgWriterShmemStruct *bgs = BgWriterShmem; + + /* + * Atomically fetch the request flags to figure out what kind of a + * checkpoint we should perform, and increase the started-counter + * to acknowledge that we've started a new checkpoint. 
+ */ + SpinLockAcquire(&bgs->ckpt_lck); + flags |= bgs->ckpt_flags; + bgs->ckpt_flags = 0; + bgs->ckpt_started++; + SpinLockRelease(&bgs->ckpt_lck); + + /* + * We will warn if (a) too soon since last checkpoint (whatever + * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag + * since the last checkpoint start. Note in particular that this + * implementation will not generate warnings caused by + * CheckPointTimeout < CheckPointWarning. + */ + if ((flags & CHECKPOINT_CAUSE_XLOG) && + elapsed_secs < CheckPointWarning) + ereport(LOG, + (errmsg("checkpoints are occurring too frequently (%d seconds apart)", + elapsed_secs), + errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); + + /* + * Initialize bgwriter-private variables used during checkpoint. + */ + ckpt_active = true; + ckpt_start_recptr = GetInsertRecPtr(); + ckpt_start_time = now; + ckpt_cached_elapsed = 0; + + /* + * Do the checkpoint. + */ + CreateCheckPoint(flags); + + /* + * After any checkpoint, close all smgr files. This is so we + * won't hang onto smgr references to deleted files indefinitely. + */ + smgrcloseall(); + + /* + * Indicate checkpoint completion to any waiting backends. + */ + SpinLockAcquire(&bgs->ckpt_lck); + bgs->ckpt_done = bgs->ckpt_started; + SpinLockRelease(&bgs->ckpt_lck); + + ckpt_active = false; + + /* + * Note we record the checkpoint start time not end time as + * last_checkpoint_time. This is so that time-driven checkpoints + * happen at a predictable spacing. + */ + last_checkpoint_time = now; + } + else + BgBufferSync(); - ckpt_active = false; + /* Check for archive_timeout and switch xlog files if necessary. */ + CheckArchiveTimeout(); - /* - * Note we record the checkpoint start time not end time as - * last_checkpoint_time. This is so that time-driven checkpoints - * happen at a predictable spacing. - */ - last_checkpoint_time = now; + /* Nap for the configured time. 
*/ + BgWriterNap(); } - else - BgBufferSync(); - - /* Check for archive_timeout and switch xlog files if necessary. */ - CheckArchiveTimeout(); - - /* Nap for the configured time. */ - BgWriterNap(); } } @@ -586,7 +682,8 @@ BgWriterNap(void) (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)) break; pg_usleep(1000000L); - AbsorbFsyncRequests(); + if (!IsRecoveryProcessingMode()) + AbsorbFsyncRequests(); udelay -= 1000000L; } @@ -640,6 +737,19 @@ CheckpointWriteDelay(int flags, double progress) if (!am_bg_writer) return; + /* Perform minimal duties during recovery and skip wait if requested */ + if (IsRecoveryProcessingMode()) + { + BgBufferSync(); + + if (!shutdown_requested && + !checkpoint_requested && + IsCheckpointOnSchedule(progress)) + BgWriterNap(); + + return; + } + /* * Perform the usual bgwriter duties and take a nap, unless we're behind * schedule, in which case we just try to catch up as quickly as possible. @@ -714,16 +824,19 @@ IsCheckpointOnSchedule(double progress) * However, it's good enough for our purposes, we're only calculating an * estimate anyway. 
*/ - recptr = GetInsertRecPtr(); - elapsed_xlogs = - (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + - ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / - CheckPointSegments; - - if (progress < elapsed_xlogs) + if (!IsRecoveryProcessingMode()) { - ckpt_cached_elapsed = elapsed_xlogs; - return false; + recptr = GetInsertRecPtr(); + elapsed_xlogs = + (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + + ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / + CheckPointSegments; + + if (progress < elapsed_xlogs) + { + ckpt_cached_elapsed = elapsed_xlogs; + return false; + } } /* @@ -988,6 +1101,77 @@ RequestCheckpoint(int flags) } } +/* + * Always runs in Startup process (see xlog.c) + */ +void +RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter) +{ + /* + * Should we just do it ourselves? + */ + if (!IsPostmasterEnvironment || !sendToBGWriter) + { + CreateRestartPoint(ReadPtr, restartPoint, CHECKPOINT_IMMEDIATE); + return; + } + + /* + * Push requested values into shared memory, then signal to request restartpoint. 
+ */ + if (BgWriterShmem->bgwriter_pid == 0) + elog(LOG, "could not request restartpoint because bgwriter not running"); + +#ifdef NOT_USED + elog(LOG, "tli = %u nextXidEpoch = %u nextXid = %u nextOid = %u", + restartPoint->ThisTimeLineID, + restartPoint->nextXidEpoch, + restartPoint->nextXid, + restartPoint->nextOid); +#endif + + SpinLockAcquire(&BgWriterShmem->ckpt_lck); + BgWriterShmem->ReadPtr = ReadPtr; + memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint)); + SpinLockRelease(&BgWriterShmem->ckpt_lck); + + if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0) + elog(LOG, "could not signal for restartpoint: %m"); +} + +/* + * Sends another checkpoint request signal to bgwriter, which causes it + * to avoid smoothed writes and continue processing as if it had been + * called with CHECKPOINT_IMMEDIATE. This is used at the end of recovery. + */ +void +RequestRestartPointCompletion(void) +{ + if (BgWriterShmem->bgwriter_pid != 0 && + kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0) + elog(LOG, "could not signal for restartpoint immediate: %m"); +} + +XLogRecPtr +GetRedoLocationForArchiveCheckpoint(void) +{ + XLogRecPtr redo; + + SpinLockAcquire(&BgWriterShmem->ckpt_lck); + redo = BgWriterShmem->ReadPtr; + SpinLockRelease(&BgWriterShmem->ckpt_lck); + + return redo; +} + +void +SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo) +{ + SpinLockAcquire(&BgWriterShmem->ckpt_lck); + BgWriterShmem->ReadPtr = redo; + SpinLockRelease(&BgWriterShmem->ckpt_lck); +} + /* * ForwardFsyncRequest * Forward a file-fsync request from a backend to the bgwriter diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 3380b806f6..5cb84be4b8 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -254,6 +254,11 @@ typedef enum { PM_INIT, /* postmaster starting */ PM_STARTUP, /* waiting for startup subprocess */ + PM_RECOVERY, /* consistent recovery mode; state only + * entered for archive and 
streaming recovery, + * and only after the point where + * all data is in a consistent state. + */ PM_RUN, /* normal "database is alive" state */ PM_WAIT_BACKUP, /* waiting for online backup mode to end */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ @@ -1302,7 +1307,7 @@ ServerLoop(void) * state that prevents it, start one. It doesn't matter if this * fails, we'll just try again later. */ - if (BgWriterPID == 0 && pmState == PM_RUN) + if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY)) BgWriterPID = StartBackgroundWriter(); /* @@ -2116,7 +2121,7 @@ reaper(SIGNAL_ARGS) if (pid == StartupPID) { StartupPID = 0; - Assert(pmState == PM_STARTUP); + Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY); /* FATAL exit of startup is treated as catastrophic */ if (!EXIT_STATUS_0(exitstatus)) @@ -2157,11 +2162,11 @@ reaper(SIGNAL_ARGS) load_role(); /* - * Crank up the background writer. It doesn't matter if this - * fails, we'll just try again later. + * Check whether we need to start background writer, if not + * already running. */ - Assert(BgWriterPID == 0); - BgWriterPID = StartBackgroundWriter(); + if (BgWriterPID == 0) + BgWriterPID = StartBackgroundWriter(); /* * Likewise, start other special children as needed. In a restart @@ -3847,6 +3852,51 @@ sigusr1_handler(SIGNAL_ARGS) PG_SETMASK(&BlockSig); + if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START)) + { + Assert(pmState == PM_STARTUP); + + /* + * Go to shutdown mode if a shutdown request was pending. + */ + if (Shutdown > NoShutdown) + { + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + } + else + { + /* + * Startup process has entered recovery + */ + pmState = PM_RECOVERY; + + /* + * Load the flat authorization file into postmaster's cache. The + * startup process won't have recomputed this from the database yet, + * so it may change following recovery. + */ + load_role(); + + /* + * Crank up the background writer.
It doesn't matter if this + * fails, we'll just try again later. + */ + Assert(BgWriterPID == 0); + BgWriterPID = StartBackgroundWriter(); + + /* + * Likewise, start other special children as needed. + */ + Assert(PgStatPID == 0); + PgStatPID = pgstat_start(); + + /* XXX at this point we could accept read-only connections */ + ereport(DEBUG1, + (errmsg("database system is in consistent recovery mode"))); + } + } + if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE)) { /* diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index 62b22bd1db..a7b81e37a7 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -268,3 +268,12 @@ out (and anyone else who flushes buffer contents to disk must do so too). This ensures that the page image transferred to disk is reasonably consistent. We might miss a hint-bit update or two but that isn't a problem, for the same reasons mentioned under buffer access rules. + +As of 8.4, background writer starts during recovery mode when there is +some form of potentially extended recovery to perform. It performs an +identical service to normal processing, except that checkpoints it +writes are technically restartpoints. Flushing outstanding WAL for dirty +buffers is also skipped, though there shouldn't ever be new WAL entries +at that time in any case. We could choose to start background writer +immediately but we hold off until we can prove the database is in a +consistent state so that postmaster has a single, clean state change. 
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 4ea849d7f1..3bba50ab83 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -197,6 +197,9 @@ main(int argc, char *argv[]) printf(_("Minimum recovery ending location: %X/%X\n"), ControlFile.minRecoveryPoint.xlogid, ControlFile.minRecoveryPoint.xrecoff); + printf(_("Minimum safe starting location: %X/%X\n"), + ControlFile.minSafeStartPoint.xlogid, + ControlFile.minSafeStartPoint.xrecoff); printf(_("Maximum data alignment: %u\n"), ControlFile.maxAlign); /* we don't print floatFormat since can't say much useful about it */ diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index 51cdde1145..b20d4bd4dd 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -603,6 +603,8 @@ RewriteControlFile(void) ControlFile.prevCheckPoint.xrecoff = 0; ControlFile.minRecoveryPoint.xlogid = 0; ControlFile.minRecoveryPoint.xrecoff = 0; + ControlFile.minSafeStartPoint.xlogid = 0; + ControlFile.minSafeStartPoint.xrecoff = 0; /* Now we can force the recorded xlog seg size to the right thing. */ ControlFile.xlog_seg_size = XLogSegSize; diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 6913f7c800..cf787c8df6 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -133,7 +133,16 @@ typedef struct XLogRecData } XLogRecData; extern TimeLineID ThisTimeLineID; /* current TLI */ -extern bool InRecovery; + +/* + * Prior to 8.4, all activity during recovery was carried out by the Startup + * process. This local variable continues to be used in many parts of the + * code to indicate actions taken by RecoveryManagers.
Other processes who + * potentially perform work during recovery should check + * IsRecoveryProcessingMode(), see XLogCtl notes in xlog.c + */ +extern bool InRecovery; + extern XLogRecPtr XactLastRecEnd; /* these variables are GUC parameters related to XLOG */ @@ -166,6 +175,7 @@ extern bool XLOG_DEBUG; /* These indicate the cause of a checkpoint request */ #define CHECKPOINT_CAUSE_XLOG 0x0010 /* XLOG consumption */ #define CHECKPOINT_CAUSE_TIME 0x0020 /* Elapsed time */ +#define CHECKPOINT_RESTARTPOINT 0x0040 /* Restartpoint during recovery */ /* Checkpoint statistics */ typedef struct CheckpointStatsData @@ -199,6 +209,8 @@ extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup); extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec); +extern bool IsRecoveryProcessingMode(void); + extern void UpdateControlFile(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 5675bfbcbd..4830a5ce74 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -17,6 +17,7 @@ #define XLOG_INTERNAL_H #include "access/xlog.h" +#include "catalog/pg_control.h" #include "fmgr.h" #include "pgtime.h" #include "storage/block.h" @@ -245,6 +246,9 @@ extern const RmgrData RmgrTable[]; extern pg_time_t GetLastSegSwitchTime(void); extern XLogRecPtr RequestXLogSwitch(void); +extern void CreateRestartPoint(const XLogRecPtr ReadPtr, + const CheckPoint *restartPoint, int flags); + /* * These aren't in xlog.h because I'd rather not include fmgr.h there. 
*/ diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 400f32c749..e69c8ec553 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -21,7 +21,7 @@ /* Version identifier for this pg_control format */ -#define PG_CONTROL_VERSION 843 +#define PG_CONTROL_VERSION 847 /* * Body of CheckPoint XLOG records. This is declared here because we keep @@ -46,7 +46,7 @@ typedef struct CheckPoint #define XLOG_NOOP 0x20 #define XLOG_NEXTOID 0x30 #define XLOG_SWITCH 0x40 - +#define XLOG_RECOVERY_END 0x50 /* System status indicator */ typedef enum DBState @@ -102,6 +102,7 @@ typedef struct ControlFileData CheckPoint checkPointCopy; /* copy of last check point record */ XLogRecPtr minRecoveryPoint; /* must replay xlog to here */ + XLogRecPtr minSafeStartPoint; /* safe point after recovery crashes */ /* * This data is used to check for hardware-architecture compatibility of diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index c1c9d7f580..d4b389e927 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -12,6 +12,7 @@ #ifndef _BGWRITER_H #define _BGWRITER_H +#include "catalog/pg_control.h" #include "storage/block.h" #include "storage/relfilenode.h" @@ -25,6 +26,11 @@ extern double CheckPointCompletionTarget; extern void BackgroundWriterMain(void); extern void RequestCheckpoint(int flags); +extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter); +extern void RequestRestartPointCompletion(void); +extern XLogRecPtr GetRedoLocationForArchiveCheckpoint(void); +extern void SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo); + extern void CheckpointWriteDelay(int flags, double progress); extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index 3101092cbd..1904187bfd 100644 --- 
a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -22,6 +22,7 @@ */ typedef enum { + PMSIGNAL_RECOVERY_START, /* move to PM_RECOVERY state */ PMSIGNAL_PASSWORD_CHANGE, /* pg_auth file has changed */ PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */ PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */ -- 2.39.5