Add more LOG messages when starting and ending recovery from a backup
authorMichael Paquier <michael@paquier.xyz>
Thu, 25 Jan 2024 08:07:56 +0000 (17:07 +0900)
committerMichael Paquier <michael@paquier.xyz>
Thu, 25 Jan 2024 08:07:56 +0000 (17:07 +0900)
Three LOG messages are added in the recovery code paths, providing
information that can be useful to track corruption issues depending on
the state of the cluster, telling that:
- Recovery has started from a backup_label.
- Recovery is restarting from a backup start LSN, without a
backup_label.
- Recovery has completed from a backup.

Author: Andres Freund
Reviewed-by: David Steele, Laurenz Albe, Michael Paquier
Discussion: https://postgr.es/m/20231117041811.vz4vgkthwjnwp2pp@awork3.anarazel.de

src/backend/access/transam/xlogrecovery.c

index 1b48d7171a49e64bc1b9fec6b24cc520b4c4ff16..0bb472da278407e5a1cba6e4731f1567ef59ba3e 100644 (file)
@@ -603,6 +603,22 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
                if (StandbyModeRequested)
                        EnableStandbyMode();
 
+               /*
+                * Omitting backup_label when creating a new replica, PITR node etc.
+                * unfortunately is a common cause of corruption.  Logging that
+                * backup_label was used makes it a bit easier to exclude that as the
+                * cause of observed corruption.
+                *
+                * Do so before we try to read the checkpoint record (which can fail),
+                * as otherwise it can be hard to understand why a checkpoint other
+                * than ControlFile->checkPoint is used.
+                */
+               ereport(LOG,
+                               (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
+                                               LSN_FORMAT_ARGS(RedoStartLSN),
+                                               LSN_FORMAT_ARGS(CheckPointLoc),
+                                               CheckPointTLI)));
+
                /*
                 * When a backup_label file is present, we want to roll forward from
                 * the checkpoint it identifies, rather than using pg_control.
@@ -742,6 +758,16 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
                                EnableStandbyMode();
                }
 
+               /*
+                * For the same reason as when starting up with backup_label present,
+                * emit a log message when we continue initializing from a base
+                * backup.
+                */
+               if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
+                       ereport(LOG,
+                                       (errmsg("restarting backup recovery with redo LSN %X/%X",
+                                                       LSN_FORMAT_ARGS(ControlFile->backupStartPoint))));
+
                /* Get the last valid checkpoint record. */
                CheckPointLoc = ControlFile->checkPoint;
                CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
@@ -2155,6 +2181,9 @@ CheckRecoveryConsistency(void)
        if (!XLogRecPtrIsInvalid(backupEndPoint) &&
                backupEndPoint <= lastReplayedEndRecPtr)
        {
+               XLogRecPtr      saveBackupStartPoint = backupStartPoint;
+               XLogRecPtr      saveBackupEndPoint = backupEndPoint;
+
                elog(DEBUG1, "end of backup reached");
 
                /*
@@ -2165,6 +2194,11 @@ CheckRecoveryConsistency(void)
                backupStartPoint = InvalidXLogRecPtr;
                backupEndPoint = InvalidXLogRecPtr;
                backupEndRequired = false;
+
+               ereport(LOG,
+                               (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
+                                               LSN_FORMAT_ARGS(saveBackupStartPoint),
+                                               LSN_FORMAT_ARGS(saveBackupEndPoint))));
        }
 
        /*