Refactor, so that the system isn't opened for business until startup process
has died.
author Heikki Linnakangas <heikki@enterprisedb.com>
Fri, 20 Feb 2009 10:11:48 +0000 (12:11 +0200)
committer Heikki Linnakangas <heikki@enterprisedb.com>
Fri, 20 Feb 2009 11:30:24 +0000 (13:30 +0200)

src/backend/access/transam/xlog.c
src/backend/postmaster/postmaster.c
src/backend/storage/ipc/ipc.c
src/backend/storage/ipc/pmsignal.c
src/include/storage/pmsignal.h

index 97fb14868a44f9c0960860506f4fb38f62195596..6c0b5f116ebe3989e52494da7faf4fcd7fe6514b 100644 (file)
@@ -7695,7 +7695,7 @@ StartupProcessMain(void)
        BuildFlatFiles(false);
 
        /* Let postmaster know that startup is finished */
-       SendPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED);
+       SetPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED);
 
        /* exit normally */
        proc_exit(0);
index 70d9ca246c4fee02b577782ffa57bf56ec232720..80460d357d3f0eef51bea14eb4af5e5079a35d49 100644 (file)
@@ -227,14 +227,6 @@ static int Shutdown = NoShutdown;
 static bool FatalError = false; /* T if recovering from backend crash */
 static bool RecoveryError = false; /* T if recovery failed */
 
-/* State of WAL redo */
-#define                        NoRecovery                      0
-#define                        RecoveryStarted         1
-#define                        RecoveryConsistent      2
-#define                        RecoveryCompleted       3
-
-static int     RecoveryStatus = NoRecovery;
-
 /*
  * We use a simple state machine to control startup, shutdown, and
  * crash recovery (which is rather like shutdown followed by startup).
@@ -253,9 +245,12 @@ static int RecoveryStatus = NoRecovery;
  * point, if we had the infrastructure to do that.
  *
  * When the WAL redo is finished, the startup process signals us the third
- * time, and we switch to PM_RUN state. The startup process can also skip the
- * recovery and consistent recovery phases altogether, as it will during
- * normal startup when there's no recovery to be done, for example.
+ * time, and exits. We don't process the third signal immediately; when we
+ * see that the startup process has exited, we check that we have
+ * received the signal. If everything is OK, we then switch to PM_RUN state.
+ * The startup process can also skip the recovery and consistent recovery
+ * phases altogether, as it will during normal startup when there's no
+ * recovery to be done, for example.
  *
  * Normal child backends can only be launched when we are in PM_RUN state.
  * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
@@ -338,7 +333,6 @@ static void pmdie(SIGNAL_ARGS);
 static void reaper(SIGNAL_ARGS);
 static void sigusr1_handler(SIGNAL_ARGS);
 static void dummy_handler(SIGNAL_ARGS);
-static void CheckRecoverySignals(void);
 static void CleanupBackend(int pid, int exitstatus);
 static void HandleChildCrash(int pid, int exitstatus, const char *procname);
 static void LogChildExit(int lev, const char *procname,
@@ -2019,7 +2013,8 @@ pmdie(SIGNAL_ARGS)
                        ereport(LOG,
                                        (errmsg("received smart shutdown request")));
 
-                       if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT)
+                       if (pmState == PM_RUN || pmState == PM_RECOVERY ||
+                               pmState == PM_RECOVERY_CONSISTENT)
                        {
                                /* autovacuum workers are told to shut down immediately */
                                SignalAutovacWorkers(SIGTERM);
@@ -2159,23 +2154,24 @@ reaper(SIGNAL_ARGS)
                 */
                if (pid == StartupPID)
                {
+                       bool recoveryCompleted;
+
                        StartupPID = 0;
 
                        /*
-                        * Check if we've received a signal from the startup process
-                        * first. This can change pmState. If the startup process sends
-                        * a signal and exits immediately after that, we might not have
-                        * processed the signal yet. We need to know if it completed
-                        * recovery before it exited.
+                        * Check if the startup process completed recovery before exiting
                         */
-                       CheckRecoverySignals();
+                       if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED))
+                               recoveryCompleted = true;
+                       else
+                               recoveryCompleted = false;
 
                        /*
                         * Unexpected exit of startup process (including FATAL exit)
                         * during PM_STARTUP is treated as catastrophic. There is no
-                        * other processes running yet.
+                        * other processes running yet, so we can just exit.
                         */
-                       if (pmState == PM_STARTUP)
+                       if (pmState == PM_STARTUP && !recoveryCompleted)
                        {
                                LogChildExit(LOG, _("startup process"),
                                                         pid, exitstatus);
@@ -2195,18 +2191,65 @@ reaper(SIGNAL_ARGS)
                                                                 _("startup process"));
                                continue;
                        }
+                       /*
+                        * Startup process exited in response to a shutdown request (or
+                        * it finished normally regardless of the shutdown request).
+                        */
+                       if (Shutdown > NoShutdown)
+                       {
+                               pmState = PM_WAIT_BACKENDS;
+                               /* PostmasterStateMachine logic does the rest */
+                               continue;
+                       }
                        /*
                         * Startup process exited normally, but didn't finish recovery.
                         * This can happen if someone else than postmaster kills the
                         * startup process with SIGTERM. Treat it like a crash.
                         */
-                       if (pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT)
+                       if (!recoveryCompleted)
                        {
                                RecoveryError = true;
                                HandleChildCrash(pid, exitstatus,
                                                                 _("startup process"));
                                continue;
                        }
+
+                       /*
+                        * Startup succeeded, commence normal operations
+                        */
+                       pmState = PM_RUN;
+
+                       /*
+                        * Load the flat authorization file into postmaster's cache. The
+                        * startup process has recomputed this from the database contents,
+                        * so we wait till it finishes before loading it.
+                        */
+                       load_role();
+
+                       /*
+                        * Crank up the background writer, if we didn't do that already
+                        * when we entered consistent recovery phase.  It doesn't matter
+                        * if this fails, we'll just try again later.
+                        */
+                       if (BgWriterPID == 0)
+                               BgWriterPID = StartBackgroundWriter();
+
+                       /*
+                        * Likewise, start other special children as needed.  In a restart
+                        * situation, some of them may be alive already.
+                        */
+                       if (WalWriterPID == 0)
+                               WalWriterPID = StartWalWriter();
+                       if (AutoVacuumingActive() && AutoVacPID == 0)
+                               AutoVacPID = StartAutoVacLauncher();
+                       if (XLogArchivingActive() && PgArchPID == 0)
+                               PgArchPID = pgarch_start();
+                       if (PgStatPID == 0)
+                               PgStatPID = pgstat_start();
+
+                       /* at this point we are really open for business */
+                       ereport(LOG,
+                               (errmsg("database system is ready to accept connections")));
                }
 
                /*
@@ -2622,127 +2665,6 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 static void
 PostmasterStateMachine(void)
 {
-       /* Startup states */
-
-       if (pmState == PM_STARTUP && RecoveryStatus > NoRecovery)
-       {
-               /* WAL redo has started. We're out of reinitialization. */
-               FatalError = false;
-
-               /*
-                * Go to shutdown mode if a shutdown request was pending.
-                */
-               if (Shutdown > NoShutdown)
-               {
-                       pmState = PM_WAIT_BACKENDS;
-                       /* PostmasterStateMachine logic does the rest */
-               }
-               else
-               {
-                       /*
-                        * Crank up the background writer.      It doesn't matter if this
-                        * fails, we'll just try again later.
-                        */
-                       Assert(BgWriterPID == 0);
-                       BgWriterPID = StartBackgroundWriter();
-
-                       pmState = PM_RECOVERY;
-               }
-       }
-       if (pmState == PM_RECOVERY && RecoveryStatus >= RecoveryConsistent)
-       {
-               /*
-                * Go to shutdown mode if a shutdown request was pending.
-                */
-               if (Shutdown > NoShutdown)
-               {
-                       pmState = PM_WAIT_BACKENDS;
-                       /* PostmasterStateMachine logic does the rest */
-               }
-               else
-               {
-                       /*
-                        * Startup process has entered recovery. We consider that good
-                        * enough to reset FatalError.
-                        */
-                       pmState = PM_RECOVERY_CONSISTENT;
-
-                       /*
-                        * Load the flat authorization file into postmaster's cache. The
-                        * startup process won't have recomputed this from the database yet,
-                        * so we it may change following recovery. 
-                        */
-                       load_role();
-
-                       /*
-                        * Likewise, start other special children as needed.
-                        */
-                       Assert(PgStatPID == 0);
-                       PgStatPID = pgstat_start();
-
-                       /* XXX at this point we could accept read-only connections */
-                       ereport(DEBUG1,
-                                (errmsg("database system is in consistent recovery mode")));
-               }
-       }
-       if ((pmState == PM_RECOVERY || 
-                pmState == PM_RECOVERY_CONSISTENT ||
-                pmState == PM_STARTUP) &&
-               RecoveryStatus == RecoveryCompleted)
-       {
-               /*
-                * Startup succeeded.
-                *
-                * Go to shutdown mode if a shutdown request was pending.
-                */
-               if (Shutdown > NoShutdown)
-               {
-                       pmState = PM_WAIT_BACKENDS;
-                       /* PostmasterStateMachine logic does the rest */
-               }
-               else
-               {
-                       /*
-                        * Otherwise, commence normal operations.
-                        */
-                       pmState = PM_RUN;
-
-                       /*
-                        * Load the flat authorization file into postmaster's cache. The
-                        * startup process has recomputed this from the database contents,
-                        * so we wait till it finishes before loading it.
-                        */
-                       load_role();
-
-                       /*
-                        * Crank up the background writer, if we didn't do that already
-                        * when we entered consistent recovery phase.  It doesn't matter
-                        * if this fails, we'll just try again later.
-                        */
-                       if (BgWriterPID == 0)
-                               BgWriterPID = StartBackgroundWriter();
-
-                       /*
-                        * Likewise, start other special children as needed.  In a restart
-                        * situation, some of them may be alive already.
-                        */
-                       if (WalWriterPID == 0)
-                               WalWriterPID = StartWalWriter();
-                       if (AutoVacuumingActive() && AutoVacPID == 0)
-                               AutoVacPID = StartAutoVacLauncher();
-                       if (XLogArchivingActive() && PgArchPID == 0)
-                               PgArchPID = pgarch_start();
-                       if (PgStatPID == 0)
-                               PgStatPID = pgstat_start();
-
-                       /* at this point we are really open for business */
-                       ereport(LOG,
-                               (errmsg("database system is ready to accept connections")));
-               }
-       }
-
-       /* Shutdown states */
-
        if (pmState == PM_WAIT_BACKUP)
        {
                /*
@@ -2904,8 +2826,6 @@ PostmasterStateMachine(void)
                shmem_exit(1);
                reset_shared(PostPortNumber);
 
-               RecoveryStatus = NoRecovery;
-
                StartupPID = StartupDataBase();
                Assert(StartupPID != 0);
                pmState = PM_STARTUP;
@@ -4010,47 +3930,58 @@ ExitPostmaster(int status)
 }
 
 /*
- * common code used in sigusr1_handler() and reaper() to handle
- * recovery-related signals from startup process
+ * sigusr1_handler - handle signal conditions from child processes
  */
 static void
-CheckRecoverySignals(void)
+sigusr1_handler(SIGNAL_ARGS)
 {
-       bool changed = false;
+       int                     save_errno = errno;
 
-       if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED))
-       {
-               Assert(pmState == PM_STARTUP);
+       PG_SETMASK(&BlockSig);
 
-               RecoveryStatus = RecoveryStarted;
-               changed = true;
-       }
-       if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT))
+       /*
+        * RECOVERY_STARTED and RECOVERY_CONSISTENT signals are ignored in
+        * unexpected states. If the startup process quickly starts up, completes
+        * recovery, exits, we might process the death of the startup process
+        * first. We don't want to go back to recovery in that case.
+        */
+       if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED) &&
+               pmState == PM_STARTUP)
        {
-               RecoveryStatus = RecoveryConsistent;
-               changed = true;
+               /* WAL redo has started. We're out of reinitialization. */
+               FatalError = false;
+
+               /*
+                * Crank up the background writer.      It doesn't matter if this
+                * fails, we'll just try again later.
+                */
+               Assert(BgWriterPID == 0);
+               BgWriterPID = StartBackgroundWriter();
+
+               pmState = PM_RECOVERY;
        }
-       if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED))
+       if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT) &&
+               pmState == PM_RECOVERY)
        {
-               RecoveryStatus = RecoveryCompleted;
-               changed = true;
-       }
-
-       if (changed)
-               PostmasterStateMachine();
-}
+               /*
+                * Load the flat authorization file into postmaster's cache. The
+                * startup process won't have recomputed this from the database yet,
+                * so it may change following recovery.
+                */
+               load_role();
 
-/*
- * sigusr1_handler - handle signal conditions from child processes
- */
-static void
-sigusr1_handler(SIGNAL_ARGS)
-{
-       int                     save_errno = errno;
+               /*
+                * Likewise, start other special children as needed.
+                */
+               Assert(PgStatPID == 0);
+               PgStatPID = pgstat_start();
 
-       PG_SETMASK(&BlockSig);
+               /* XXX at this point we could accept read-only connections */
+               ereport(DEBUG1,
+                               (errmsg("database system is in consistent recovery mode")));
 
-       CheckRecoverySignals();
+               pmState = PM_RECOVERY_CONSISTENT;
+       }
 
        if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
        {
index 13623a3c6a4706906b1b09c09ad0f4936816c0f8..50ed3e1dc82cdf088f190dbc0a37d97b81267888 100644 (file)
@@ -95,8 +95,6 @@ proc_exit(int code)
        InterruptHoldoffCount = 1;
        CritSectionCount = 0;
 
-       elog(DEBUG3, "proc_exit(%d)", code);
-
        /* do our shared memory exits first */
        shmem_exit(code);
 
@@ -161,8 +159,6 @@ proc_exit(int code)
 void
 shmem_exit(int code)
 {
-       elog(DEBUG3, "shmem_exit(%d)", code);
-
        /*
         * call all the registered callbacks.
         *
index 00bbbc7b43150e2b149af2361a42a0f093981da4..2c992c443afc738019d19f2ad5058f5c9261f462 100644 (file)
@@ -71,6 +71,23 @@ SendPostmasterSignal(PMSignalReason reason)
        kill(PostmasterPid, SIGUSR1);
 }
 
+/*
+ * SetPostmasterSignal - like SendPostmasterSignal, but don't wake up
+ *                                              postmaster
+ *
+ * This is for signals that the postmaster polls with CheckPostmasterSignal()
+ * but isn't interested in processing immediately.
+ */
+void
+SetPostmasterSignal(PMSignalReason reason)
+{
+       /* If called in a standalone backend, do nothing */
+       if (!IsUnderPostmaster)
+               return;
+       /* Atomically set the proper flag */
+       PMSignalFlags[reason] = true;
+}
+
 /*
  * CheckPostmasterSignal - check to see if a particular reason has been
  * signaled, and clear the signal flag.  Should be called by postmaster
index 21b1e90f5952a7fc359ce1dc3aa570bcc7c94430..490dd921134dfed9a528646d80a197645dd64a70 100644 (file)
@@ -39,6 +39,7 @@ typedef enum
  */
 extern void PMSignalInit(void);
 extern void SendPostmasterSignal(PMSignalReason reason);
+extern void SetPostmasterSignal(PMSignalReason reason);
 extern bool CheckPostmasterSignal(PMSignalReason reason);
 extern bool PostmasterIsAlive(bool amDirectChild);