Adjust elog.c so that elog(FATAL) exits (including cases where ERROR is

promoted to FATAL) end in exit(1) not exit(0). Then change the postmaster to allow exit(1) without a system-wide panic, but not for the startup subprocess or the bgwriter. There were a couple of places that were using exit(1) to deliberately force a system-wide panic; adjust these to be exit(2) instead. This fixes the problem noted back in July that if the startup process exits with elog(ERROR), the postmaster would think everything is hunky-dory and proceed to start up. Alternative solutions such as trying to run the entire startup process as a critical section seem less clean, primarily because of the fact that a fair amount of startup code is shared by all postmaster children in the EXEC_BACKEND case. We'd need an ugly special case somewhere near the head of main.c to make it work if it's the child process's responsibility to determine what happens; and what's the point when the postmaster already treats different children differently?
author: Tom Lane 2006-11-21 00:49:55 +0000
committer: Tom Lane 2006-11-21 00:49:55 +0000
commit: e82d9e6283d6ca19d1ea7547e7e9ae8399471e1a (patch)
tree: b6f413821ac86b98dad1dacc461dd6a70a570696
parent: 778bb7b60d8b36029f9e874a1b38428115bef0d1 (diff)
5 files changed, 39 insertions, 40 deletions
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 70b9172cd7f..b44c501cc69 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.225 2006/10/04 00:29:49 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.226 2006/11/21 00:49:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -421,15 +421,8 @@ BootstrapMain(int argc, char *argv[])
 		case BS_XLOG_STARTUP:
 			bootstrap_signals();
 			StartupXLOG();
-
-			/*
-			 * These next two functions don't consider themselves critical,
-			 * but we'd best PANIC anyway if they fail.
-			 */
-			START_CRIT_SECTION();
 			LoadFreeSpaceMap();
 			BuildFlatFiles(false);
-			END_CRIT_SECTION();
 			proc_exit(0);		/* startup done */
 
 		case BS_XLOG_BGWRITER:
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index e3a4925b911..f2cb3ff68de 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.29 2006/10/06 17:13:59 petere Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.30 2006/11/21 00:49:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -503,12 +503,12 @@ bg_quickdie(SIGNAL_ARGS)
 	 * corrupted, so we don't want to try to clean up our transaction. Just
 	 * nail the windows shut and get out of town.
 	 *
-	 * Note we do exit(1) not exit(0).	This is to force the postmaster into a
+	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
 	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
 	 * backend.  This is necessary precisely because we don't clean up our
 	 * shared memory state.
 	 */
-	exit(1);
+	exit(2);
 }
 
 /* SIGHUP: set flag to re-read config file at next convenient time */
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index caf7d9a82d4..9d7b6065f46 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.501 2006/11/05 22:42:09 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.502 2006/11/21 00:49:55 tgl Exp $
  *
  * NOTES
  *
@@ -358,6 +358,10 @@ static void ShmemBackendArrayRemove(pid_t pid);
 #define StartupDataBase()		StartChildProcess(BS_XLOG_STARTUP)
 #define StartBackgroundWriter() StartChildProcess(BS_XLOG_BGWRITER)
 
+/* Macros to check exit status of a child process */
+#define EXIT_STATUS_0(st)  ((st) == 0)
+#define EXIT_STATUS_1(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 1)
+
 
 /*
  * Postmaster main entry point
@@ -2025,7 +2029,8 @@ reaper(SIGNAL_ARGS)
 		if (StartupPID != 0 && pid == StartupPID)
 		{
 			StartupPID = 0;
-			if (exitstatus != 0)
+			/* Note: FATAL exit of startup is treated as catastrophic */
+			if (!EXIT_STATUS_0(exitstatus))
 			{
 				LogChildExit(LOG, _("startup process"),
 							 pid, exitstatus);
@@ -2078,7 +2083,8 @@ reaper(SIGNAL_ARGS)
 		if (BgWriterPID != 0 && pid == BgWriterPID)
 		{
 			BgWriterPID = 0;
-			if (exitstatus == 0 && Shutdown > NoShutdown && !FatalError &&
+			if (EXIT_STATUS_0(exitstatus) &&
+				Shutdown > NoShutdown && !FatalError &&
 				!DLGetHead(BackendList) && AutoVacPID == 0)
 			{
 				/*
@@ -2096,7 +2102,8 @@ reaper(SIGNAL_ARGS)
 			}
 
 			/*
-			 * Any unexpected exit of the bgwriter is treated as a crash.
+			 * Any unexpected exit of the bgwriter (including FATAL exit)
+			 * is treated as a crash.
 			 */
 			HandleChildCrash(pid, exitstatus,
 							 _("background writer process"));
@@ -2104,15 +2111,16 @@ reaper(SIGNAL_ARGS)
 		}
 
 		/*
-		 * Was it the autovacuum process?  Normal exit can be ignored; we'll
-		 * start a new one at the next iteration of the postmaster's main
-		 * loop, if necessary.  An unexpected exit is treated as a crash.
+		 * Was it the autovacuum process?  Normal or FATAL exit can be
+		 * ignored; we'll start a new one at the next iteration of the
+		 * postmaster's main loop, if necessary.  Any other exit condition
+		 * is treated as a crash.
 		 */
 		if (AutoVacPID != 0 && pid == AutoVacPID)
 		{
 			AutoVacPID = 0;
 			autovac_stopped();
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
 				HandleChildCrash(pid, exitstatus,
 								 _("autovacuum process"));
 			continue;
@@ -2126,7 +2134,7 @@ reaper(SIGNAL_ARGS)
 		if (PgArchPID != 0 && pid == PgArchPID)
 		{
 			PgArchPID = 0;
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus))
 				LogChildExit(LOG, _("archiver process"),
 							 pid, exitstatus);
 			if (XLogArchivingActive() &&
@@ -2143,7 +2151,7 @@ reaper(SIGNAL_ARGS)
 		if (PgStatPID != 0 && pid == PgStatPID)
 		{
 			PgStatPID = 0;
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus))
 				LogChildExit(LOG, _("statistics collector process"),
 							 pid, exitstatus);
 			if (StartupPID == 0 && !FatalError && Shutdown == NoShutdown)
@@ -2157,7 +2165,7 @@ reaper(SIGNAL_ARGS)
 			SysLoggerPID = 0;
 			/* for safety's sake, launch new logger *first* */
 			SysLoggerPID = SysLogger_Start();
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus))
 				LogChildExit(LOG, _("system logger process"),
 							 pid, exitstatus);
 			continue;
@@ -2229,12 +2237,12 @@ CleanupBackend(int pid,
 	LogChildExit(DEBUG2, _("server process"), pid, exitstatus);
 
 	/*
-	 * If a backend dies in an ugly way (i.e. exit status not 0) then we must
-	 * signal all other backends to quickdie.  If exit status is zero we
-	 * assume everything is hunky dory and simply remove the backend from the
+	 * If a backend dies in an ugly way then we must signal all other backends
+	 * to quickdie.  If exit status is zero (normal) or one (FATAL exit), we
+	 * assume everything is all right and simply remove the backend from the
 	 * active backend list.
 	 */
-	if (exitstatus != 0)
+	if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
 	{
 		HandleChildCrash(pid, exitstatus, _("server process"));
 		return;
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index d019864fea4..8b014c887f5 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.516 2006/10/19 19:52:22 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.517 2006/11/21 00:49:55 tgl Exp $
  *
  * NOTES
  *	  this is the "main" module of the postgres backend and
@@ -2327,12 +2327,12 @@ quickdie(SIGNAL_ARGS)
 	 * corrupted, so we don't want to try to clean up our transaction. Just
 	 * nail the windows shut and get out of town.
 	 *
-	 * Note we do exit(1) not exit(0).	This is to force the postmaster into a
+	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
 	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
 	 * backend.  This is necessary precisely because we don't clean up our
 	 * shared memory state.
 	 */
-	exit(1);
+	exit(2);
 }
 
 /*
@@ -2374,7 +2374,7 @@ die(SIGNAL_ARGS)
 
 /*
  * Timeout or shutdown signal from postmaster during client authentication.
- * Simply exit(0).
+ * Simply exit(1).
  *
  * XXX: possible future improvement: try to send a message indicating
  * why we are disconnecting.  Problem is to be sure we don't block while
@@ -2383,7 +2383,7 @@ die(SIGNAL_ARGS)
 void
 authdie(SIGNAL_ARGS)
 {
-	exit(0);
+	exit(1);
 }
 
 /*
diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c
index 61c413b4be3..6b6a9c51cf7 100644
--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@@ -42,7 +42,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.175 2006/10/01 22:08:18 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.176 2006/11/21 00:49:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -421,25 +421,23 @@ errfinish(int dummy,...)
 		 * fflush here is just to improve the odds that we get to see the
 		 * error message, in case things are so hosed that proc_exit crashes.
 		 * Any other code you might be tempted to add here should probably be
-		 * in an on_proc_exit callback instead.
+		 * in an on_proc_exit or on_shmem_exit callback instead.
 		 */
 		fflush(stdout);
 		fflush(stderr);
 
 		/*
-		 * If proc_exit is already running, we exit with nonzero exit code to
-		 * indicate that something's pretty wrong.  We also want to exit with
-		 * nonzero exit code if not running under the postmaster (for example,
-		 * if we are being run from the initdb script, we'd better return an
-		 * error status).
+		 * Do normal process-exit cleanup, then return exit code 1 to indicate
+		 * FATAL termination.  The postmaster may or may not consider this
+		 * worthy of panic, depending on which subprocess returns it.
 		 */
-		proc_exit(proc_exit_inprogress || !IsUnderPostmaster);
+		proc_exit(1);
 	}
 
 	if (elevel >= PANIC)
 	{
 		/*
-		 * Serious crash time. Postmaster will observe nonzero process exit
+		 * Serious crash time. Postmaster will observe SIGABRT process exit
 		 * status and kill the other backends too.
 		 *
 		 * XXX: what if we are *in* the postmaster?  abort() won't kill our
author	Tom Lane	2006-11-21 00:49:55 +0000
committer	Tom Lane	2006-11-21 00:49:55 +0000
commit	e82d9e6283d6ca19d1ea7547e7e9ae8399471e1a (patch)
tree	b6f413821ac86b98dad1dacc461dd6a70a570696
parent	778bb7b60d8b36029f9e874a1b38428115bef0d1 (diff)