Notify bgworker registrant after freeing worker slot.
authorRobert Haas <rhaas@postgresql.org>
Fri, 3 Mar 2017 03:44:49 +0000 (09:14 +0530)
committerRobert Haas <rhaas@postgresql.org>
Fri, 3 Mar 2017 03:55:30 +0000 (09:25 +0530)
Tom Lane observed buildfarm failures caused by the select_parallel
regression test trying to launch new parallel queries before the
worker slots used by the previous ones were freed.  Try to fix this by
having the postmaster free the worker slots before it sends the
SIGUSR1 notifications to the registering process.  This doesn't
completely eliminate the possibility that the user backend might
(correctly) observe the worker as dead before the slot is free, but I
believe it should make the window significantly narrower.

Patch by me, per complaint from Tom Lane.  Reviewed by Amit Kapila.

Discussion: http://postgr.es/m/30673.1487310734@sss.pgh.pa.us

src/backend/postmaster/bgworker.c
src/backend/postmaster/postmaster.c
src/include/postmaster/bgworker_internals.h

index cd99b0b3927f028b6f48b60b93e96b1e09c5b66c..42760b92bb1a286a87cb8b2929ca8a8bdf9c89a8 100644 (file)
@@ -429,6 +429,39 @@ ReportBackgroundWorkerPID(RegisteredBgWorker *rw)
        kill(rw->rw_worker.bgw_notify_pid, SIGUSR1);
 }
 
+/*
+ * Report that the PID of a background worker is now zero because a
+ * previously-running background worker has exited.
+ *
+ * This function should only be called from the postmaster.
+ */
+void
+ReportBackgroundWorkerExit(slist_mutable_iter *cur)
+{
+   RegisteredBgWorker *rw;
+   BackgroundWorkerSlot *slot;
+
+   rw = slist_container(RegisteredBgWorker, rw_lnode, cur->cur);
+
+   Assert(rw->rw_shmem_slot < max_worker_processes);
+   slot = &BackgroundWorkerData->slot[rw->rw_shmem_slot];
+   slot->pid = rw->rw_pid;
+
+   /*
+    * If this worker is slated for deregistration, do that before notifying
+    * the process which started it.  Otherwise, if that process tries to
+    * reuse the slot immediately, it might not be available yet.  In theory
+    * that could happen anyway if the process checks slot->pid at just the
+    * wrong moment, but this makes the window narrower.
+    */
+   if (rw->rw_terminate ||
+       rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART)
+       ForgetBackgroundWorker(cur);
+
+   if (rw->rw_worker.bgw_notify_pid != 0)
+       kill(rw->rw_worker.bgw_notify_pid, SIGUSR1);
+}
+
 /*
  * Cancel SIGUSR1 notifications for a PID belonging to an exiting backend.
  *
index 271c492000265447eb285dedd4b363ac5b0c1c0c..2cf17ac42e00a8d95a7dd278a772c3e0e8d2fb80 100644 (file)
@@ -3050,9 +3050,9 @@ CleanupBackgroundWorker(int pid,
                        int exitstatus) /* child's exit status */
 {
    char        namebuf[MAXPGPATH];
-   slist_iter  iter;
+   slist_mutable_iter  iter;
 
-   slist_foreach(iter, &BackgroundWorkerList)
+   slist_foreach_modify(iter, &BackgroundWorkerList)
    {
        RegisteredBgWorker *rw;
 
@@ -3126,7 +3126,7 @@ CleanupBackgroundWorker(int pid,
        rw->rw_backend = NULL;
        rw->rw_pid = 0;
        rw->rw_child_slot = 0;
-       ReportBackgroundWorkerPID(rw);  /* report child death */
+       ReportBackgroundWorkerExit(&iter);  /* report child death */
 
        LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG,
                     namebuf, pid, exitstatus);
index 4c94b815b12c860d3f75524e1539d86a4d9a739f..9a2de4f4d04a991d73cce38c247501add2e5e800 100644 (file)
@@ -42,6 +42,7 @@ extern void BackgroundWorkerShmemInit(void);
 extern void BackgroundWorkerStateChange(void);
 extern void ForgetBackgroundWorker(slist_mutable_iter *cur);
 extern void ReportBackgroundWorkerPID(RegisteredBgWorker *);
+extern void ReportBackgroundWorkerExit(slist_mutable_iter *cur);
 extern void BackgroundWorkerStopNotifications(pid_t pid);
 extern void ResetBackgroundWorkerCrashTimes(void);