Fix segfault in a child process.
authorTatsuo Ishii <ishii@sraoss.co.jp>
Tue, 11 Jun 2024 11:15:08 +0000 (20:15 +0900)
committerTatsuo Ishii <ishii@sraoss.co.jp>
Tue, 11 Jun 2024 11:15:08 +0000 (20:15 +0900)
It is reported that pgpool child segfaulted [1].

[snip]

In the down thread it is reported that despite VALID_BACKEND(i)
returns true, backend->slots[i] is NULL, which should have been filled
by new_connection().

It seems there's a race condition. In new_connection(), there's a code
fragment:
/*
 * Make sure that the global backend status in the shared memory
 * agrees the local status checked by VALID_BACKEND. It is possible
 * that the local status is up, while the global status has been
 * changed to down by failover.
 */
A--> if (BACKEND_INFO(i).backend_status != CON_UP &&
BACKEND_INFO(i).backend_status != CON_CONNECT_WAIT)
{
ereport(DEBUG1,
(errmsg("creating new connection to backend"),
 errdetail("skipping backend slot %d because global backend_status = %d",
   i, BACKEND_INFO(i).backend_status)));

/* sync local status with global status */
B--> *(my_backend_status[i]) = BACKEND_INFO(i).backend_status;
continue;
}

It is possible that at A backend_status in the shared memory is down
but by the time it reaches B the status has been changed to up. And
new_connection() skipped to create a backend connection. This seems to
explain why the connection slot is NULL while VALID_BACKEND returns
true. To prevent the race condtion, backend_status in shared memory is
copied to a local variable and evaluate it.  Also the VALID_BACKEND
just before:

pool_set_db_node_id(CONNECTION(backend, i), i);

is changed to:
if (VALID_BACKEND(i) && CONNECTION_SLOT(backend, i))

so that it prevents crash just in case.

[1] [pgpool-general: 9104] Another segmentation fault

src/protocol/child.c
src/protocol/pool_connection_pool.c

index c3882b13dd43ee0e98f51f7f9291113564764e75..22bfbcf20ea3db6c45f32d9729d951f043b71f44 100644 (file)
@@ -1069,9 +1069,8 @@ static POOL_CONNECTION_POOL * connect_backend(StartupPacket *sp, POOL_CONNECTION
 
                for (i = 0; i < NUM_BACKENDS; i++)
                {
-                       if (VALID_BACKEND(i))
+                       if (VALID_BACKEND(i) && CONNECTION_SLOT(backend, i))
                        {
-
                                /* set DB node id */
                                pool_set_db_node_id(CONNECTION(backend, i), i);
 
index 1fd683721cc929c4d9e484d4c1f9554aaa40ddd4..1506f413a7b8de642a3ebc3b363ed6669bb9ff5a 100644 (file)
@@ -876,6 +876,7 @@ static POOL_CONNECTION_POOL * new_connection(POOL_CONNECTION_POOL * p)
        int                     active_backend_count = 0;
        int                     i;
        bool            status_changed = false;
+       volatile BACKEND_STATUS status;
 
        MemoryContext oldContext = MemoryContextSwitchTo(TopMemoryContext);
 
@@ -900,8 +901,8 @@ static POOL_CONNECTION_POOL * new_connection(POOL_CONNECTION_POOL * p)
                 * that the local status is up, while the global status has been
                 * changed to down by failover.
                 */
-               if (BACKEND_INFO(i).backend_status != CON_UP &&
-                       BACKEND_INFO(i).backend_status != CON_CONNECT_WAIT)
+               status = BACKEND_INFO(i).backend_status;
+               if (status != CON_UP && status != CON_CONNECT_WAIT)
                {
                        ereport(DEBUG1,
                                        (errmsg("creating new connection to backend"),
@@ -909,7 +910,7 @@ static POOL_CONNECTION_POOL * new_connection(POOL_CONNECTION_POOL * p)
                                                           i, BACKEND_INFO(i).backend_status)));
 
                        /* sync local status with global status */
-                       *(my_backend_status[i]) = BACKEND_INFO(i).backend_status;
+                       *(my_backend_status[i]) = status;
                        continue;
                }