</para>
</note>
+ <note>
+ <para>
+ If watchdog is enabled, detaching a false primary is performed only by the leader watchdog node.
+ Even if <xref linkend="guc-failover-require-consensus"> is on, this feature detaches the primary solely at the judgment of the leader watchdog; no majority consensus is required.
+ </para>
+ </note>
<para>
<!--
This parameter is only applicable in the streaming replication mode.
</para>
</note>
+ <note>
+ <para>
+ If watchdog is enabled, detaching a false primary is performed only by
+ the leader watchdog node. Even
+ if <xref linkend="guc-failover-require-consensus"> is on,
+ detaching the false primary is carried out solely at the
+ judgment of the leader watchdog; no majority consensus is
+ required.
+ </para>
+ </note>
+
<para>
This parameter is only applicable in the streaming replication mode.
</para>
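
For reference, the behavior described in the note above corresponds to the following pgpool.conf fragment. This is only an illustrative sketch using the standard parameter names; the values shown are assumptions for the example, not part of this change.

    # Sketch: sr_check looks for a false primary and degenerates it.
    detach_false_primary = on
    # Even with consensus required for ordinary failover requests,
    # a detach performed by the leader watchdog bypasses it.
    failover_require_consensus = on
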
* pgpool: a language independent connection pool server for PostgreSQL
* written by Tatsuo Ishii
*
- * Copyright (c) 2003-2023 PgPool Global Development Group
+ * Copyright (c) 2003-2024 PgPool Global Development Group
*
* Permission to use, copy, modify, and distribute this software and
* its documentation for any purpose and without fee is hereby
{
MemoryContextSwitchTo(WorkerMemoryContext);
MemoryContextResetAndDeleteChildren(WorkerMemoryContext);
- WD_STATES wd_status;
+ bool watchdog_leader; /* true if I am the watchdog leader */
+
CHECK_REQUEST;
*/
if (pool_config->use_watchdog)
{
+ WD_STATES wd_status;
+ WDPGBackendStatus *backendStatus;
+
wd_status = wd_internal_get_watchdog_local_node_state();
ereport(DEBUG1,
(errmsg("watchdog status: %d", wd_status)));
+ /*
+ * Ask the watchdog to get all the backend states from the
+ * Leader/Coordinator Pgpool-II node.
+ */
+ watchdog_leader = false;
+ backendStatus = get_pg_backend_status_from_leader_wd_node();
+
+ if (!backendStatus)
+ /*
+ * Couldn't get leader status.
+ */
+ watchdog_leader = false;
+ else
+ {
+ int quorum = wd_internal_get_watchdog_quorum_state();
+ int node_count = backendStatus->node_count;
+
+ ereport(DEBUG1,
+ (errmsg("quorum: %d node_count: %d",
+ quorum, node_count)));
+ if (quorum >= 0 && backendStatus->node_count <= 0)
+ {
+ /*
+ * Quorum exists and node_count <= 0.
+ * Definitely I am the leader.
+ */
+ watchdog_leader = true;
+ }
+ else
+ watchdog_leader = false;
+
+ pfree(backendStatus);
+ }
}
/*
*/
if (pool_config->detach_false_primary)
{
- n = i;
- degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER);
+ /*
+ * However if watchdog is enabled and I am not
+ * the leader, do not detach the invalid node
+ * because the information to determine the
+ * false primary might be outdated or
+ * temporarily inconsistent. See
+ * [pgpool-hackers: 4431] for more details.
+ */
+ if (!pool_config->use_watchdog ||
+ (pool_config->use_watchdog && watchdog_leader))
+ {
+ n = i;
+ /*
+ * In the case watchdog is enabled, we need
+ * to add REQ_DETAIL_CONFIRMED, which
+ * means no quorum consensus is
+ * required. If we do not add this, the
+ * target node will remain in quarantine
+ * state since other nodes do not request
+ * failover.
+ */
+ degenerate_backend_set(&n, 1,
+ REQ_DETAIL_SWITCHOVER|REQ_DETAIL_CONFIRMED);
+ }
+ else if (pool_config->use_watchdog)
+ ereport(LOG,
+ (errmsg("do not detach invalid node %d because I am not the leader or quorum does not exist", i)));
}
}
}
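
To observe the effect of this change from the outside, the watchdog role and the backend status can be compared across the Pgpool-II nodes. The sketch below only reuses the pcp commands and the directory layout of the regression test that follows; host, port, and paths are assumptions taken from that test environment.

    # Sketch: compare watchdog role and backend status on each node.
    for i in 0 1 2
    do
        cd pgpool$i
        source ./bashrc.ports    # sets PCP_PORT for this node (test layout)
        cd ..
        # shows which Pgpool-II node is the watchdog leader
        $PGPOOL_INSTALL_DIR/bin/pcp_watchdog_info -v -w -h localhost -p $PCP_PORT
        # on a non-leader node the false primary is reported but not detached
        $PGPOOL_INSTALL_DIR/bin/pcp_node_info -h localhost -p $PCP_PORT
    done
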
--- /dev/null
+#!/usr/bin/env bash
+#-------------------------------------------------------------------
+# test script for the problem that detach_false_primary could bring down all backends.
+# See [pgpool-hackers: 4431] for more details.
+#
+# It is possible that all DB nodes go down if detach_false_primary is enabled.
+# Story:
+# There are 3 watchdog nodes pgpool0, pgpool1 and pgpool2.
+# There are 2 DB nodes node0 and node1 (initially node 0 is primary).
+# follow_primary_command is disabled.
+# 1) Node 0 goes down at pgpool0 due to network trouble. BUT actually
+#    node 0 is alive.
+# 2) Node 0 goes down at pgpool1 due to network trouble. BUT actually
+# node 0 is alive.
+# 3) Failover is triggered. Since pgpool0 and pgpool1 agree, node 0 is set to down.
+# node 1 is promoted.
+# 4) Before new status is synched with pgpool2, pgpool2's sr_check
+# finds that there are two primary nodes due to
+# #3. detach_false_primary is triggered and node 1 goes down.
+# 5) Now all backends are in down status.
+
+# wait for the watchdog to start up by looking for "lifecheck started" in
+# the pgpool.log. argument: $log: absolute path to the pgpool.log.
+function wait_for_watchdog_startup
+{
+ while :
+ do
+ grep "lifecheck started" $log >/dev/null
+ if [ $? = 0 ];then
+ break;
+ fi
+ sleep 1
+ done
+}
+
+source $TESTLIBS
+TESTDIR=testdir
+PSQL=$PGBIN/psql
+PG_CTL=$PGBIN/pg_ctl
+export PGDATABASE=test
+
+rm -fr $TESTDIR
+mkdir $TESTDIR
+cd $TESTDIR
+
+version=`$PSQL --version|awk '{print $3}'`
+result=`echo "$version >= 9.6"|bc`
+if [ $result = 0 ];then
+ echo "PostgreSQL version $version is 9.5 or before. Skipping test."
+ exit 0
+fi
+
+# create 3 node pgpool with 2 backends.
+$WATCHDOG_SETUP -wn 3 -n 2
+
+# enable detach_false_primary and health_check_test. We need to
+# disable follow_primary_command, otherwise node 0 goes down by
+# follow_primary_command and the test may not reveal the problem.
+# Also we set sr_check_period to very short so that
+# detach_false_primary is triggered before the new status is synched
+# by watchdog leader.
+for i in 0 1 2
+do
+ echo "detach_false_primary = on" >> pgpool$i/etc/pgpool.conf
+ echo "health_check_test = on" >> pgpool$i/etc/pgpool.conf
+ echo "follow_primary_command = ''" >> pgpool$i/etc/pgpool.conf
+ echo "sr_check_period = 1" >> pgpool$i/etc/pgpool.conf
+done
+
+cd pgpool0
+source ./bashrc.ports
+cd ..
+
+./startall
+
+echo -n "waiting for watchdog node 0 starting up... "
+log=pgpool0/log/pgpool.log
+wait_for_watchdog_startup $log
+echo "done."
+
+$PGPOOL_INSTALL_DIR/bin/pcp_watchdog_info -v -w -h localhost -p $PCP_PORT
+$PGPOOL_INSTALL_DIR/bin/pcp_node_info -h localhost -p $PCP_PORT
+
+# Let node 0 down at pgpool0
+echo "0 down" > pgpool0/log/backend_down_request
+# Let node 0 down at pgpool1
+echo "0 down" > pgpool1/log/backend_down_request
+
+# Wait up to 30 seconds to see whether the problem (all nodes go down) occurs.
+# Observe that pgpool1 and pgpool2 print:
+# LOG: pgpool_worker_child: invalid node found 1
+# which means sr_check ran detach_false_primary but did not trigger failover:
+# LOG: do not detach invalid node 1 because I am not the leader or quorum does not exist
+for t in {1..30}
+do
+ for i in 0 1 2
+ do
+ date
+	echo "node info after failover at pgpool$i"
+ cd pgpool$i
+ source ./bashrc.ports
+ cd ..
+ $PGPOOL_INSTALL_DIR/bin/pcp_node_info -h localhost -p $PCP_PORT
+ done
+    # check whether all nodes are down.
+ n0=`$PGPOOL_INSTALL_DIR/bin/pcp_node_info -h localhost -p $PCP_PORT 0|awk '{print $5}'`
+ n1=`$PGPOOL_INSTALL_DIR/bin/pcp_node_info -h localhost -p $PCP_PORT 1|awk '{print $5}'`
+ if [ $n0 = "down" -a $n1 = "down" ];then
+        echo "all nodes went down."
+ ./shutdownall
+ exit 1
+ fi
+ sleep 1
+done
+echo "test succeeded."
+
+./shutdownall
+
+exit 0
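
The comment before the 30-second loop mentions the log lines that pgpool1 and pgpool2 are expected to print, but the script itself only checks pcp_node_info output. A possible follow-up check, not part of the committed test, could grep each pgpool.log for the message introduced in this change; the log paths follow the layout already used above.

    # Sketch: confirm that the non-leader nodes skipped the detach.
    for i in 1 2
    do
        if grep "do not detach invalid node" pgpool$i/log/pgpool.log >/dev/null; then
            echo "pgpool$i skipped detaching the false primary as expected."
        fi
    done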