diff options
| author | Pavan Deolasee | 2014-12-08 06:45:52 +0000 |
|---|---|---|
| committer | Pavan Deolasee | 2015-04-15 05:46:41 +0000 |
| commit | a83d995dd9513893d779fcafcec2d5a0f6c63b4b (patch) | |
| tree | 1e4a23eea4840c7502e76b16d0850d5e4a07dcbf /src | |
| parent | f3396d6eaae5b30281790f5075d4ce0a63f32495 (diff) | |
Improve GTM connection establishment handling
We currently wait without a timeout for the GTM connection. This creates issues,
especially when GTM proxies are involved, since a GTM proxy keeps retrying to
establish its connection with the GTM proper. That causes the
coordinator/backend to wait indefinitely. This also prevents shutdown of those
components when GTM is down.
We now retry with a timed wait and also check for interrupts more often to
ensure that shutdown requests are honored.
Diffstat (limited to 'src')
| -rw-r--r-- | src/backend/access/transam/gtm.c | 75 | ||||
| -rw-r--r-- | src/backend/postmaster/autovacuum.c | 7 |
2 files changed, 59 insertions, 23 deletions
diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 3415228fa5..beddab81c2 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -68,6 +68,9 @@ InitGTM(void) { /* 256 bytes should be enough */ char conn_str[256]; +#ifdef XCP + int retry; +#endif /* If this thread is postmaster itself, it contacts gtm identifying itself */ if (!IsUnderPostmaster) @@ -79,7 +82,7 @@ InitGTM(void) else if (IS_PGXC_DATANODE) remote_type = GTM_NODE_DATANODE; - sprintf(conn_str, "host=%s port=%d node_name=%s remote_type=%d postmaster=1", + sprintf(conn_str, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=5", GtmHost, GtmPort, PGXCNodeName, remote_type); /* Log activity of GTM connections */ @@ -87,8 +90,57 @@ InitGTM(void) } else { - sprintf(conn_str, "host=%s port=%d node_name=%s", GtmHost, GtmPort, PGXCNodeName); + sprintf(conn_str, "host=%s port=%d node_name=%s connect_timeout=5", GtmHost, GtmPort, PGXCNodeName); + } + +#ifdef XCP +#define MAX_GTM_CONNECT_RETRIES 5 +#define GTM_CONNECTION_RETRY_TIMEOUT 5 + for (retry = 1; retry <= MAX_GTM_CONNECT_RETRIES; retry++) + { + CHECK_FOR_INTERRUPTS(); +#endif + conn = PQconnectGTM(conn_str); + if (GTMPQstatus(conn) != CONNECTION_OK) + { + int save_errno = errno; + + /* Write a WARNING first time */ + if (retry == 1 ) + { + ereport(WARNING, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Failed to connect to GTM: %m (Retries %d times" + " at %d seconds )", + MAX_GTM_CONNECT_RETRIES, GTM_CONNECTION_RETRY_TIMEOUT), + errhint("Check if GTM/GTM-proxy is running @ %s:%d", + GtmHost, GtmPort))); + } + else + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Failed to connect to GTM : %m (Giving up " + "after %d tries)", MAX_GTM_CONNECT_RETRIES), + errhint("Check if GTM/GTM-proxy is running @ %s:%d", + GtmHost, GtmPort))); + + errno = save_errno; + CloseGTM(); + + /* Sleep for 5 seconds and then retry */ + pg_usleep(5*1000000L); + } 
+#ifdef XCP + else if (IS_PGXC_COORDINATOR) + { + register_session(conn, PGXCNodeName, MyProcPid, MyBackendId); + break; + } + } +#endif + if (GTMPQstatus(conn) == CONNECTION_OK) + { /* Log activity of GTM connections */ if (IsAutoVacuumWorkerProcess()) elog(DEBUG1, "Autovacuum worker: connection established to GTM with string %s", conn_str); @@ -97,25 +149,6 @@ InitGTM(void) else elog(DEBUG1, "Postmaster child: connection established to GTM with string %s", conn_str); } - - conn = PQconnectGTM(conn_str); - if (GTMPQstatus(conn) != CONNECTION_OK) - { - int save_errno = errno; - - ereport(WARNING, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("can not connect to GTM: %m"))); - - errno = save_errno; - - CloseGTM(); - } - -#ifdef XCP - else if (IS_PGXC_COORDINATOR) - register_session(conn, PGXCNodeName, MyProcPid, MyBackendId); -#endif } void diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 6086692f81..8b5ff28e2a 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -575,7 +575,10 @@ AutoVacLauncherMain(int argc, char *argv[]) * correct because the only operation is to add autovacuum_naptime to the * entry, and time always increases). */ - rebuild_database_list(InvalidOid); +#ifdef XCP + if (!got_SIGTERM) +#endif + rebuild_database_list(InvalidOid); for (;;) { @@ -802,7 +805,7 @@ launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval * nap) nap->tv_sec = autovacuum_naptime; nap->tv_usec = 0; } - else if ((elem = DLGetTail(DatabaseList)) != NULL) + else if ((elem = DatabaseList ? DLGetTail(DatabaseList) : NULL) != NULL) { avl_dbase *avdb = DLE_VAL(elem); TimestampTz current_time = GetCurrentTimestamp(); |
