Add GUC to control the time to wait before retrieving WAL after failed attempt.
author: Fujii Masao <fujii@postgresql.org>
Mon, 23 Feb 2015 11:55:17 +0000 (20:55 +0900)
committer: Fujii Masao <fujii@postgresql.org>
Mon, 23 Feb 2015 11:55:17 +0000 (20:55 +0900)
Previously, when the standby server failed to retrieve WAL files from any source
(i.e., streaming replication, the local pg_xlog directory, or the WAL archive),
it always waited for a hard-coded five seconds before the next attempt. This is
problematic, for example, in a warm-standby setup: restore_command can fail
every five seconds even while a new WAL file is expected to be unavailable for
a long time, flooding the log files with its error messages.

This commit adds a new parameter, wal_retrieve_retry_interval, to control that
wait time.

Alexey Vasiliev and Michael Paquier, reviewed by Andres Freund and me.

doc/src/sgml/config.sgml
src/backend/access/transam/xlog.c
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/include/access/xlog.h

index 6bcb106518e4eefe33c9b69129d737f93a1646be..a3917aac7855bc3c71f7e6bb6b3c78078d981314 100644 (file)
@@ -2985,6 +2985,24 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-wal-retrieve-retry-interval" xreflabel="wal_retrieve_retry_interval">
+      <term><varname>wal_retrieve_retry_interval</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>wal_retrieve_retry_interval</> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies how long the standby server should wait when WAL data is
+        not available from any source (streaming replication,
+        local <filename>pg_xlog</> or WAL archive) before retrying to
+        retrieve WAL data.  This parameter can only be set in the
+        <filename>postgresql.conf</> file or on the server command line.
+        The default value is 5 seconds. Units are milliseconds if not specified.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
     </sect2>
    </sect1>
index 629a457965ff2e6987bd6ccd5ee0cabb26fe212c..f68f82b255c0608d7a7f958865a2abe3d179911c 100644 (file)
@@ -93,6 +93,7 @@ int           sync_method = DEFAULT_SYNC_METHOD;
 int            wal_level = WAL_LEVEL_MINIMAL;
 int            CommitDelay = 0;    /* precommit delay in microseconds */
 int            CommitSiblings = 5; /* # concurrent xacts needed to sleep */
+int            wal_retrieve_retry_interval = 5000; /* retry wait in milliseconds */
 
 #ifdef WAL_DEBUG
 bool       XLOG_DEBUG = false;
@@ -10340,8 +10341,8 @@ static bool
 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                            bool fetching_ckpt, XLogRecPtr tliRecPtr)
 {
-   static pg_time_t last_fail_time = 0;
-   pg_time_t   now;
+   static TimestampTz  last_fail_time = 0;
+   TimestampTz now;
 
    /*-------
     * Standby mode is implemented by a state machine:
@@ -10351,7 +10352,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
     * 2. Check trigger file
     * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
     * 4. Rescan timelines
-    * 5. Sleep seconds, and loop back to 1.
+    * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
     *
     * Failure to read from the current source advances the state machine to
     * the next state.
@@ -10490,14 +10491,25 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                     * machine, so we've exhausted all the options for
                     * obtaining the requested WAL. We're going to loop back
                     * and retry from the archive, but if it hasn't been long
-                    * since last attempt, sleep 5 seconds to avoid
-                    * busy-waiting.
+                    * since last attempt, sleep wal_retrieve_retry_interval
+                    * milliseconds to avoid busy-waiting.
                     */
-                   now = (pg_time_t) time(NULL);
-                   if ((now - last_fail_time) < 5)
+                   now = GetCurrentTimestamp();
+                   if (!TimestampDifferenceExceeds(last_fail_time, now,
+                                                   wal_retrieve_retry_interval))
                    {
-                       pg_usleep(1000000L * (5 - (now - last_fail_time)));
-                       now = (pg_time_t) time(NULL);
+                       long        secs, wait_time;
+                       int         usecs;
+
+                       TimestampDifference(last_fail_time, now, &secs, &usecs);
+                       wait_time = wal_retrieve_retry_interval -
+                           (secs * 1000 + usecs / 1000);
+
+                       WaitLatch(&XLogCtl->recoveryWakeupLatch,
+                                 WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                                 wait_time);
+                       ResetLatch(&XLogCtl->recoveryWakeupLatch);
+                       now = GetCurrentTimestamp();
                    }
                    last_fail_time = now;
                    currentSource = XLOG_FROM_ARCHIVE;
@@ -10653,12 +10665,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                    }
 
                    /*
-                    * Wait for more WAL to arrive. Time out after 5 seconds,
-                    * like when polling the archive, to react to a trigger
-                    * file promptly.
+                    * Wait for more WAL to arrive. Time out after 5 seconds
+                    * to react to a trigger file promptly.
                     */
                    WaitLatch(&XLogCtl->recoveryWakeupLatch,
-                             WL_LATCH_SET | WL_TIMEOUT,
+                             WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                              5000L);
                    ResetLatch(&XLogCtl->recoveryWakeupLatch);
                    break;
index 95727776d3851a2d1a55ca3d3fa824a7cf492bbc..cf401d3cf03ecbb30987151ae0e8784e14033a12 100644 (file)
@@ -2363,6 +2363,18 @@ static struct config_int ConfigureNamesInt[] =
        NULL, NULL, NULL
    },
 
+   {
+       {"wal_retrieve_retry_interval", PGC_SIGHUP, REPLICATION_STANDBY,
+           gettext_noop("Sets the time to wait before retrying to retrieve WAL"
+                        "after a failed attempt."),
+           NULL,
+           GUC_UNIT_MS
+       },
+       &wal_retrieve_retry_interval,
+       5000, 1, INT_MAX,
+       NULL, NULL, NULL
+   },
+
    {
        {"wal_segment_size", PGC_INTERNAL, PRESET_OPTIONS,
            gettext_noop("Shows the number of pages per write ahead log segment."),
index b053659f88e85a2aff7b2ddcbb12b0ca4b2897d0..29d8485964d696cccc0d45e63c1644485973b1f1 100644 (file)
 #wal_receiver_timeout = 60s        # time that receiver waits for
                    # communication from master
                    # in milliseconds; 0 disables
+#wal_retrieve_retry_interval = 5s  # time to wait before retrying to
+                   # retrieve WAL after a failed attempt
 
 
 #------------------------------------------------------------------------------
index 138deaf7c8fa393507c59eec49e3b2cbee67d5a3..be27a85648665ba7aaa9f4d61c516feaf7dcb675 100644 (file)
@@ -93,6 +93,7 @@ extern int    CheckPointSegments;
 extern int wal_keep_segments;
 extern int XLOGbuffers;
 extern int XLogArchiveTimeout;
+extern int wal_retrieve_retry_interval;
 extern bool XLogArchiveMode;
 extern char *XLogArchiveCommand;
 extern bool EnableHotStandby;