Don't pay heed to wal_sender_timeout while creating a decoding slot.
author Andres Freund <andres@anarazel.de>
Wed, 28 May 2014 22:32:09 +0000 (00:32 +0200)
committer Andres Freund <andres@anarazel.de>
Wed, 28 May 2014 22:32:09 +0000 (00:32 +0200)
Sometimes CREATE_REPLICATION_SLOT ... LOGICAL ... needs to wait for
further WAL using WalSndWaitForWal(). That wait used to always respect
wal_sender_timeout and, if it lasted long enough, killed the session,
because no feedback/ping messages can be sent while the slot is still
being created.
To avoid that problem, introduce the notion that last_reply_timestamp = 0
means that the walsender currently doesn't need timeout processing. Use
that notion for CREATE_REPLICATION_SLOT ... LOGICAL.

Bug report and initial patch by Steve Singer, revised by me.

src/backend/replication/walsender.c

index eb405cb616a93cc5758381b2e2cdbfaeeb597b9a..088ee2c0976e2091b8b47aaf4279bded2a379a61 100644 (file)
@@ -148,9 +148,10 @@ static StringInfoData reply_message;
 static StringInfoData tmpbuf;
 
 /*
- * Timestamp of the last receipt of the reply from the standby.
+ * Timestamp of the last receipt of the reply from the standby. Set to 0 if
+ * wal_sender_timeout doesn't need to be active.
  */
-static TimestampTz last_reply_timestamp;
+static TimestampTz last_reply_timestamp = 0;
 
 /* Have we sent a heartbeat message asking for reply, since last reply? */
 static bool waiting_for_ping_response = false;
@@ -796,6 +797,15 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd)
                                        logical_read_xlog_page,
                                        WalSndPrepareWrite, WalSndWriteData);
 
+       /*
+        * Signal that we don't need the timeout mechanism. We're just
+        * creating the replication slot and don't yet accept feedback
+        * messages or send keepalives. As we possibly need to wait for
+        * further WAL the walsender would otherwise possibly be killed too
+        * soon.
+        */
+       last_reply_timestamp = 0;
+
        /* build initial snapshot, might take a while */
        DecodingContextFindStartpoint(ctx);
 
@@ -1693,7 +1703,7 @@ WalSndComputeSleeptime(TimestampTz now)
 {
    long        sleeptime = 10000;      /* 10 s */
 
-   if (wal_sender_timeout > 0)
+   if (wal_sender_timeout > 0 && last_reply_timestamp > 0)
    {
        TimestampTz wakeup_time;
        long        sec_to_timeout;
@@ -1735,6 +1745,10 @@ WalSndCheckTimeOut(TimestampTz now)
 {
    TimestampTz timeout;
 
+   /* don't bail out if we're doing something that doesn't require timeouts */
+   if (last_reply_timestamp <= 0)
+       return;
+
    timeout = TimestampTzPlusMilliseconds(last_reply_timestamp,
                                          wal_sender_timeout);
 
@@ -1764,7 +1778,10 @@ WalSndLoop(WalSndSendDataCallback send_data)
    initStringInfo(&reply_message);
    initStringInfo(&tmpbuf);
 
-   /* Initialize the last reply timestamp */
+   /*
+    * Initialize the last reply timestamp. That enables timeout processing
+    * from hereon.
+    */
    last_reply_timestamp = GetCurrentTimestamp();
    waiting_for_ping_response = false;
 
@@ -2879,7 +2896,11 @@ WalSndKeepaliveIfNecessary(TimestampTz now)
 {
    TimestampTz ping_time;
 
-   if (wal_sender_timeout <= 0)
+   /*
+    * Don't send keepalive messages if timeouts are globally disabled or
+    * we're doing something not partaking in timeouts.
+    */
+   if (wal_sender_timeout <= 0 || last_reply_timestamp <= 0)
        return;
 
    if (waiting_for_ping_response)