Fix over-eager ping'ing in logical replication receiver.
authorTom Lane <tgl@sss.pgh.pa.us>
Sat, 5 Sep 2020 00:20:05 +0000 (20:20 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Sat, 5 Sep 2020 00:33:36 +0000 (20:33 -0400)
Commit 3f60f690f only partially fixed the broken-status-tracking
issue in LogicalRepApplyLoop: we need ping_sent to have the same
lifetime as last_recv_timestamp.  The effects are much less serious
than what that commit fixed, though.  AFAICS this would just lead to
extra ping requests being sent, once per second until the sender
responds.  Still, it's a bug, so backpatch to v10 as before.

Discussion: https://postgr.es/m/959627.1599248476@sss.pgh.pa.us

src/backend/replication/logical/worker.c

index 3c7ed80f93412d94120183e21c27bb995dc1ecf1..c37aafed0d29eb5a97d07afa5b131fd411e2181b 100644 (file)
@@ -2060,6 +2060,7 @@ static void
 LogicalRepApplyLoop(XLogRecPtr last_received)
 {
    TimestampTz last_recv_timestamp = GetCurrentTimestamp();
+   bool        ping_sent = false;
 
    /*
     * Init the ApplyMessageContext which we clean up after each replication
@@ -2080,6 +2081,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
    /* mark as idle, before starting to loop */
    pgstat_report_activity(STATE_IDLE, NULL);
 
+   /* This outer loop iterates once per wait. */
    for (;;)
    {
        pgsocket    fd = PGINVALID_SOCKET;
@@ -2087,7 +2089,6 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
        int         len;
        char       *buf = NULL;
        bool        endofstream = false;
-       bool        ping_sent = false;
        long        wait_time;
 
        CHECK_FOR_INTERRUPTS();
@@ -2098,7 +2099,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
 
        if (len != 0)
        {
-           /* Process the data */
+           /* Loop to process all available data (without blocking). */
            for (;;)
            {
                CHECK_FOR_INTERRUPTS();
@@ -2267,10 +2268,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
                    ereport(ERROR,
                            (errmsg("terminating logical replication worker due to timeout")));
 
-               /*
-                * We didn't receive anything new, for half of receiver
-                * replication timeout. Ping the server.
-                */
+               /* Check to see if it's time for a ping. */
                if (!ping_sent)
                {
                    timeout = TimestampTzPlusMilliseconds(last_recv_timestamp,