Fix more issues with cascading replication and timeline switches.
author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 23 Jan 2013 08:01:04 +0000 (10:01 +0200)
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 23 Jan 2013 08:19:20 +0000 (10:19 +0200)
When a standby server follows the master using a WAL archive, and it chooses
a new timeline (recovery_target_timeline='latest'), it only fetches the
timeline history file for the chosen target timeline, not any other history
files that might be missing from pg_xlog. For example, if the current
timeline is 2, and we choose 4 as the new recovery target timeline, the
history file for timeline 3 is not fetched, even if it's part of this
server's history. That's enough for the standby itself - the history file
for timeline 4 includes timeline 3 as well - but if a cascading standby
server wants to recover to timeline 3, it needs the history file. To fix,
when a new recovery target timeline is chosen, try to copy any missing
history files from the archive to pg_xlog between the old and new target
timeline.

A second similar issue was with the WAL files. When a standby recovers from
archive, and it reaches a segment that contains a switch to a new timeline,
recovery fetches only the WAL file labelled with the new timeline's ID. The
file from the new timeline contains a copy of the WAL from the old timeline
up to the point where the switch happened, and recovery recovers it from the
new file. But in streaming replication, walsender only tries to read that WAL
from the old timeline's file. To fix, change walsender to read it from the
new file, so that it behaves the same as recovery in that sense, and doesn't
try to open the possibly nonexistent file with the old timeline's ID.

src/backend/access/transam/timeline.c
src/backend/access/transam/xlog.c
src/backend/replication/walsender.c
src/include/access/timeline.h
src/include/replication/walsender_private.h

index ad4f3162c53852140fa1cb80902fcc4851eb206c..51b37ca8f8c2febf5ba4914a3d76c57e0d1ad4db 100644 (file)
 #include "access/xlogdefs.h"
 #include "storage/fd.h"
 
+/*
+ * Copies timeline history files from the archive to pg_xlog.
+ *
+ * Every timeline ID in the half-open range ['begin', 'end') is tried:
+ * 'begin' is included, while 'end' itself is not attempted by this function.
+ * Timeline 1 is always skipped (presumably because the initial timeline has
+ * no history file - TODO confirm).
+ *
+ * For each remaining ID, the history file name is built with
+ * TLHistoryFileName() and handed to RestoreArchivedFile(). Only when that
+ * call reports success is the restored file passed on to
+ * KeepFileRestoredFromArchive(). When it reports failure, this function
+ * simply moves on to the next timeline ID, so the copying is best-effort.
+ */
+void
+restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
+{
+       char            path[MAXPGPATH];
+       char            histfname[MAXFNAMELEN];
+       TimeLineID tli;
+
+       for (tli = begin; tli < end; tli++)
+       {
+               if (tli == 1)
+                       continue;
+
+               TLHistoryFileName(histfname, tli);
+               if (RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false))
+                       KeepFileRestoredFromArchive(path, histfname);
+       }
+}
+
 /*
  * Try to read a timeline's history file.
  *
index 9ad92271795b5904887cf58d5843c5d6c0dcbf17..d316c97926553588bf05716d3ef59d170786b211 100644 (file)
@@ -3276,8 +3276,8 @@ rescanLatestTimeLine(void)
        bool            found;
        ListCell   *cell;
        TimeLineID      newtarget;
+       TimeLineID      oldtarget = recoveryTargetTLI;
        TimeLineHistoryEntry *currentTle = NULL;
-       /* use volatile pointer to prevent code rearrangement */
 
        newtarget = findNewestTimeLine(recoveryTargetTLI);
        if (newtarget == recoveryTargetTLI)
@@ -3336,6 +3336,12 @@ rescanLatestTimeLine(void)
        list_free_deep(expectedTLEs);
        expectedTLEs = newExpectedTLEs;
 
+       /*
+        * As in StartupXLOG(), try to ensure we have all the history files
+        * between the old target and new target in pg_xlog.
+        */
+       restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
+
        ereport(LOG,
                        (errmsg("new target timeline is %u",
                                        recoveryTargetTLI)));
@@ -4993,6 +4999,20 @@ StartupXLOG(void)
         */
        ThisTimeLineID = checkPoint.ThisTimeLineID;
 
+       /*
+        * Copy any missing timeline history files between 'now' and the
+        * recovery target timeline from archive to pg_xlog. While we don't need
+        * those files ourselves - the history file of the recovery target
+        * timeline covers all the previous timelines in the history too - a
+        * cascading standby server might be interested in them. Or, if you
+        * archive the WAL from this server to a different archive than the
+        * master, it'd be good for all the history files to get archived there
+        * after failover, so that you can use one of the old timelines as a
+        * PITR target. Timeline history files are small, so it's better to copy
+        * them unnecessarily than not copy them and regret later.
+        */
+       restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
+
        lastFullPageWrites = checkPoint.fullPageWrites;
 
        RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
index ba138e73da387a6489ed8cbf26fe3950cce9aa60..10e40506965fef3a9b2208b00fbbda26c2fd2e64 100644 (file)
@@ -110,6 +110,9 @@ static int  sendFile = -1;
 static XLogSegNo sendSegNo = 0;
 static uint32 sendOff = 0;
 
+/* Timeline ID of the currently open file */
+static TimeLineID      curFileTimeLine = 0;
+
 /*
  * These variables keep track of the state of the timeline we're currently
  * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
@@ -1201,8 +1204,8 @@ WalSndKill(int code, Datum arg)
  * always be one descriptor left open until the process ends, but never
  * more than one.
  */
-void
-XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
+static void
+XLogRead(char *buf, XLogRecPtr startptr, Size count)
 {
        char       *p;
        XLogRecPtr      recptr;
@@ -1222,7 +1225,7 @@ retry:
 
                startoff = recptr % XLogSegSize;
 
-               if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo) || sendTimeLine != tli)
+               if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
                {
                        char            path[MAXPGPATH];
 
@@ -1230,9 +1233,45 @@ retry:
                        if (sendFile >= 0)
                                close(sendFile);
 
-                       sendTimeLine = tli;
                        XLByteToSeg(recptr, sendSegNo);
-                       XLogFilePath(path, sendTimeLine, sendSegNo);
+
+                       /*-------
+                        * When reading from a historic timeline, and there is a timeline
+                        * switch within this segment, read from the WAL segment belonging
+                        * to the new timeline.
+                        *
+                        * For example, imagine that this server is currently on timeline
+                        * 5, and we're streaming timeline 4. The switch from timeline 4
+                        * to 5 happened at 0/13002088. In pg_xlog, we have these files:
+                        *
+                        * ...
+                        * 000000040000000000000012
+                        * 000000040000000000000013
+                        * 000000050000000000000013
+                        * 000000050000000000000014
+                        * ...
+                        *
+                        * In this situation, when requested to send the WAL from
+                        * segment 0x13, on timeline 4, we read the WAL from file
+                        * 000000050000000000000013. Archive recovery prefers files from
+                        * newer timelines, so if the segment was restored from the
+                        * archive on this server, the file belonging to the old timeline,
+                        * 000000040000000000000013, might not exist. Their contents are
+                        * equal up to the switchpoint, because at a timeline switch, the
+                        * used portion of the old segment is copied to the new file.
+                        *-------
+                        */
+                       curFileTimeLine = sendTimeLine;
+                       if (sendTimeLineIsHistoric)
+                       {
+                               XLogSegNo endSegNo;
+
+                               XLByteToSeg(sendTimeLineValidUpto, endSegNo);
+                               if (sendSegNo == endSegNo)
+                                       curFileTimeLine = sendTimeLineNextTLI;
+                       }
+
+                       XLogFilePath(path, curFileTimeLine, sendSegNo);
 
                        sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
                        if (sendFile < 0)
@@ -1246,7 +1285,7 @@ retry:
                                        ereport(ERROR,
                                                        (errcode_for_file_access(),
                                                         errmsg("requested WAL segment %s has already been removed",
-                                                                       XLogFileNameP(sendTimeLine, sendSegNo))));
+                                                                       XLogFileNameP(curFileTimeLine, sendSegNo))));
                                else
                                        ereport(ERROR,
                                                        (errcode_for_file_access(),
@@ -1263,7 +1302,7 @@ retry:
                                ereport(ERROR,
                                                (errcode_for_file_access(),
                                                 errmsg("could not seek in log segment %s to offset %u: %m",
-                                                               XLogFileNameP(sendTimeLine, sendSegNo),
+                                                               XLogFileNameP(curFileTimeLine, sendSegNo),
                                                                startoff)));
                        sendOff = startoff;
                }
@@ -1280,7 +1319,7 @@ retry:
                        ereport(ERROR,
                                        (errcode_for_file_access(),
                        errmsg("could not read from log segment %s, offset %u, length %lu: %m",
-                                  XLogFileNameP(sendTimeLine, sendSegNo),
+                                  XLogFileNameP(curFileTimeLine, sendSegNo),
                                   sendOff, (unsigned long) segbytes)));
                }
 
@@ -1524,7 +1563,7 @@ XLogSend(bool *caughtup)
         * calls.
         */
        enlargeStringInfo(&output_message, nbytes);
-       XLogRead(&output_message.data[output_message.len], sendTimeLine, startptr, nbytes);
+       XLogRead(&output_message.data[output_message.len], startptr, nbytes);
        output_message.len += nbytes;
        output_message.data[output_message.len] = '\0';
 
index 7d45fcad8a4379f101d98cc69f8348c85936dbb2..2e5e9a42a386d5aebe4e8cc262333280b9ee1e81 100644 (file)
@@ -35,6 +35,7 @@ extern TimeLineID findNewestTimeLine(TimeLineID startTLI);
 extern void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
                                         XLogRecPtr switchpoint, char *reason);
 extern void writeTimeLineHistoryFile(TimeLineID tli, char *content, int size);
+extern void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end);
 extern bool tliInHistory(TimeLineID tli, List *expectedTLIs);
 extern TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history);
 extern XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history,
index 8f479fda7e5b91158dd097007cdd43f8b876975c..7eaa21b9f7e6eb8ce02a96d1d35b0e916356dc9f 100644 (file)
@@ -95,7 +95,6 @@ extern WalSndCtlData *WalSndCtl;
 
 
 extern void WalSndSetState(WalSndState state);
-extern void XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count);
 
 /*
  * Internal functions for parsing the replication grammar, in repl_gram.y and