#include "datapagemap.h"
#include "file_ops.h"
#include "filemap.h"
+#include "lib/stringinfo.h"
#include "pg_rewind.h"
#include "port/pg_bswap.h"
#include "rewind_source.h"
/*
- * Files are fetched max CHUNKSIZE bytes at a time.
- *
- * (This only applies to files that are copied in whole, or for truncated
- * files where we copy the tail. Relation files, where we know the individual
- * blocks that need to be fetched, are fetched in BLCKSZ chunks.)
+ * Files are fetched at most MAX_CHUNK_SIZE bytes at a time, with a maximum
+ * of MAX_CHUNKS_PER_QUERY chunks in a single query.
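+ * A single query therefore returns at most MAX_CHUNKS_PER_QUERY *
+ * MAX_CHUNK_SIZE bytes of file data, i.e. roughly 1 GB per round trip.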
*/
-#define CHUNKSIZE 1000000
+#define MAX_CHUNK_SIZE (1024 * 1024)
+#define MAX_CHUNKS_PER_QUERY 1000
+
+/* represents a request to fetch a piece of a file from the source */
+typedef struct
+{
+ const char *path; /* path relative to data directory root */
+ off_t offset;
+ size_t length;
+} fetch_range_request;
typedef struct
{
rewind_source common; /* common interface functions */
PGconn *conn;
- bool copy_started;
+
+ /*
+ * Queue of chunks that have been requested with the queue_fetch_range()
+ * function, but have not been fetched from the remote server yet.
+ */
+ int num_requests;
+ fetch_range_request request_queue[MAX_CHUNKS_PER_QUERY];
+
+ /* temporary space for process_queued_fetch_requests() */
+ StringInfoData paths;
+ StringInfoData offsets;
+ StringInfoData lengths;
} libpq_source;
static void init_libpq_conn(PGconn *conn);
static char *run_simple_query(PGconn *conn, const char *sql);
static void run_simple_command(PGconn *conn, const char *sql);
+static void appendArrayEscapedString(StringInfo buf, const char *str);
+
+static void process_queued_fetch_requests(libpq_source *src);
/* public interface functions */
static void libpq_traverse_files(rewind_source *source,
src->conn = conn;
+ initStringInfo(&src->paths);
+ initStringInfo(&src->offsets);
+ initStringInfo(&src->lengths);
+
return &src->common;
}
run_simple_command(conn, "SET lock_timeout = 0");
run_simple_command(conn, "SET idle_in_transaction_session_timeout = 0");
+ /*
+ * We don't intend to do any updates, so put the connection in read-only
+ * mode to keep us honest.
+ */
+ run_simple_command(conn, "SET default_transaction_read_only = on");
+
/* secure search_path */
res = PQexec(conn, ALWAYS_SECURE_SEARCH_PATH_SQL);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
PQresultErrorMessage(res));
PQclear(res);
- /*
- * Check that the server is not in hot standby mode. There is no
- * fundamental reason that couldn't be made to work, but it doesn't
- * currently because we use a temporary table. Better to check for it
- * explicitly than error out, for a better error message.
- */
- str = run_simple_query(conn, "SELECT pg_is_in_recovery()");
- if (strcmp(str, "f") != 0)
- pg_fatal("source server must not be in recovery mode");
- pg_free(str);
-
/*
* Also check that full_page_writes is enabled. We can get torn pages if
* a page is modified while we read it with pg_read_binary_file(), and we
if (strcmp(str, "on") != 0)
pg_fatal("full_page_writes must be enabled in the source server");
pg_free(str);
+
+ /* Prepare a statement we'll use to fetch files */
+ res = PQprepare(conn, "fetch_chunks_stmt",
+ "SELECT path, begin,\n"
+ " pg_read_binary_file(path, begin, len, true) AS chunk\n"
+ "FROM unnest ($1::text[], $2::int8[], $3::int4[]) as x(path, begin, len)",
+ 3, NULL);
+
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ pg_fatal("could not prepare statement to fetch file contents: %s",
+ PQresultErrorMessage(res));
+ PQclear(res);
}
/*
size_t len)
{
libpq_source *src = (libpq_source *) source;
- uint64 begin = off;
- uint64 end = off + len;
/*
- * On first call, create a temporary table, and start COPYing to it.
- * We will load it with the list of blocks that we need to fetch.
+ * Does this request happen to be a continuation of the previous chunk? If
+ * so, merge it with the previous one.
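+ * For example, per-block (BLCKSZ-sized) requests for consecutive blocks of
+ * the same relation file get coalesced here into one larger chunk.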
+ *
+ * XXX: We use pointer equality to compare the path. That's good enough
+ * for our purposes; the caller always passes the same pointer for the
+ * same filename. If it didn't, we would fail to merge requests, but it
+ * wouldn't affect correctness.
*/
- if (!src->copy_started)
+ if (src->num_requests > 0)
{
- PGresult *res;
+ fetch_range_request *prev = &src->request_queue[src->num_requests - 1];
- run_simple_command(src->conn, "CREATE TEMPORARY TABLE fetchchunks(path text, begin int8, len int4)");
+ if (prev->offset + prev->length == off &&
+ prev->length < MAX_CHUNK_SIZE &&
+ prev->path == path)
+ {
+ /*
+ * Extend the previous request to cover as much of this new
+ * request as possible, without exceeding MAX_CHUNK_SIZE.
+ */
+ size_t thislen;
- res = PQexec(src->conn, "COPY fetchchunks FROM STDIN");
- if (PQresultStatus(res) != PGRES_COPY_IN)
- pg_fatal("could not send file list: %s",
- PQresultErrorMessage(res));
- PQclear(res);
+ thislen = Min(len, MAX_CHUNK_SIZE - prev->length);
+ prev->length += thislen;
- src->copy_started = true;
- }
+ off += thislen;
+ len -= thislen;
- /*
- * Write the file range to a temporary table in the server.
- *
- * The range is sent to the server as a COPY formatted line, to be inserted
- * into the 'fetchchunks' temporary table. The libpq_finish_fetch() uses
- * the temporary table to actually fetch the data.
- */
+ /*
+ * Fall through to create new requests for any remaining 'len'
+ * that didn't fit in the previous chunk.
+ */
+ }
+ }
- /* Split the range into CHUNKSIZE chunks */
- while (end - begin > 0)
+ /* Divide the request into pieces of MAX_CHUNK_SIZE bytes each */
+ while (len > 0)
{
- char linebuf[MAXPGPATH + 23];
- unsigned int len;
-
- /* Fine as long as CHUNKSIZE is not bigger than UINT32_MAX */
- if (end - begin > CHUNKSIZE)
- len = CHUNKSIZE;
- else
- len = (unsigned int) (end - begin);
+ int32 thislen;
- snprintf(linebuf, sizeof(linebuf), "%s\t" UINT64_FORMAT "\t%u\n", path, begin, len);
+ /* if the queue is full, perform all the work queued up so far */
+ if (src->num_requests == MAX_CHUNKS_PER_QUERY)
+ process_queued_fetch_requests(src);
- if (PQputCopyData(src->conn, linebuf, strlen(linebuf)) != 1)
- pg_fatal("could not send COPY data: %s",
- PQerrorMessage(src->conn));
+ thislen = Min(len, MAX_CHUNK_SIZE);
+ src->request_queue[src->num_requests].path = path;
+ src->request_queue[src->num_requests].offset = off;
+ src->request_queue[src->num_requests].length = thislen;
+ src->num_requests++;
- begin += len;
+ off += thislen;
+ len -= thislen;
}
}
/*
- * Receive all the queued chunks and write them to the target data directory.
+ * Fetch all the queued chunks and write them to the target data directory.
*/
static void
libpq_finish_fetch(rewind_source *source)
{
- libpq_source *src = (libpq_source *) source;
+ process_queued_fetch_requests((libpq_source *) source);
+}
+
+static void
+process_queued_fetch_requests(libpq_source *src)
+{
+ const char *params[3];
PGresult *res;
- const char *sql;
+ int chunkno;
- if (PQputCopyEnd(src->conn, NULL) != 1)
- pg_fatal("could not send end-of-COPY: %s",
- PQerrorMessage(src->conn));
+ if (src->num_requests == 0)
+ return;
- while ((res = PQgetResult(src->conn)) != NULL)
+ pg_log_debug("getting %d file chunks", src->num_requests);
+
+ /*
+ * The prepared statement, 'fetch_chunks_stmt', takes three equal-length
+ * arrays as parameters: paths, offsets and lengths. Construct the string
+ * representations of them.
+ */
+ resetStringInfo(&src->paths);
+ resetStringInfo(&src->offsets);
+ resetStringInfo(&src->lengths);
+
+ appendStringInfoChar(&src->paths, '{');
+ appendStringInfoChar(&src->offsets, '{');
+ appendStringInfoChar(&src->lengths, '{');
+ for (int i = 0; i < src->num_requests; i++)
{
- if (PQresultStatus(res) != PGRES_COMMAND_OK)
- pg_fatal("unexpected result while sending file list: %s",
- PQresultErrorMessage(res));
- PQclear(res);
+ fetch_range_request *rq = &src->request_queue[i];
+
+ if (i > 0)
+ {
+ appendStringInfoChar(&src->paths, ',');
+ appendStringInfoChar(&src->offsets, ',');
+ appendStringInfoChar(&src->lengths, ',');
+ }
+
+ appendArrayEscapedString(&src->paths, rq->path);
+ appendStringInfo(&src->offsets, INT64_FORMAT, (int64) rq->offset);
+ appendStringInfo(&src->lengths, INT64_FORMAT, (int64) rq->length);
}
+ appendStringInfoChar(&src->paths, '}');
+ appendStringInfoChar(&src->offsets, '}');
+ appendStringInfoChar(&src->lengths, '}');
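+
+ /*
+ * For instance, with two queued chunks of a large file and one small file
+ * (hypothetical paths and sizes), the parameters would look like:
+ *
+ *   paths:   {"base/1/16384","base/1/16384","PG_VERSION"}
+ *   offsets: {0,1048576,0}
+ *   lengths: {1048576,1048576,4}
+ */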
/*
- * We've now copied the list of file ranges that we need to fetch to the
- * temporary table. Now, actually fetch all of those ranges.
+ * Execute the prepared statement.
*/
- sql =
- "SELECT path, begin,\n"
- " pg_read_binary_file(path, begin, len, true) AS chunk\n"
- "FROM fetchchunks\n";
+ params[0] = src->paths.data;
+ params[1] = src->offsets.data;
+ params[2] = src->lengths.data;
- if (PQsendQueryParams(src->conn, sql, 0, NULL, NULL, NULL, NULL, 1) != 1)
+ if (PQsendQueryPrepared(src->conn, "fetch_chunks_stmt", 3, params, NULL, NULL, 1) != 1)
pg_fatal("could not send query: %s", PQerrorMessage(src->conn));
- pg_log_debug("getting file chunks");
-
if (PQsetSingleRowMode(src->conn) != 1)
pg_fatal("could not set libpq connection to single row mode");
* chunk bytea -- file content
*----
*/
+ chunkno = 0;
while ((res = PQgetResult(src->conn)) != NULL)
{
+ fetch_range_request *rq = &src->request_queue[chunkno];
char *filename;
int filenamelen;
int64 chunkoff;
PQresultErrorMessage(res));
}
+ if (chunkno >= src->num_requests)
+ pg_fatal("received more data chunks than requested");
+
/* sanity check the result set */
if (PQnfields(res) != 3 || PQntuples(res) != 1)
pg_fatal("unexpected result set size while fetching remote files");
* If a file has been deleted on the source, remove it on the target
* as well. Note that multiple unlink() calls may happen on the same
* file if multiple data chunks are associated with it, hence ignore
- * unconditionally anything missing. If this file is not a relation
- * data file, then it has been already truncated when creating the
- * file chunk list at the previous execution of the filemap.
+ * unconditionally anything missing.
*/
if (PQgetisnull(res, 0, 2))
{
pg_log_debug("received null value for chunk for file \"%s\", file has been deleted",
filename);
remove_target_file(filename, true);
- pg_free(filename);
- PQclear(res);
- continue;
}
+ else
+ {
+ pg_log_debug("received chunk for file \"%s\", offset " INT64_FORMAT ", size %d",
+ filename, chunkoff, chunksize);
+
+ if (strcmp(filename, rq->path) != 0)
+ {
+ pg_fatal("received data for file \"%s\", when requested for \"%s\"",
+ filename, rq->path);
+ }
+ if (chunkoff != rq->offset)
+ pg_fatal("received data at offset " INT64_FORMAT " of file \"%s\", when requested for offset " INT64_FORMAT,
+ chunkoff, rq->path, (int64) rq->offset);
- pg_log_debug("received chunk for file \"%s\", offset " INT64_FORMAT ", size %d",
- filename, chunkoff, chunksize);
+ /*
+ * We should not receive more data than we requested, or
+ * pg_read_binary_file() messed up. We could receive less,
+ * though, if the file was truncated in the source after we
+ * checked its size. That's OK, there should be a WAL record of
+ * the truncation, which will get replayed when you start the
+ * target system for the first time after pg_rewind has completed.
+ */
+ if (chunksize > rq->length)
+ pg_fatal("received more than requested for file \"%s\"", rq->path);
- open_target_file(filename, false);
+ open_target_file(filename, false);
- write_target_range(chunk, chunkoff, chunksize);
+ write_target_range(chunk, chunkoff, chunksize);
+ }
pg_free(filename);
PQclear(res);
+ chunkno++;
}
+ if (chunkno != src->num_requests)
+ pg_fatal("unexpected number of data chunks received");
+
+ src->num_requests = 0;
+}
+
+/*
+ * Escape a string to be used as an element in a text array constant
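+ * (e.g. a hypothetical path like foo"bar is emitted as "foo\"bar")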
+ */
+static void
+appendArrayEscapedString(StringInfo buf, const char *str)
+{
+ appendStringInfoCharMacro(buf, '\"');
+ while (*str)
+ {
+ char ch = *str;
+
+ if (ch == '"' || ch == '\\')
+ appendStringInfoCharMacro(buf, '\\');
+
+ appendStringInfoCharMacro(buf, ch);
+
+ str++;
+ }
+ appendStringInfoCharMacro(buf, '\"');
}
/*
static void
libpq_destroy(rewind_source *source)
{
- pfree(source);
+ libpq_source *src = (libpq_source *) source;
+
+ pfree(src->paths.data);
+ pfree(src->offsets.data);
+ pfree(src->lengths.data);
+ pfree(src);
+
/* NOTE: we don't close the connection here, as it was not opened by us. */
}
static ControlFileData ControlFile_target;
static ControlFileData ControlFile_source;
+static ControlFileData ControlFile_source_after;
const char *progname;
int WalSegSz;
XLogRecPtr endrec;
TimeLineID endtli;
ControlFileData ControlFile_new;
+ size_t size;
+ char *buffer;
/*
* Execute the actions in the file map, fetching data from the source
}
}
- /*
- * We've now copied the list of file ranges that we need to fetch to the
- * temporary table. Now, actually fetch all of those ranges.
- */
+ /* Complete any remaining range-fetches that we queued up above. */
source->finish_fetch(source);
close_target_file();
progress_report(true);
+ /*
+ * Fetch the control file from the source last. This ensures that the
+ * minRecoveryPoint is up-to-date.
+ */
+ buffer = source->fetch_file(source, "global/pg_control", &size);
+ digestControlFile(&ControlFile_source_after, buffer, size);
+ pg_free(buffer);
+
+ /*
+ * Sanity check: If the source is a local system, the control file should
+ * not have changed since we started.
+ *
+ * XXX: We assume it hasn't been modified, but actually, what could go
+ * wrong? The logic handles a libpq source that's modified concurrently,
+ * so why not a local datadir?
+ */
+ if (datadir_source &&
+ memcmp(&ControlFile_source, &ControlFile_source_after,
+ sizeof(ControlFileData)) != 0)
+ {
+ pg_fatal("source system was modified while pg_rewind was running");
+ }
+
if (showprogress)
pg_log_info("creating backup label and updating control file");
- createBackupLabel(chkptredo, chkpttli, chkptrec);
/*
- * Update control file of target. Make it ready to perform archive
- * recovery when restarting.
+ * Create a backup label file, to tell the target where to begin the WAL
+ * replay. Normally, from the last common checkpoint between the source
+ * and the target. But if the source is a standby server, it's possible
+ * that the last common checkpoint is *after* the standby's restartpoint.
+ * That implies that the source server has applied the checkpoint record,
+ * but hasn't performed a corresponding restartpoint yet. Make sure we
+ * start at the restartpoint's redo point in that case.
*
- * Like in an online backup, it's important that we replay all the WAL
- * that was generated while we copied the files over. To enforce that, set
- * 'minRecoveryPoint' in the control file.
+ * Use the old version of the source's control file for this. The server
+ * might have finished the restartpoint after we started copying files,
+ * but we must begin from the redo point at the time we started copying.
*/
- memcpy(&ControlFile_new, &ControlFile_source, sizeof(ControlFileData));
+ if (ControlFile_source.checkPointCopy.redo < chkptredo)
+ {
+ chkptredo = ControlFile_source.checkPointCopy.redo;
+ chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+ chkptrec = ControlFile_source.checkPoint;
+ }
+ createBackupLabel(chkptredo, chkpttli, chkptrec);
+ /*
+ * Update control file of target, to tell the target how far it must
+ * replay the WAL (minRecoveryPoint).
+ */
if (connstr_source)
{
- endrec = source->get_current_wal_insert_lsn(source);
- endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+ /*
+ * The source is a live server. Like in an online backup, it's
+ * important that we recover all the WAL that was generated while we
+ * were copying files.
+ */
+ if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY)
+ {
+ /*
+ * Source is a standby server. We must replay to its
+ * minRecoveryPoint.
+ */
+ endrec = ControlFile_source_after.minRecoveryPoint;
+ endtli = ControlFile_source_after.minRecoveryPointTLI;
+ }
+ else
+ {
+ /*
+ * Source is a production, non-standby, server. We must replay to
+ * the last WAL insert location.
+ */
+ if (ControlFile_source_after.state != DB_IN_PRODUCTION)
+ pg_fatal("source system was in unexpected state at end of rewind");
+
+ endrec = source->get_current_wal_insert_lsn(source);
+ endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+ }
}
else
{
- endrec = ControlFile_source.checkPoint;
- endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+ /*
+ * Source is a local data directory. It should've shut down cleanly,
+ * and we must replay to the latest shutdown checkpoint.
+ */
+ endrec = ControlFile_source_after.checkPoint;
+ endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
}
+
+ memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
ControlFile_new.minRecoveryPoint = endrec;
ControlFile_new.minRecoveryPointTLI = endtli;
ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
--- /dev/null
+#
+# Test using a standby server as the source.
+#
+# This sets up three nodes: A, B and C. First, A is the primary,
+# B follows A, and C follows B:
+#
+# A (primary) <--- B (standby) <--- C (standby)
+#
+#
+# Then we promote C, and insert some divergent rows in A and C:
+#
+# A (primary) <--- B (standby) C (primary)
+#
+#
+# Finally, we run pg_rewind on C, to re-point it at B again:
+#
+# A (primary) <--- B (standby) <--- C (standby)
+#
+#
+# The test is similar to the basic tests, but since we're dealing with
+# three nodes, not two, we cannot use most of the RewindTest functions
+# as is.
+
+use strict;
+use warnings;
+use TestLib;
+use Test::More tests => 3;
+
+use FindBin;
+use lib $FindBin::RealBin;
+use File::Copy;
+use PostgresNode;
+use RewindTest;
+
+my $tmp_folder = TestLib::tempdir;
+
+my $node_a;
+my $node_b;
+my $node_c;
+
+# Set up node A, as primary
+#
+# A (primary)
+
+setup_cluster('a');
+start_primary();
+$node_a = $node_primary;
+
+# Create a test table and insert a row in primary.
+$node_a->safe_psql('postgres', "CREATE TABLE tbl1 (d text)");
+$node_a->safe_psql('postgres', "INSERT INTO tbl1 VALUES ('in A')");
+primary_psql("CHECKPOINT");
+
+# Set up node B and C, as cascaded standbys
+#
+# A (primary) <--- B (standby) <--- C (standby)
+$node_a->backup('my_backup');
+$node_b = get_new_node('node_b');
+$node_b->init_from_backup($node_a, 'my_backup', has_streaming => 1);
+$node_b->set_standby_mode();
+$node_b->start;
+
+$node_b->backup('my_backup');
+$node_c = get_new_node('node_c');
+$node_c->init_from_backup($node_b, 'my_backup', has_streaming => 1);
+$node_c->set_standby_mode();
+$node_c->start;
+
+# Insert additional data on A, and wait for both standbys to catch up.
+$node_a->safe_psql('postgres',
+ "INSERT INTO tbl1 values ('in A, before promotion')");
+$node_a->safe_psql('postgres', 'CHECKPOINT');
+
+my $lsn = $node_a->lsn('insert');
+$node_a->wait_for_catchup('node_b', 'write', $lsn);
+$node_b->wait_for_catchup('node_c', 'write', $lsn);
+
+# Promote C
+#
+# A (primary) <--- B (standby) C (primary)
+
+$node_c->promote;
+$node_c->safe_psql('postgres', "checkpoint");
+
+
+# Insert a row in A. This causes A/B and C to have "diverged", so that it's
+# no longer possible to just apply the standby's logs over the primary's
+# data directory - you need to rewind.
+$node_a->safe_psql('postgres',
+ "INSERT INTO tbl1 VALUES ('in A, after C was promoted')");
+
+# Also insert a new row in the standby, which won't be present in the
+# old primary.
+$node_c->safe_psql('postgres',
+ "INSERT INTO tbl1 VALUES ('in C, after C was promoted')");
+
+
+#
+# All set up. We're ready to run pg_rewind.
+#
+my $node_c_pgdata = $node_c->data_dir;
+
+# Stop the node and be ready to perform the rewind.
+$node_c->stop('fast');
+
+# Keep a temporary copy of postgresql.conf, as it would be overwritten during
+# the rewind.
+copy(
+ "$node_c_pgdata/postgresql.conf",
+ "$tmp_folder/node_c-postgresql.conf.tmp");
+
+{
+ # Temporarily unset PGAPPNAME so that the server doesn't
+ # inherit it. Otherwise this could affect libpqwalreceiver
+ # connections in confusing ways.
+ local %ENV = %ENV;
+ delete $ENV{PGAPPNAME};
+
+ # Do rewind using a remote connection as source, generating
+ # recovery configuration automatically.
+ command_ok(
+ [
+ 'pg_rewind', "--debug",
+ "--source-server", $node_b->connstr('postgres'),
+ "--target-pgdata=$node_c_pgdata", "--no-sync",
+ "--write-recovery-conf"
+ ],
+ 'pg_rewind remote');
+}
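+
+# --write-recovery-conf created standby.signal and set primary_conninfo to
+# point at B, so once restarted below, C will follow B again.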
+
+# Now move back postgresql.conf with old settings
+move(
+ "$tmp_folder/node_c-postgresql.conf.tmp",
+ "$node_c_pgdata/postgresql.conf");
+
+# Restart the node.
+$node_c->start;
+
+# Set RewindTest::node_primary to point to the rewound node, so that we can
+# use check_query()
+$node_primary = $node_c;
+
+# Run some checks to verify that C has been successfully rewound,
+# and connected back to follow B.
+
+check_query(
+ 'SELECT * FROM tbl1',
+ qq(in A
+in A, before promotion
+in A, after C was promoted
+),
+ 'table content after rewind');
+
+# Insert another row, and observe that it's cascaded from A to B to C.
+$node_a->safe_psql('postgres',
+ "INSERT INTO tbl1 values ('in A, after rewind')");
+
+$lsn = $node_a->lsn('insert');
+$node_b->wait_for_catchup('node_c', 'write', $lsn);
+
+check_query(
+ 'SELECT * FROM tbl1',
+ qq(in A
+in A, before promotion
+in A, after C was promoted
+in A, after rewind
+),
+ 'table content after rewind and insert');
+
+# clean up
+$node_a->teardown_node;
+$node_b->teardown_node;
+$node_c->teardown_node;
+
+exit(0);