summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarko Kreen2010-07-27 10:41:59 +0000
committerMarko Kreen2010-07-27 10:41:59 +0000
commitd4bd10081f2632cc95f2990dcf7a8a808d5ccd04 (patch)
tree3ff639b2e5652dfe933d99e2395b158758840df9
parent68a61449ce70ff24789b8fd1eaef5faabd365bf1 (diff)
parentf59838c89d9849df13669c624f247cd90ec37269 (diff)
Merge remote branch 'mpihlak/walmgr-dev'
-rw-r--r--.gitignore6
-rw-r--r--doc/walmgr.txt15
-rw-r--r--python/conf/wal-slave.ini4
-rwxr-xr-xpython/walmgr.py121
4 files changed, 112 insertions, 34 deletions
diff --git a/.gitignore b/.gitignore
index d8791b64..73e55d84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,9 @@ sql/ticker/pgqd
tests/londiste/conf
tests/merge/conf
+build
+doc/londiste.5
+doc/man
+reconfigure.sh
+*.orig
+*.rej
diff --git a/doc/walmgr.txt b/doc/walmgr.txt
index c1390d75..e76be3e8 100644
--- a/doc/walmgr.txt
+++ b/doc/walmgr.txt
@@ -273,6 +273,19 @@ Backups will be named data.master, data.master.0, data.master.1 etc.
Script to execute before rotating away the oldest backup. If it fails
backups will not be rotated.
+
+==== slave_pg_xlog ====
+Set slave_pg_xlog to the directory on the slave where pg_xlog files
+are to be written. On a restore to the slave, walmgr.py creates
+a symbolic link from data/pg_xlog to this location.
+
+
+==== backup_datadir ====
+Set backup_datadir to 'no' to prevent walmgr.py from making a backup
+of the data directory when restoring to the slave; the existing data
+directory is then deleted instead of kept. This defaults to 'yes'.
+
+
=== Sample slave.ini ===
[wal-slave]
@@ -285,4 +298,4 @@ backups will not be rotated.
partial_wals = %(slave)s/logs.partial
full_backup = %(slave)s/data.master
keep_backups = 5
-
+ backup_datadir = yes
diff --git a/python/conf/wal-slave.ini b/python/conf/wal-slave.ini
index 7f60ce6f..66ebd632 100644
--- a/python/conf/wal-slave.ini
+++ b/python/conf/wal-slave.ini
@@ -9,11 +9,15 @@ slave_stop_cmd = /etc/init.d/postgresql-8.3 stop
slave_start_cmd = /etc/init.d/postgresql-8.3 start
slave_config_dir = /etc/postgresql/8.3/main
+# alternative pg_xlog directory for slave; on restore, data/pg_xlog becomes a symlink to it
+#slave_pg_xlog = /vol2/pg_xlog
+
slave = /var/lib/postgresql/walshipping
completed_wals = %(slave)s/logs.complete
partial_wals = %(slave)s/logs.partial
full_backup = %(slave)s/data.master
config_backup = %(slave)s/config.backup
+backup_datadir = yes
keep_backups = 0
archive_command =
diff --git a/python/walmgr.py b/python/walmgr.py
index 4aa7a119..fe7263b7 100755
--- a/python/walmgr.py
+++ b/python/walmgr.py
@@ -38,7 +38,7 @@ Switches:
"""
import os, sys, re, signal, time, traceback
-import errno, glob, ConfigParser, shutil
+import errno, glob, ConfigParser, shutil, subprocess
import pkgloader
pkgloader.require('skytools', '3.0')
@@ -109,19 +109,27 @@ class WalChunk:
class PgControlData:
"""Contents of pg_controldata"""
- def __init__(self, slave_bin, slave_data, findRestartPoint):
+ def __init__(self, bin_dir, data_dir, findRestartPoint):
"""Collect last checkpoint information from pg_controldata output"""
self.xlogid = None
self.xrecoff = None
self.timeline = None
self.wal_size = None
self.wal_name = None
- self.is_valid = False
+ self.cluster_state = None
+ self.is_shutdown = False
self.pg_version = 0
- matches = 0
- pg_controldata = "%s %s" % (os.path.join(slave_bin, "pg_controldata"), slave_data)
+ self.is_valid = False
- for line in os.popen(pg_controldata, "r"):
+ try:
+ pg_controldata = os.path.join(bin_dir, "pg_controldata")
+ pipe = subprocess.Popen([ pg_controldata, data_dir ], stdout=subprocess.PIPE)
+ except OSError:
+ # don't complain if we cannot execute it
+ return
+
+ matches = 0
+ for line in pipe.stdout.readlines():
if findRestartPoint:
m = re.match("^Latest checkpoint's REDO location:\s+([0-9A-F]+)/([0-9A-F]+)", line)
else:
@@ -142,8 +150,14 @@ class PgControlData:
if m:
matches += 1
self.pg_version = int(m.group(1))
+ m = re.match("^Database cluster state:\s+(.*$)", line)
+ if m:
+ matches += 1
+ self.cluster_state = m.group(1)
+ self.is_shutdown = (self.cluster_state == "shut down")
- if matches == 4:
+ # ran successfully and we got our needed matches
+ if pipe.wait() == 0 and matches == 5:
self.wal_name = "%08X%08X%08X" % \
(self.timeline, self.xlogid, self.xrecoff / self.wal_size)
self.is_valid = True
@@ -247,6 +261,9 @@ class WalMgr(skytools.DBScript):
def __init__(self, args):
+ if len(args) == 1 and args[0] == '--version':
+ skytools.DBScript.__init__(self, 'wal-master', args)
+
if len(args) < 2:
# need at least config file and command
usage(1)
@@ -1010,6 +1027,7 @@ STOP TIME: %(stop_time)s
use_xlog_functions = self.cf.getint("use_xlog_functions", False)
data_dir = self.cf.get("master_data")
xlog_dir = os.path.join(data_dir, "pg_xlog")
+ master_bin = self.cf.get("master_bin", "")
dst_loc = os.path.join(self.cf.get("partial_wals"), "")
@@ -1052,6 +1070,22 @@ STOP TIME: %(stop_time)s
else:
self.log.info("last complete not found, copying all")
+ # obtain the last checkpoint wal name, this can be used for
+ # limiting the amount of WAL files to copy if the database
+ # has been cleanly shut down
+ ctl = PgControlData(master_bin, data_dir, False)
+ checkpoint_wal = None
+ if ctl.is_valid:
+ if not ctl.is_shutdown:
+ # cannot rely on the checkpoint wal, should use some other method
+ self.log.info("Database state is not 'shut down', copying all")
+ else:
+ # ok, the database is shut down, we can use last checkpoint wal
+ checkpoint_wal = ctl.wal_name
+ self.log.info("last checkpoint wal: %s" % checkpoint_wal)
+ else:
+ self.log.info("Unable to obtain control file information, copying all")
+
for fn in files:
# check if interesting file
if len(fn) < 10:
@@ -1060,7 +1094,7 @@ STOP TIME: %(stop_time)s
continue
if fn.find(".") > 0:
continue
- # check if to old
+ # check if too old
if last:
dot = last.find(".")
if dot > 0:
@@ -1070,6 +1104,9 @@ STOP TIME: %(stop_time)s
else:
if fn <= last:
continue
+ # check if too new
+ if checkpoint_wal and fn > checkpoint_wal:
+ continue
# got interesting WAL
xlog = os.path.join(xlog_dir, fn)
@@ -1197,12 +1234,6 @@ STOP TIME: %(stop_time)s
self.log.debug("%s: copy done, cleanup" % srcname)
self.slave_cleanup(lstname)
- if os.path.isfile(partfile) and not srcfile == partfile:
- # Remove any partial files after restore. Only leave the partial if
- # it is actually used in recovery.
- self.log.debug("%s: removing partial not anymore needed for recovery." % partfile)
- os.remove(partfile)
-
# create a PROGRESS file to notify that postgres is processing the WAL
open(prgrfile, "w").write("1")
@@ -1215,7 +1246,7 @@ STOP TIME: %(stop_time)s
If setname is specified, the contents of that backup set directory are
restored instead of "full_backup". Also copy is used instead of rename to
- restore the directory.
+ restore the directory (unless a pg_xlog directory has been specified).
Restore to altdst if specified. Complain if it exists.
"""
@@ -1274,9 +1305,15 @@ STOP TIME: %(stop_time)s
# nothing to back up
createbackup = False
- if not setname and os.path.isdir(data_dir):
- # compatibility mode - restore without a set name and data directory
- # already exists. Move it out of the way.
+ # see if we have to make a backup of the data directory
+ backup_datadir = self.cf.getboolean('backup_datadir', True)
+
+ if os.path.isdir(data_dir) and not backup_datadir:
+ self.log.warning('backup_datadir is disabled, deleting old data dir')
+ shutil.rmtree(data_dir)
+
+ if not setname and os.path.isdir(data_dir) and backup_datadir:
+ # compatibility mode - restore without a set name and data directory exists
self.log.warning("Data directory already exists, moving it out of the way.")
createbackup = True
@@ -1288,20 +1325,38 @@ STOP TIME: %(stop_time)s
# move new data, copy if setname specified
self.log.info("%s %s to %s" % (setname and "Copy" or "Move", full_dir, data_dir))
+
+ if self.cf.get('slave_pg_xlog', ''):
+ link_xlog_dir = True
+ exclude_pg_xlog = '--exclude=pg_xlog'
+ else:
+ link_xlog_dir = False
+ exclude_pg_xlog = ''
+
if not self.not_really:
- if not setname:
+ if not setname and not link_xlog_dir:
os.rename(full_dir, data_dir)
else:
- self.exec_rsync(["--delete", "--no-relative", "--exclude=pg_xlog/*",
- os.path.join(full_dir,""), data_dir], True)
- if self.wtype == MASTER and createbackup and os.path.isdir(bak):
+ rsync_args=["--delete", "--no-relative", "--exclude=pg_xlog/*"]
+ if exclude_pg_xlog:
+ rsync_args.append(exclude_pg_xlog)
+ rsync_args += [os.path.join(full_dir, ""), data_dir]
+
+ self.exec_rsync(rsync_args, True)
+
+ if link_xlog_dir:
+ os.symlink(self.cf.get('slave_pg_xlog'), "%s/pg_xlog" % data_dir)
+
+ if (self.wtype == MASTER and createbackup and os.path.isdir(bak)):
# restore original xlog files to data_dir/pg_xlog
- # symlinked directories are dereferences
- self.exec_cmd(["cp", "-rL", "%s/pg_xlog" % bak, data_dir])
+ # symlinked directories are dereferenced
+ self.exec_cmd(["cp", "-rL", "%s/pg_xlog/" % full_dir, "%s/pg_xlog" % data_dir ])
else:
# create an archive_status directory
xlog_dir = os.path.join(data_dir, "pg_xlog")
- os.mkdir(os.path.join(xlog_dir, "archive_status"), 0700)
+ archive_path = os.path.join(xlog_dir, "archive_status")
+ if not os.path.exists(archive_path):
+ os.mkdir(archive_path, 0700)
else:
data_dir = full_dir
@@ -1653,14 +1708,14 @@ restore_command = '%s %s %s'
partial_wals = self.cf.get("partial_wals")
self.log.debug("cleaning completed wals before %s" % last_applied)
- last = self.del_wals(completed_wals, last_applied)
- if last:
- if os.path.isdir(partial_wals):
- self.log.debug("cleaning partial wals before %s" % last)
- self.del_wals(partial_wals, last)
- else:
- self.log.warning("partial_wals dir does not exist: %s"
- % partial_wals)
+ self.del_wals(completed_wals, last_applied)
+
+ if os.path.isdir(partial_wals):
+ self.log.debug("cleaning partial wals before %s" % last_applied)
+ self.del_wals(partial_wals, last_applied)
+ else:
+ self.log.warning("partial_wals dir does not exist: %s" % partial_wals)
+
self.log.debug("cleaning done")
def del_wals(self, path, last):