From 2b6702af60fa033cddc844be63e646aff8db4078 Mon Sep 17 00:00:00 2001 From: Cédric Villemain Date: Mon, 7 Nov 2011 13:51:15 +0100 Subject: Add a check cluster_id This check is responsible to confirm that the Database System Identifier found by pg_controldata is the one expected. warning and critical allowed (like check_postgres_checksum) and must be run on PostgreSQL server (like check_postgres_checkpoint) While here, I created a new function open_controldata which can be used in other places where pg_controldata is used. --- check_postgres.pl | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 126 insertions(+), 2 deletions(-) (limited to 'check_postgres.pl') diff --git a/check_postgres.pl b/check_postgres.pl index 622c5d437..101550dc0 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -99,6 +99,9 @@ our %msg = ( 'bloat-nomin' => q{no relations meet the minimum bloat criteria}, 'bloat-table' => q{(db $1) table $2.$3 rows:$4 pages:$5 shouldbe:$6 ($7X) wasted size:$8 ($9)}, 'bug-report' => q{Please report these details to check_postgres@bucardo.org:}, + 'checkcluster-id' => q{Database system identifier:}, + 'checkcluster-msg' => q{cluster_id: $1}, + 'checkcluster-nomrtg'=> q{Must provide a number via the --mrtg option}, 'checkmode-state' => q{Database cluster state:}, 'checkmode-recovery' => q{in archive recovery}, 'checkpoint-baddir' => q{Invalid data_directory: "$1"}, @@ -108,7 +111,7 @@ our %msg = ( 'checkpoint-nodir' => q{Must supply a --datadir argument or set the PGDATA environment variable}, 'checkpoint-nodp' => q{Must install the Perl module Date::Parse to use the checkpoint action}, 'checkpoint-noparse' => q{Unable to parse pg_controldata output: "$1"}, - 'checkpoint-noregex' => q{Call to pg_controldata $1 failed}, + 'checkpoint-noregex' => q{Unable to find the regex for this check}, 'checkpoint-nosys' => q{Could not call pg_controldata: $1}, 'checkpoint-ok' => q{Last checkpoint was 1 second ago}, 'checkpoint-ok2' => q{Last checkpoint was $1 seconds ago}, @@ -353,7 +356,7 @@ our %msg = ( 'checkpoint-nodir' => q{Vous devez fournir un argument --datadir ou configurer la variable d'environnement PGDATA}, 'checkpoint-nodp' => q{Vous devez installer le module Perl Date::Parse pour utiliser l'action checkpoint}, 'checkpoint-noparse' => q{Incapable d'analyser le résultat de la commande pg_controldata : "$1"}, - 'checkpoint-noregex' => q{Échec de l'appel à pg_controldata $1}, + 'checkpoint-noregex' => q{La regex pour ce test n'a pas été trouvée}, 'checkpoint-nosys' => q{N'a pas pu appeler pg_controldata : $1}, 'checkpoint-ok' => q{Le dernier CHECKPOINT est survenu il y a une seconde}, 'checkpoint-ok2' => q{Le dernier CHECKPOINT est survenu il y a $1 secondes}, @@ -1112,6 +1115,7 @@ our $action_info = { backends => [1, 'Number of connections, compared to max_connections.'], bloat => [0, 'Check for table and index bloat.'], checkpoint => [1, 'Checks how long since the last checkpoint'], + cluster_id => [1, 'Checks the Database System Identifier'], commitratio => [0, 'Report if the commit ratio of a database is too low.'], connection => [0, 'Simple connection check.'], custom_query => [0, 'Run a custom query.'], @@ -1919,6 +1923,9 @@ check_dbstats() if $action eq 'dbstats'; ## Check how long since the last checkpoint check_checkpoint() if $action eq 'checkpoint'; +## Check the Datasbae System Identifier +check_cluster_id() if $action eq 'cluster_id'; + ## Check for disabled triggers check_disabled_triggers() if $action eq 'disabled_triggers'; @@ -3114,6 +3121,50 @@ sub perfname { } ## end of perfname; +sub open_controldata { + ## Requires $ENV{PGDATA} or --datadir + + ## Find the data directory, make sure it exists + my $dir = $opt{datadir} || $ENV{PGDATA}; + + if (!defined $dir or ! length $dir) { + ndie msg('checkpoint-nodir'); + } + + if (! -d $dir) { + ndie msg('checkpoint-baddir', $dir); + } + + ## Run pg_controldata + my $pgc + = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA} + : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata" + : 'pg_controldata'; + $COM = qq{$pgc "$dir"}; + eval { + $res = qx{$COM 2>&1}; + }; + if ($@) { + ndie msg('checkpoint-nosys', $@); + } + + ## If the path is echoed back, we most likely have an invalid data dir + if ($res =~ /$dir/) { + ndie msg('checkpoint-baddir2', $dir); + } + + if ($res =~ /WARNING: Calculated CRC checksum/) { + ndie msg('checkpoint-badver', $dir); + } + if ($res !~ /^pg_control.+\d+/) { + ndie msg('checkpoint-badver2'); + } + + ## return the pg_controldata output + return $res; +} + + sub check_archive_ready { ## Check on the number of WAL archive with status "ready" @@ -3743,6 +3794,58 @@ sub check_checkpoint { } ## end of check_checkpoint +sub check_cluster_id { + + + ## Verify the Database System Identifier provided by pg_controldata + ## Supports: Nagios, MRTG + ## One of warning or critical must be given (but not both) + ## It should run one time to find out the expected cluster-id + ## You can use --critical="0" to find out the current cluster-id + ## You can include or exclude settings as well + ## Example: + ## check_postgres_cluster_id --critical="5633695740047915125" + + my ($warning, $critical) = validate_range({type => 'integer', onlyone => 1}); + + $db->{host} = ''; + + ## Run pg_controldata, grab the cluster-id + $res = open_controldata(); + + my $regex = msg('checkcluster-id'); + if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture) + ## Just in case, check the English one as well + $regex = msg_en('checkcluster-id'); + if ($res !~ /$regex\s*(.+)/) { + ndie msg('checkpoint-noregex'); + } + } + my $ident = $1; + + my $msg = msg('checkcluster-msg', $ident); + if ($MRTG) { + $opt{mrtg} or ndie msg('checksum-nomrtg'); + do_mrtg({one => $opt{mrtg} eq $ident ? 1 : 0, msg => $ident}); + } + if ($critical and $critical ne $ident) { + add_critical $msg; + } + elsif ($warning and $warning ne $ident) { + add_warning $msg; + } + elsif (!$critical and !$warning) { + add_unknown $msg; + } + else { + add_ok $msg; + } + + return; + +} ## end of check_cluster_id + + sub check_commitratio { ## Check the commitratio of one or more databases @@ -8091,6 +8194,27 @@ This action requires the Date::Parse module. For MRTG or simple output, returns the number of seconds. +=head2 B + +(C) Checks that the Database System Identifier +provided by pg_controldata is the same as last time you checked. This must run on the same +server as the database that is being checked (e.g. the -h flag will not work). +Either the I<--warning> or the I<--critical> option should be given, but not both. The value +of each one is the cluster identifier, an integer value. You can run with the special C<--critical=0> option +to find out an existing cluster identifier. + +Example 1: Find the initial identifier + + check_postgres_cluster_id --critical=0 --datadir=/var//lib/postgresql/9.0/main + +Example 2: Make sure the cluster is the same and warn if not, using the result from above. + + check_postgres_cluster_id --critical=5633695740047915135 + +For MRTG output, returns a 1 or 0 indicating success of failure of the identifier to match. A +identifier must be provided as the C<--mrtg> argument. The fourth line always gives the +current identifier. + =head2 B (C) Checks the commit ratio of all databases and complains when they are too low. -- cgit v1.2.3 From 1118e0ab0028fee46a5bdaac9de169068e4fd03f Mon Sep 17 00:00:00 2001 From: Cédric Villemain Date: Mon, 7 Nov 2011 13:57:19 +0100 Subject: Reduce code duplicates around pg_controldata Use the open_controldata where pg_controldata was used previously. Also split the code for make_sure_standby_mode to reduce code for the future option make_sure_prod. --- check_postgres.pl | 91 +++++++++++-------------------------------------------- 1 file changed, 18 insertions(+), 73 deletions(-) (limited to 'check_postgres.pl') diff --git a/check_postgres.pl b/check_postgres.pl index 101550dc0..3b5537e30 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -1521,59 +1521,37 @@ sub do_mrtg_stats { do_mrtg({one => $one, two => $two, msg => $msg}); } -sub make_sure_standby_mode { +sub make_sure_mode_is { - ## Checks if database in standby mode ## Requires $ENV{PGDATA} or --datadir - ## Find the data directory, make sure it exists - my $dir = $opt{datadir} || $ENV{PGDATA}; - - if (!defined $dir or ! length $dir) { - ndie msg('checkpoint-nodir'); - } - - if (! -d $dir) { - ndie msg('checkpoint-baddir', $dir); - } - $db->{host} = ''; ## Run pg_controldata, grab the mode - my $pgc - = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA} - : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata" - : 'pg_controldata'; - $COM = qq{$pgc "$dir"}; - eval { - $res = qx{$COM 2>&1}; - }; - if ($@) { - ndie msg('checkpoint-nosys', $@); - } - - ## If the path is echoed back, we most likely have an invalid data dir - if ($res =~ /$dir/) { - ndie msg('checkpoint-baddir2', $dir); - } - - if ($res =~ /WARNING: Calculated CRC checksum/) { - ndie msg('checkpoint-badver', $dir); - } - if ($res !~ /^pg_control.+\d+/) { - ndie msg('checkpoint-badver2'); - } + $res = open_controldata(); my $regex = msg('checkmode-state'); if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture) ## Just in case, check the English one as well $regex = msg_en('checkmode-state'); if ($res !~ /$regex\s*(.+)/) { - ndie msg('checkpoint-noregex', $dir); + ndie msg('checkpoint-noregex'); } } my $last = $1; - $regex = msg('checkmode-recovery'); + + return $last; + +} + +sub make_sure_standby_mode { + + ## Checks if database in standby mode + ## Requires $ENV{PGDATA} or --datadir + + my $last = make_sure_mode_is(); + + my $regex = msg('checkmode-recovery'); if ($last =~ /$regex/) { $STANDBY = 1; } @@ -3708,50 +3686,17 @@ sub check_checkpoint { forcemrtg => 1, }); - ## Find the data directory, make sure it exists - my $dir = $opt{datadir} || $ENV{PGDATA}; - - if (!defined $dir or ! length $dir) { - ndie msg('checkpoint-nodir'); - } - - if (! -d $dir) { - ndie msg('checkpoint-baddir', $dir); - } - $db->{host} = ''; ## Run pg_controldata, grab the time - my $pgc - = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA} - : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata" - : 'pg_controldata'; - $COM = qq{$pgc "$dir"}; - eval { - $res = qx{$COM 2>&1}; - }; - if ($@) { - ndie msg('checkpoint-nosys', $@); - } - - ## If the path is echoed back, we most likely have an invalid data dir - if ($res =~ /$dir/) { - ndie msg('checkpoint-baddir2', $dir); - } - - if ($res =~ /WARNING: Calculated CRC checksum/) { - ndie msg('checkpoint-badver', $pgc); - } - if ($res !~ /^pg_control.+\d+/) { - ndie msg('checkpoint-badver2'); - } + $res = open_controldata(); my $regex = msg('checkpoint-po'); if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture) ## Just in case, check the English one as well $regex = msg_en('checkpoint-po'); if ($res !~ /$regex\s*(.+)/) { - ndie msg('checkpoint-noregex', $dir); + ndie msg('checkpoint-noregex'); } } my $last = $1; -- cgit v1.2.3 From 0ff408711dab18b05de26656a945fa37e363f6aa Mon Sep 17 00:00:00 2001 From: Cédric Villemain Date: Mon, 7 Nov 2011 14:00:02 +0100 Subject: Add an --assume-prod option This is based on --assume-standby-mode. Reduce the option name per suggestion from Greg (but I kept the original one for standby mode). The option is only used in check_postgres_checkpoint and allows to confirm or emit a critical if the server is not in the expected mode. Note: this can be used in other places, and maybe improved (to reduce the number og open_controldata calls) TODO/FIXME: * I found that --assume-p or --assume-s are viewed by GetOpt like the longer version of the option, a bug ? * The original code to call pg_controldata does not work in French (because of regex/locale). Why not use LANG=C in those checks where there is NO point to use locale and error prone regex ? --- check_postgres.pl | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) (limited to 'check_postgres.pl') diff --git a/check_postgres.pl b/check_postgres.pl index 3b5537e30..c5e5d3037 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -104,6 +104,7 @@ our %msg = ( 'checkcluster-nomrtg'=> q{Must provide a number via the --mrtg option}, 'checkmode-state' => q{Database cluster state:}, 'checkmode-recovery' => q{in archive recovery}, + 'checkmode-prod' => q{in production}, 'checkpoint-baddir' => q{Invalid data_directory: "$1"}, 'checkpoint-baddir2' => q{pg_controldata could not read the given data directory: "$1"}, 'checkpoint-badver' => q{Failed to run pg_controldata - probably the wrong version ($1)}, @@ -163,6 +164,7 @@ our %msg = ( 'logfile-stderr' => q{Logfile output has been redirected to stderr: please provide a filename}, 'logfile-syslog' => q{Database is using syslog, please specify path with --logfile option (fac=$1)}, 'mode-standby' => q{Server in standby mode}, + 'mode' => q{mode}, 'mrtg-fail' => q{Action $1 failed: $2}, 'new-ver-nocver' => q{Could not download version information for $1}, 'new-ver-badver' => q{Could not parse version information for $1}, @@ -934,6 +936,7 @@ GetOptions( 'debugoutput=s', 'no-check_postgresrc', 'assume-standby-mode', + 'assume-prod', 'action=s', 'warning=s', @@ -1206,6 +1209,7 @@ Limit options: Other options: --assume-standby-mode assume that server in continious WAL recovery mode + --assume-prod assume that server in production mode --PSQL=FILE location of the psql executable; avoid using if possible -v, --verbose verbosity level; can be used more than once to increase the level -h, --help display this help information @@ -1252,7 +1256,9 @@ if ($opt{showtime}) { ## Check the current database mode our $STANDBY = 0; +our $MASTER = 0; make_sure_standby_mode() if $opt{'assume-standby-mode'}; +make_sure_prod() if $opt{'assume-prod'}; ## We don't (usually) want to die, but want a graceful Nagios-like exit instead sub ndie { @@ -1560,6 +1566,21 @@ sub make_sure_standby_mode { } ## end of make_sure_standby_mode +sub make_sure_prod { + + ## Checks if database in production mode + ## Requires $ENV{PGDATA} or --datadir + + my $last = make_sure_mode_is(); + + my $regex = msg('checkmode-prod'); + if ($last =~ /$regex/) { + $MASTER = 1; + } + + return; + +} ## end of make_sure_production_mode sub finishup { @@ -3661,7 +3682,6 @@ FROM ( } ## end of check_bloat - sub check_checkpoint { ## Checks how long in seconds since the last checkpoint on a WAL slave @@ -3674,6 +3694,9 @@ sub check_checkpoint { ## may make more sense on the master, or we may want to look at ## the WAL segments received/processed instead of the checkpoint ## timestamp. + ## This checks can use the optionnal --asume-standby-mode or + ## --assume-prod: if the mode found is not the mode assumed, a + ## CRITICAL is emitted. ## Supports: Nagios, MRTG ## Warning and critical are seconds @@ -3718,11 +3741,35 @@ sub check_checkpoint { $db->{perf} = sprintf '%s=%s;%s;%s', perfname(msg('age')), $diff, $warning, $critical; + my $mode = ''; + if ($STANDBY) { + $mode = 'STANDBY'; + } + if ($MASTER) { + $mode = 'MASTER'; + } + + ## If we have an assume flag, then honor it. + my $goodmode = 1; + if ($opt{'assume-standby-mode'} and not $STANDBY) { + $goodmode = 0; + $mode = 'NOT STANDBY'; + } + elsif ($opt{'assume-prod'} and not $MASTER) { + $goodmode = 0; + $mode = 'NOT MASTER'; + } + + if (length($mode) > 0) { + $db->{perf} .= sprintf ' %s=%s', + perfname(msg('mode')), $mode; + } + if ($MRTG) { do_mrtg({one => $diff, msg => $msg}); } - if (length $critical and $diff >= $critical) { + if ((length $critical and $diff >= $critical) or not $goodmode) { add_critical $msg; return; } @@ -7862,6 +7909,16 @@ Example: postgres@db$./check_postgres.pl --action=version --warning=8.1 --datadir /var/lib/postgresql/8.3/main/ --assume-standby-mode POSTGRES_VERSION OK: Server in standby mode | time=0.00 +=item B<--assume-prod> + +If specified, check if server in production mode is performed (--datadir is required). +The option is only relevant for (C). + +Example: + + postgres@db$./check_postgres.pl --action=checkpoint --datadir /var/lib/postgresql/8.3/main/ --assume-prod + POSTGRES_CHECKPOINT OK: Last checkpoint was 72 seconds ago | age=72;;300 mode=MASTER + =item B<-h> or B<--help> Displays a help screen with a summary of all actions and options. @@ -8132,6 +8189,8 @@ was run, as determined by parsing the call to C. Because of this pg_controldata executable must be available in the current path. Alternatively, you can set the environment variable C to the exact location of the pg_controldata executable, or you can specify C as the directory that it lives in. +It is also possible to use the special options I<--assume-prod> or +I<--assume-standby-mode>, if the mode found is not the one expected, a CRITICAL is emitted. At least one warning or critical argument must be set. -- cgit v1.2.3 From 0a4426b7a45e12d5818604c85273fe1f3de5bb38 Mon Sep 17 00:00:00 2001 From: Cédric Villemain Date: Wed, 9 Nov 2011 12:10:40 +0100 Subject: Update history --- check_postgres.pl | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'check_postgres.pl') diff --git a/check_postgres.pl b/check_postgres.pl index c5e5d3037..3539aa6cd 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -9494,6 +9494,12 @@ Items not specifically attributed are by GSM (Greg Sabino Mullane). =over 4 +=item B + + Add the --assume-prod option (Cédric Villemain) + Add the cluster_id check (Cédric Villemain) + Improve settings_checksum and checkpoint tests (Cédric Villemain) + =item B Use the full path when getting sequence information for same_schema. -- cgit v1.2.3