diff options
author | Greg Sabino Mullane | 2011-11-23 14:16:17 +0000 |
---|---|---|
committer | Greg Sabino Mullane | 2011-11-23 14:16:17 +0000 |
commit | 5cc0f19de74925d624b9ab3c51c8b034e9d66057 (patch) | |
tree | f1b061029bd028e6d44717853a3b13bc8c19c594 | |
parent | 8e2b0f1a35d3b232d2faa3616a0d478ee1bb24f9 (diff) | |
parent | 0a4426b7a45e12d5818604c85273fe1f3de5bb38 (diff) |
Merge branch 'master' of bucardo.org:/var/lib/git/check_postgres
-rw-r--r-- | MANIFEST | 1 | ||||
-rwxr-xr-x | check_postgres.pl | 288 | ||||
-rw-r--r-- | t/02_checkpoint.t | 6 | ||||
-rw-r--r-- | t/02_cluster_id.t | 80 | ||||
-rw-r--r-- | t/02_settings_checksum.t | 10 |
5 files changed, 306 insertions, 79 deletions
@@ -17,6 +17,7 @@ t/02_autovac_freeze.t t/02_backends.t t/02_bloat.t t/02_checkpoint.t +t/02_cluster_id.t t/02_connection.t t/02_custom_query.t t/02_database_size.t diff --git a/check_postgres.pl b/check_postgres.pl index eebed9bfe..8a71853ad 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -99,8 +99,12 @@ our %msg = ( 'bloat-nomin' => q{no relations meet the minimum bloat criteria}, 'bloat-table' => q{(db $1) table $2.$3 rows:$4 pages:$5 shouldbe:$6 ($7X) wasted size:$8 ($9)}, 'bug-report' => q{Please report these details to check_postgres@bucardo.org:}, + 'checkcluster-id' => q{Database system identifier:}, + 'checkcluster-msg' => q{cluster_id: $1}, + 'checkcluster-nomrtg'=> q{Must provide a number via the --mrtg option}, 'checkmode-state' => q{Database cluster state:}, 'checkmode-recovery' => q{in archive recovery}, + 'checkmode-prod' => q{in production}, 'checkpoint-baddir' => q{Invalid data_directory: "$1"}, 'checkpoint-baddir2' => q{pg_controldata could not read the given data directory: "$1"}, 'checkpoint-badver' => q{Failed to run pg_controldata - probably the wrong version ($1)}, @@ -108,7 +112,7 @@ our %msg = ( 'checkpoint-nodir' => q{Must supply a --datadir argument or set the PGDATA environment variable}, 'checkpoint-nodp' => q{Must install the Perl module Date::Parse to use the checkpoint action}, 'checkpoint-noparse' => q{Unable to parse pg_controldata output: "$1"}, - 'checkpoint-noregex' => q{Call to pg_controldata $1 failed}, + 'checkpoint-noregex' => q{Unable to find the regex for this check}, 'checkpoint-nosys' => q{Could not call pg_controldata: $1}, 'checkpoint-ok' => q{Last checkpoint was 1 second ago}, 'checkpoint-ok2' => q{Last checkpoint was $1 seconds ago}, @@ -160,6 +164,7 @@ our %msg = ( 'logfile-stderr' => q{Logfile output has been redirected to stderr: please provide a filename}, 'logfile-syslog' => q{Database is using syslog, please specify path with --logfile option (fac=$1)}, 'mode-standby' => q{Server in standby mode}, + 'mode' => q{mode}, 'mrtg-fail' => q{Action $1 failed: $2}, 'new-ver-nocver' => q{Could not download version information for $1}, 'new-ver-badver' => q{Could not parse version information for $1}, @@ -353,7 +358,7 @@ our %msg = ( 'checkpoint-nodir' => q{Vous devez fournir un argument --datadir ou configurer la variable d'environnement PGDATA}, 'checkpoint-nodp' => q{Vous devez installer le module Perl Date::Parse pour utiliser l'action checkpoint}, 'checkpoint-noparse' => q{Incapable d'analyser le résultat de la commande pg_controldata : "$1"}, - 'checkpoint-noregex' => q{Échec de l'appel à pg_controldata $1}, + 'checkpoint-noregex' => q{La regex pour ce test n'a pas été trouvée}, 'checkpoint-nosys' => q{N'a pas pu appeler pg_controldata : $1}, 'checkpoint-ok' => q{Le dernier CHECKPOINT est survenu il y a une seconde}, 'checkpoint-ok2' => q{Le dernier CHECKPOINT est survenu il y a $1 secondes}, @@ -931,6 +936,7 @@ GetOptions( 'debugoutput=s', 'no-check_postgresrc', 'assume-standby-mode', + 'assume-prod', 'action=s', 'warning=s', @@ -1112,6 +1118,7 @@ our $action_info = { backends => [1, 'Number of connections, compared to max_connections.'], bloat => [0, 'Check for table and index bloat.'], checkpoint => [1, 'Checks how long since the last checkpoint'], + cluster_id => [1, 'Checks the Database System Identifier'], commitratio => [0, 'Report if the commit ratio of a database is too low.'], connection => [0, 'Simple connection check.'], custom_query => [0, 'Run a custom query.'], @@ -1202,6 +1209,7 @@ Limit options: Other options: --assume-standby-mode assume that server in continious WAL recovery mode + --assume-prod assume that server in production mode --PSQL=FILE location of the psql executable; avoid using if possible -v, --verbose verbosity level; can be used more than once to increase the level -h, --help display this help information @@ -1248,7 +1256,9 @@ if ($opt{showtime}) { ## Check the current database mode our $STANDBY = 0; +our $MASTER = 0; make_sure_standby_mode() if $opt{'assume-standby-mode'}; +make_sure_prod() if $opt{'assume-prod'}; ## We don't (usually) want to die, but want a graceful Nagios-like exit instead sub ndie { @@ -1517,59 +1527,37 @@ sub do_mrtg_stats { do_mrtg({one => $one, two => $two, msg => $msg}); } -sub make_sure_standby_mode { +sub make_sure_mode_is { - ## Checks if database in standby mode ## Requires $ENV{PGDATA} or --datadir - ## Find the data directory, make sure it exists - my $dir = $opt{datadir} || $ENV{PGDATA}; - - if (!defined $dir or ! length $dir) { - ndie msg('checkpoint-nodir'); - } - - if (! -d $dir) { - ndie msg('checkpoint-baddir', $dir); - } - $db->{host} = '<none>'; ## Run pg_controldata, grab the mode - my $pgc - = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA} - : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata" - : 'pg_controldata'; - $COM = qq{$pgc "$dir"}; - eval { - $res = qx{$COM 2>&1}; - }; - if ($@) { - ndie msg('checkpoint-nosys', $@); - } - - ## If the path is echoed back, we most likely have an invalid data dir - if ($res =~ /$dir/) { - ndie msg('checkpoint-baddir2', $dir); - } - - if ($res =~ /WARNING: Calculated CRC checksum/) { - ndie msg('checkpoint-badver', $dir); - } - if ($res !~ /^pg_control.+\d+/) { - ndie msg('checkpoint-badver2'); - } + $res = open_controldata(); my $regex = msg('checkmode-state'); if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture) ## Just in case, check the English one as well $regex = msg_en('checkmode-state'); if ($res !~ /$regex\s*(.+)/) { - ndie msg('checkpoint-noregex', $dir); + ndie msg('checkpoint-noregex'); } } my $last = $1; - $regex = msg('checkmode-recovery'); + + return $last; + +} + +sub make_sure_standby_mode { + + ## Checks if database in standby mode + ## Requires $ENV{PGDATA} or --datadir + + my $last = make_sure_mode_is(); + + my $regex = msg('checkmode-recovery'); if ($last =~ /$regex/) { $STANDBY = 1; } @@ -1578,6 +1566,21 @@ sub make_sure_standby_mode { } ## end of make_sure_standby_mode +sub make_sure_prod { + + ## Checks if database in production mode + ## Requires $ENV{PGDATA} or --datadir + + my $last = make_sure_mode_is(); + + my $regex = msg('checkmode-prod'); + if ($last =~ /$regex/) { + $MASTER = 1; + } + + return; + +} ## end of make_sure_production_mode sub finishup { @@ -1919,6 +1922,9 @@ check_dbstats() if $action eq 'dbstats'; ## Check how long since the last checkpoint check_checkpoint() if $action eq 'checkpoint'; +## Check the Datasbae System Identifier +check_cluster_id() if $action eq 'cluster_id'; + ## Check for disabled triggers check_disabled_triggers() if $action eq 'disabled_triggers'; @@ -3114,6 +3120,50 @@ sub perfname { } ## end of perfname; +sub open_controldata { + ## Requires $ENV{PGDATA} or --datadir + + ## Find the data directory, make sure it exists + my $dir = $opt{datadir} || $ENV{PGDATA}; + + if (!defined $dir or ! length $dir) { + ndie msg('checkpoint-nodir'); + } + + if (! -d $dir) { + ndie msg('checkpoint-baddir', $dir); + } + + ## Run pg_controldata + my $pgc + = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA} + : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata" + : 'pg_controldata'; + $COM = qq{$pgc "$dir"}; + eval { + $res = qx{$COM 2>&1}; + }; + if ($@) { + ndie msg('checkpoint-nosys', $@); + } + + ## If the path is echoed back, we most likely have an invalid data dir + if ($res =~ /$dir/) { + ndie msg('checkpoint-baddir2', $dir); + } + + if ($res =~ /WARNING: Calculated CRC checksum/) { + ndie msg('checkpoint-badver', $dir); + } + if ($res !~ /^pg_control.+\d+/) { + ndie msg('checkpoint-badver2'); + } + + ## return the pg_controldata output + return $res; +} + + sub check_archive_ready { ## Check on the number of WAL archive with status "ready" @@ -3632,7 +3682,6 @@ FROM ( } ## end of check_bloat - sub check_checkpoint { ## Checks how long in seconds since the last checkpoint on a WAL slave @@ -3645,6 +3694,9 @@ sub check_checkpoint { ## may make more sense on the master, or we may want to look at ## the WAL segments received/processed instead of the checkpoint ## timestamp. + ## This checks can use the optionnal --asume-standby-mode or + ## --assume-prod: if the mode found is not the mode assumed, a + ## CRITICAL is emitted. ## Supports: Nagios, MRTG ## Warning and critical are seconds @@ -3657,50 +3709,17 @@ sub check_checkpoint { forcemrtg => 1, }); - ## Find the data directory, make sure it exists - my $dir = $opt{datadir} || $ENV{PGDATA}; - - if (!defined $dir or ! length $dir) { - ndie msg('checkpoint-nodir'); - } - - if (! -d $dir) { - ndie msg('checkpoint-baddir', $dir); - } - $db->{host} = '<none>'; ## Run pg_controldata, grab the time - my $pgc - = $ENV{PGCONTROLDATA} ? $ENV{PGCONTROLDATA} - : $ENV{PGBINDIR} ? "$ENV{PGBINDIR}/pg_controldata" - : 'pg_controldata'; - $COM = qq{$pgc "$dir"}; - eval { - $res = qx{$COM 2>&1}; - }; - if ($@) { - ndie msg('checkpoint-nosys', $@); - } - - ## If the path is echoed back, we most likely have an invalid data dir - if ($res =~ /$dir/) { - ndie msg('checkpoint-baddir2', $dir); - } - - if ($res =~ /WARNING: Calculated CRC checksum/) { - ndie msg('checkpoint-badver', $pgc); - } - if ($res !~ /^pg_control.+\d+/) { - ndie msg('checkpoint-badver2'); - } + $res = open_controldata(); my $regex = msg('checkpoint-po'); if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture) ## Just in case, check the English one as well $regex = msg_en('checkpoint-po'); if ($res !~ /$regex\s*(.+)/) { - ndie msg('checkpoint-noregex', $dir); + ndie msg('checkpoint-noregex'); } } my $last = $1; @@ -3722,11 +3741,35 @@ sub check_checkpoint { $db->{perf} = sprintf '%s=%s;%s;%s', perfname(msg('age')), $diff, $warning, $critical; + my $mode = ''; + if ($STANDBY) { + $mode = 'STANDBY'; + } + if ($MASTER) { + $mode = 'MASTER'; + } + + ## If we have an assume flag, then honor it. + my $goodmode = 1; + if ($opt{'assume-standby-mode'} and not $STANDBY) { + $goodmode = 0; + $mode = 'NOT STANDBY'; + } + elsif ($opt{'assume-prod'} and not $MASTER) { + $goodmode = 0; + $mode = 'NOT MASTER'; + } + + if (length($mode) > 0) { + $db->{perf} .= sprintf ' %s=%s', + perfname(msg('mode')), $mode; + } + if ($MRTG) { do_mrtg({one => $diff, msg => $msg}); } - if (length $critical and $diff >= $critical) { + if ((length $critical and $diff >= $critical) or not $goodmode) { add_critical $msg; return; } @@ -3743,6 +3786,58 @@ sub check_checkpoint { } ## end of check_checkpoint +sub check_cluster_id { + + + ## Verify the Database System Identifier provided by pg_controldata + ## Supports: Nagios, MRTG + ## One of warning or critical must be given (but not both) + ## It should run one time to find out the expected cluster-id + ## You can use --critical="0" to find out the current cluster-id + ## You can include or exclude settings as well + ## Example: + ## check_postgres_cluster_id --critical="5633695740047915125" + + my ($warning, $critical) = validate_range({type => 'integer', onlyone => 1}); + + $db->{host} = '<none>'; + + ## Run pg_controldata, grab the cluster-id + $res = open_controldata(); + + my $regex = msg('checkcluster-id'); + if ($res !~ /$regex\s*(.+)/) { ## no critic (ProhibitUnusedCapture) + ## Just in case, check the English one as well + $regex = msg_en('checkcluster-id'); + if ($res !~ /$regex\s*(.+)/) { + ndie msg('checkpoint-noregex'); + } + } + my $ident = $1; + + my $msg = msg('checkcluster-msg', $ident); + if ($MRTG) { + $opt{mrtg} or ndie msg('checksum-nomrtg'); + do_mrtg({one => $opt{mrtg} eq $ident ? 1 : 0, msg => $ident}); + } + if ($critical and $critical ne $ident) { + add_critical $msg; + } + elsif ($warning and $warning ne $ident) { + add_warning $msg; + } + elsif (!$critical and !$warning) { + add_unknown $msg; + } + else { + add_ok $msg; + } + + return; + +} ## end of check_cluster_id + + sub check_commitratio { ## Check the commitratio of one or more databases @@ -7814,6 +7909,16 @@ Example: postgres@db$./check_postgres.pl --action=version --warning=8.1 --datadir /var/lib/postgresql/8.3/main/ --assume-standby-mode POSTGRES_VERSION OK: Server in standby mode | time=0.00 +=item B<--assume-prod> + +If specified, check if server in production mode is performed (--datadir is required). +The option is only relevant for (C<symlink: check_postgres_checkpoint>). + +Example: + + postgres@db$./check_postgres.pl --action=checkpoint --datadir /var/lib/postgresql/8.3/main/ --assume-prod + POSTGRES_CHECKPOINT OK: Last checkpoint was 72 seconds ago | age=72;;300 mode=MASTER + =item B<-h> or B<--help> Displays a help screen with a summary of all actions and options. @@ -8084,6 +8189,8 @@ was run, as determined by parsing the call to C<pg_controldata>. Because of this pg_controldata executable must be available in the current path. Alternatively, you can set the environment variable C<PGCONTROLDATA> to the exact location of the pg_controldata executable, or you can specify C<PGBINDIR> as the directory that it lives in. +It is also possible to use the special options I<--assume-prod> or +I<--assume-standby-mode>, if the mode found is not the one expected, a CRITICAL is emitted. At least one warning or critical argument must be set. @@ -8091,6 +8198,27 @@ This action requires the Date::Parse module. For MRTG or simple output, returns the number of seconds. +=head2 B<cluster_id> + +(C<symlink: check_postgres_cluster-id>) Checks that the Database System Identifier +provided by pg_controldata is the same as last time you checked. This must run on the same +server as the database that is being checked (e.g. the -h flag will not work). +Either the I<--warning> or the I<--critical> option should be given, but not both. The value +of each one is the cluster identifier, an integer value. You can run with the special C<--critical=0> option +to find out an existing cluster identifier. + +Example 1: Find the initial identifier + + check_postgres_cluster_id --critical=0 --datadir=/var//lib/postgresql/9.0/main + +Example 2: Make sure the cluster is the same and warn if not, using the result from above. + + check_postgres_cluster_id --critical=5633695740047915135 + +For MRTG output, returns a 1 or 0 indicating success of failure of the identifier to match. A +identifier must be provided as the C<--mrtg> argument. The fourth line always gives the +current identifier. + =head2 B<commitratio> (C<symlink: check_postgres_commitratio>) Checks the commit ratio of all databases and complains when they are too low. @@ -9366,6 +9494,12 @@ Items not specifically attributed are by GSM (Greg Sabino Mullane). =over 4 +=item B<Version 2.19.0> + + Add the --assume-prod option (Cédric Villemain) + Add the cluster_id check (Cédric Villemain) + Improve settings_checksum and checkpoint tests (Cédric Villemain) + =item B<Version 2.18.1> Do no do an inner join to pg_user when checking database size diff --git a/t/02_checkpoint.t b/t/02_checkpoint.t index 724e695b8..dabee60dc 100644 --- a/t/02_checkpoint.t +++ b/t/02_checkpoint.t @@ -6,7 +6,7 @@ use 5.006; use strict; use warnings; use Data::Dumper; -use Test::More tests => 13; +use Test::More tests => 14; use lib 't','.'; use CP_Testing; @@ -65,6 +65,10 @@ like ($cp->run(qq{-w 1 --datadir="$host"}), qr{^$label WARNING:}, $t); $t=qq{$S returns a critical when checkpoint older than critical option}; like ($cp->run(qq{-c 1 --datadir="$host"}), qr{^$label CRITICAL:}, $t); +# FIXME: no check for --assume-prod, it needs to have a cluster in this state +$t=qq{$S returns a critical when --assume-standby-mode on a non-standby}; +like ($cp->run(qq{-c 1000 --datadir="$host" --assume-standby-mode}), qr{^$label CRITICAL:.*mode=NOT STANDBY}, $t); + $t=qq{$S returns the correct number of seconds}; like ($cp->run(qq{-c 1 --datadir="$host"}), qr{was \d+ seconds ago}, $t); diff --git a/t/02_cluster_id.t b/t/02_cluster_id.t new file mode 100644 index 000000000..6cab394c5 --- /dev/null +++ b/t/02_cluster_id.t @@ -0,0 +1,80 @@ +#!perl + +## Test the "checkpoint" action + +use 5.006; +use strict; +use warnings; +use Data::Dumper; +use Test::More tests => 14; +use lib 't','.'; +use CP_Testing; + +use vars qw/$dbh $result $SQL $t/; + +my $cp = CP_Testing->new( {default_action => 'cluster_id'} ); + +$dbh = $cp->test_database_handle(); + +my $S = q{Action 'cluster_id'}; +my $label = 'POSTGRES_CLUSTER_ID'; + +$t=qq{$S fails when called with an invalid option}; +like ($cp->run('foobar=12'), qr{Usage:}, $t); + +$t=qq{$S fails when called without warning or critical}; +like ($cp->run(''), qr{Must provide a 'warning' or 'critical'}, $t); + +$t=qq{$S fails when called with invalid warning option}; +like ($cp->run('-w foo'), qr{ERROR: .+'warning'.+must be an integer}, $t); + +$t=qq{$S fails when called with invalid critical option}; +like ($cp->run('-c foo'), qr{ERROR: .+'critical'.+must be an integer}, $t); + +$t=qq{$S fails when called without a datadir option and PGDATA is not set}; +delete $ENV{PGDATA}; +like ($cp->run('-c 10'), qr{^ERROR: Must supply a --datadir}, $t); + +$t=qq{$S fails when called with an invalid datadir option and PGDATA is not set}; +like ($cp->run('-c 10 --datadir=foobar'), qr{^ERROR: Invalid data_directory}, $t); + +$t = qq{$S rejects -w and -c together}; +is ($cp->run('-w 123 -c 123'), + qq{ERROR: Can only provide 'warning' OR 'critical' option\n}, $t); + +my $host = $cp->get_dbhost(); + +$t=qq{$S fails when called against a non datadir datadir}; +like ($cp->run(qq{-c 10 --datadir="$host"}), + qr{^ERROR:.+could not read the given data directory}, $t); + +$host =~ s/socket$//; + +$t = qq{$S notes mismatched cluster_id (warning)}; +like ($cp->run(qq{-w 123 --datadir="$host"}), + qr{$label WARNING: .* cluster_id:}, $t); + +$t = qq{$S notes mismatched cluster_id (critical)}; +like ($cp->run(qq{-c 123 --datadir="$host"}), + qr{$label CRITICAL: .* cluster_id:}, $t); + +$t = qq{$S self-identifies correctly}; +$result = $cp->run(qq{--critical 0 --datadir="$host"}); +like ($result, qr{^$label UNKNOWN: +cluster_id: \d+}, $t); + +my $true_cluster_id; +$true_cluster_id = $1 if $result =~ /cluster_id: (\d{19})/; + +$t = qq{$S accepts matching cluster_id}; +like ($cp->run(qq{-w $true_cluster_id --datadir="$host"}), + qr/OK.*\Qcluster_id: $true_cluster_id\E/, $t); + +$t=qq{$S returns the expected output for MRTG(failure)}; +like ($cp->run(qq{--mrtg 123 --output=MRTG --datadir="$host"}), + qr{^0\n0\n\n\d+}, $t); + +$t=qq{$S returns the expected output for MRTG(success)}; +like ($cp->run(qq{--mrtg $true_cluster_id --output=MRTG --datadir="$host"}), + qr{^1\n0\n\n\d+}, $t); + +exit; diff --git a/t/02_settings_checksum.t b/t/02_settings_checksum.t index 08decc8ce..a5649754b 100644 --- a/t/02_settings_checksum.t +++ b/t/02_settings_checksum.t @@ -6,7 +6,7 @@ use 5.006; use strict; use warnings; use Data::Dumper; -use Test::More tests => 8; +use Test::More tests => 10; use lib 't','.'; use CP_Testing; @@ -52,4 +52,12 @@ like ($cp->run('-c abcdabcdabcdabcdabcdabcdabcdabcd'), $t = qq{$S accepts matching checksum}; like ($cp->run("-w $true_checksum"), qr/OK.*\Qchecksum: $true_checksum\E/, $t); +$t=qq{$S returns the expected output for MRTG(failure)}; +like ($cp->run(qq{--mrtg 123 --output=MRTG}), + qr{^0\n0\n\n\d+}, $t); + +$t=qq{$S returns the expected output for MRTG(success)}; +like ($cp->run(qq{--mrtg $true_checksum --output=MRTG}), + qr{^1\n0\n\n\d+}, $t); + exit; |