summaryrefslogtreecommitdiffstats
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/nagios_blocker_checker.pl183
1 files changed, 113 insertions, 70 deletions
diff --git a/scripts/nagios_blocker_checker.pl b/scripts/nagios_blocker_checker.pl
index 27947f03d..b9168b085 100755
--- a/scripts/nagios_blocker_checker.pl
+++ b/scripts/nagios_blocker_checker.pl
@@ -11,14 +11,16 @@ use strict;
use warnings;
use lib qw(. lib local/lib/perl5);
-
use Bugzilla;
use Bugzilla::Constants;
use Bugzilla::Product;
use Bugzilla::User;
+use Bugzilla::Sentry;
use Getopt::Long;
Bugzilla->usage_mode(USAGE_MODE_CMDLINE);
+Bugzilla->error_mode(ERROR_MODE_DIE);
+use Try::Tiny; # bmo ships with this nowadays
my $config = {
# filter by assignee, product or component
@@ -37,6 +39,12 @@ my $config = {
blocker_warn => 0,
any_alarm => 24,
any_warn => 20,
+ # time in seconds before terminating this script
+ # 300 chosen as it is longer than the default NRPE timeout
+ # (meaning you should never need to tweak it upward) and
+ # shorter than what you are likely to do checking bugs
+ # (meaning you won't pile up too many instances before they die)
+ max_runtime => 300,
};
my $usage = <<EOF;
@@ -78,6 +86,12 @@ TIMING
--any_alarm <hours> (default: $config->{any_alarm})
--any_warn <hours> (default: $config->{any_warn})
+NAGIOS SELF-TERMINATION
+
+ In case of a hung process, this script self-terminates. You can adjust:
+
+ --max_runtime <seconds> (default: $config->{max_runtime})
+
EXAMPLES
nagios_blocker_checker.pl --assignee server-ops\@mozilla-org.bugs
@@ -100,6 +114,7 @@ die($usage) unless GetOptions(
'blocker_warn=i' => \$config->{blocker_warn},
'any_alarm=i' => \$config->{any_alarm},
'any_warn=i' => \$config->{any_warn},
+ 'max_runtime=i' => \$config->{max_runtime},
'help|?' => \$config->{help},
);
$config->{assignee} = $ARGV[0] if !$config->{assignee} && @ARGV;
@@ -117,83 +132,111 @@ use constant NAGIOS_WARNING => 1;
use constant NAGIOS_CRITICAL => 2;
use constant NAGIOS_NAMES => [qw( OK WARNING CRITICAL )];
-my $dbh = Bugzilla->switch_to_shadow_db;
-my $any_severity = $config->{severity} eq 'any';
-my ($where, @values);
-
-if ($config->{assignee}) {
- $where = 'bugs.assigned_to = ?';
- push @values, Bugzilla::User->check({ name => $config->{assignee} })->id;
-
-} elsif ($config->{component}) {
- $where = 'bugs.product_id = ? AND bugs.component_id = ? AND bugs.assigned_to = ?';
- my $product = Bugzilla::Product->check({ name => $config->{product} });
- push @values, $product->id;
- push @values, Bugzilla::Component->check({ product => $product, name => $config->{component} })->id;
- push @values, Bugzilla::User->check({ name => $config->{unassigned} })->id;
-
-} else {
- $where = 'bugs.product_id = ? AND bugs.assigned_to = ?';
- push @values, Bugzilla::Product->check({ name => $config->{product} })->id;
- push @values, Bugzilla::User->check({ name => $config->{unassigned} })->id;
-}
-
-if (!$any_severity) {
- $where .= ' AND bug_severity IN (' .
- join(',', map { $dbh->quote($_) } split(/,/, $config->{severity})) . ')';
-}
-
-my $sql = <<EOF;
- SELECT bug_id, bug_severity, UNIX_TIMESTAMP(bugs.creation_ts) AS ts
- FROM bugs
- WHERE $where
- AND COALESCE(resolution, '') = ''
-EOF
-
-my $bugs = {
- 'major' => [],
- 'critical' => [],
- 'blocker' => [],
- 'any' => [],
-};
my $current_state = NAGIOS_OK;
-my $current_time = time;
+try {
+ # Per bug 1330293, the checker script can get confused/hung up
+ # if the DB rotates out from under it. Since a long-running
+ # nagios check does no good, we terminate if we stick around too long.
+ local $SIG{ALRM} = sub {
+ my $message = "$0 ran for longer than ".$config->{max_runtime}." seconds and was auto-terminated.";
+ sentry_handle_error('error', $message);
+ die "$message\n";
+ };
+ alarm($config->{max_runtime});
+
+ my $dbh = Bugzilla->switch_to_shadow_db;
+ my $any_severity = $config->{severity} eq 'any';
+ my ($where, @values);
+
+ if ($config->{assignee}) {
+ $where = 'bugs.assigned_to = ?';
+ push @values, Bugzilla::User->check({ name => $config->{assignee} })->id;
+
+ } elsif ($config->{component}) {
+ $where = 'bugs.product_id = ? AND bugs.component_id = ? AND bugs.assigned_to = ?';
+ my $product = Bugzilla::Product->check({ name => $config->{product} });
+ push @values, $product->id;
+ push @values, Bugzilla::Component->check({ product => $product, name => $config->{component} })->id;
+ push @values, Bugzilla::User->check({ name => $config->{unassigned} })->id;
+
+ } else {
+ $where = 'bugs.product_id = ? AND bugs.assigned_to = ?';
+ push @values, Bugzilla::Product->check({ name => $config->{product} })->id;
+ push @values, Bugzilla::User->check({ name => $config->{unassigned} })->id;
+ }
-foreach my $bug (@{ $dbh->selectall_arrayref($sql, { Slice => {} }, @values) }) {
- my $severity = $any_severity ? 'any' : $bug->{bug_severity};
- my $age = ($current_time - $bug->{ts}) / 3600;
+ if (!$any_severity) {
+ $where .= ' AND bug_severity IN (' .
+ join(',', map { $dbh->quote($_) } split(/,/, $config->{severity})) . ')';
+ }
- if ($age > $config->{"${severity}_alarm"}) {
- $current_state = NAGIOS_CRITICAL;
- push @{$bugs->{$severity}}, $bug->{bug_id};
+ my $sql = <<" EOF";
+ SELECT bug_id, bug_severity, UNIX_TIMESTAMP(bugs.creation_ts) AS ts
+ FROM bugs
+ WHERE $where
+ AND COALESCE(resolution, '') = ''
+ EOF
+
+ my $bugs = {
+ 'major' => [],
+ 'critical' => [],
+ 'blocker' => [],
+ 'any' => [],
+ };
+ my $current_time = time;
+
+ foreach my $bug (@{ $dbh->selectall_arrayref($sql, { Slice => {} }, @values) }) {
+ my $severity = $any_severity ? 'any' : $bug->{bug_severity};
+ my $age = ($current_time - $bug->{ts}) / 3600;
+
+ if ($age > $config->{"${severity}_alarm"}) {
+ $current_state = NAGIOS_CRITICAL;
+ push @{$bugs->{$severity}}, $bug->{bug_id};
+
+ } elsif ($age > $config->{"${severity}_warn"}) {
+ if ($current_state < NAGIOS_WARNING) {
+ $current_state = NAGIOS_WARNING;
+ }
+ push @{$bugs->{$severity}}, $bug->{bug_id};
- } elsif ($age > $config->{"${severity}_warn"}) {
- if ($current_state < NAGIOS_WARNING) {
- $current_state = NAGIOS_WARNING;
}
- push @{$bugs->{$severity}}, $bug->{bug_id};
-
}
-}
-print "bugs " . NAGIOS_NAMES->[$current_state] . ": ";
-if ($current_state == NAGIOS_OK) {
- if ($config->{severity} eq 'any') {
- print "No unassigned bugs found.";
- } else {
- print "No $config->{severity} bugs found."
+ print "bugs " . NAGIOS_NAMES->[$current_state] . ": ";
+ if ($current_state == NAGIOS_OK) {
+ if ($config->{severity} eq 'any') {
+ print "No unassigned bugs found.";
+ } else {
+ print "No $config->{severity} bugs found."
+ }
}
-}
-foreach my $severity (qw( blocker critical major any )) {
- my $list = $bugs->{$severity};
- if (@$list) {
- printf
- "%s %s %s found https://bugzil.la/" . join(',', @$list) . " ",
- scalar(@$list),
- ($any_severity ? 'unassigned' : $severity),
- (scalar(@$list) == 1 ? 'bug' : 'bugs');
+ foreach my $severity (qw( blocker critical major any )) {
+ my $list = $bugs->{$severity};
+ if (@$list) {
+ printf
+ "%s %s %s found https://bugzil.la/" . join(',', @$list) . " ",
+ scalar(@$list),
+ ($any_severity ? 'unassigned' : $severity),
+ (scalar(@$list) == 1 ? 'bug' : 'bugs');
+ }
}
-}
-print "\n";
+ print "\n";
+ alarm(0);
+} catch {
+ # Anything that trips an error, we're calling nagios-critical
+ $current_state = NAGIOS_CRITICAL;
+ #
+ # Templates often have linebreaks ; nagios really prefers a status
+ # to be on one line. Here we strip out breaks, and try to make sure
+ # there's spacing in place when we crunch those lines together.
+ s#\s?\r?\n# #g;
+ #
+ # Now, just print the status we got out.
+ # Keep in mind, depending on when 'try' blew out, we may have
+ # already printed SOMETHING. Can't help that without a much more
+ # thorough fix. Our majority case here is a blowout from BZ
+ # where a Product/Component went away, ala bug 1326233.
+ print "$_\n";
+};
exit $current_state;