diff options
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/nagios_blocker_checker.pl | 183 |
1 files changed, 113 insertions, 70 deletions
diff --git a/scripts/nagios_blocker_checker.pl b/scripts/nagios_blocker_checker.pl index 27947f03d..b9168b085 100755 --- a/scripts/nagios_blocker_checker.pl +++ b/scripts/nagios_blocker_checker.pl @@ -11,14 +11,16 @@ use strict; use warnings; use lib qw(. lib local/lib/perl5); - use Bugzilla; use Bugzilla::Constants; use Bugzilla::Product; use Bugzilla::User; +use Bugzilla::Sentry; use Getopt::Long; Bugzilla->usage_mode(USAGE_MODE_CMDLINE); +Bugzilla->error_mode(ERROR_MODE_DIE); +use Try::Tiny; # bmo ships with this nowadays my $config = { # filter by assignee, product or component @@ -37,6 +39,12 @@ my $config = { blocker_warn => 0, any_alarm => 24, any_warn => 20, + # time in seconds before terminating this script + # 300 chosen as it is longer than the default NRPE timeout + # (meaning you should never need to tweak it upward) and + # shorter than what you are likely to do checking bugs + # (meaning you won't pile up too many instances before they die) + max_runtime => 300, }; my $usage = <<EOF; @@ -78,6 +86,12 @@ TIMING --any_alarm <hours> (default: $config->{any_alarm}) --any_warn <hours> (default: $config->{any_warn}) +NAGIOS SELF-TERMINATION + + In case of a hung process, this script self-terminates. You can adjust: + + --max_runtime <seconds> (default: $config->{max_runtime}) + EXAMPLES nagios_blocker_checker.pl --assignee server-ops\@mozilla-org.bugs @@ -100,6 +114,7 @@ die($usage) unless GetOptions( 'blocker_warn=i' => \$config->{blocker_warn}, 'any_alarm=i' => \$config->{any_alarm}, 'any_warn=i' => \$config->{any_warn}, + 'max_runtime=i' => \$config->{max_runtime}, 'help|?' => \$config->{help}, ); $config->{assignee} = $ARGV[0] if !$config->{assignee} && @ARGV; @@ -117,83 +132,111 @@ use constant NAGIOS_WARNING => 1; use constant NAGIOS_CRITICAL => 2; use constant NAGIOS_NAMES => [qw( OK WARNING CRITICAL )]; -my $dbh = Bugzilla->switch_to_shadow_db; -my $any_severity = $config->{severity} eq 'any'; -my ($where, @values); - -if ($config->{assignee}) { - $where = 'bugs.assigned_to = ?'; - push @values, Bugzilla::User->check({ name => $config->{assignee} })->id; - -} elsif ($config->{component}) { - $where = 'bugs.product_id = ? AND bugs.component_id = ? AND bugs.assigned_to = ?'; - my $product = Bugzilla::Product->check({ name => $config->{product} }); - push @values, $product->id; - push @values, Bugzilla::Component->check({ product => $product, name => $config->{component} })->id; - push @values, Bugzilla::User->check({ name => $config->{unassigned} })->id; - -} else { - $where = 'bugs.product_id = ? AND bugs.assigned_to = ?'; - push @values, Bugzilla::Product->check({ name => $config->{product} })->id; - push @values, Bugzilla::User->check({ name => $config->{unassigned} })->id; -} - -if (!$any_severity) { - $where .= ' AND bug_severity IN (' . - join(',', map { $dbh->quote($_) } split(/,/, $config->{severity})) . ')'; -} - -my $sql = <<EOF; - SELECT bug_id, bug_severity, UNIX_TIMESTAMP(bugs.creation_ts) AS ts - FROM bugs - WHERE $where - AND COALESCE(resolution, '') = '' -EOF - -my $bugs = { - 'major' => [], - 'critical' => [], - 'blocker' => [], - 'any' => [], -}; my $current_state = NAGIOS_OK; -my $current_time = time; +try { + # Per bug 1330293, the checker script can get confused/hung up + # if the DB rotates out from under it. Since a long-running + # nagios check does no good, we terminate if we stick around too long. + local $SIG{ALRM} = sub { + my $message = "$0 ran for longer than ".$config->{max_runtime}." seconds and was auto-terminated."; + sentry_handle_error('error', $message); + die "$message\n"; + }; + alarm($config->{max_runtime}); + + my $dbh = Bugzilla->switch_to_shadow_db; + my $any_severity = $config->{severity} eq 'any'; + my ($where, @values); + + if ($config->{assignee}) { + $where = 'bugs.assigned_to = ?'; + push @values, Bugzilla::User->check({ name => $config->{assignee} })->id; + + } elsif ($config->{component}) { + $where = 'bugs.product_id = ? AND bugs.component_id = ? AND bugs.assigned_to = ?'; + my $product = Bugzilla::Product->check({ name => $config->{product} }); + push @values, $product->id; + push @values, Bugzilla::Component->check({ product => $product, name => $config->{component} })->id; + push @values, Bugzilla::User->check({ name => $config->{unassigned} })->id; + + } else { + $where = 'bugs.product_id = ? AND bugs.assigned_to = ?'; + push @values, Bugzilla::Product->check({ name => $config->{product} })->id; + push @values, Bugzilla::User->check({ name => $config->{unassigned} })->id; + } -foreach my $bug (@{ $dbh->selectall_arrayref($sql, { Slice => {} }, @values) }) { - my $severity = $any_severity ? 'any' : $bug->{bug_severity}; - my $age = ($current_time - $bug->{ts}) / 3600; + if (!$any_severity) { + $where .= ' AND bug_severity IN (' . + join(',', map { $dbh->quote($_) } split(/,/, $config->{severity})) . ')'; + } - if ($age > $config->{"${severity}_alarm"}) { - $current_state = NAGIOS_CRITICAL; - push @{$bugs->{$severity}}, $bug->{bug_id}; + my $sql = <<" EOF"; + SELECT bug_id, bug_severity, UNIX_TIMESTAMP(bugs.creation_ts) AS ts + FROM bugs + WHERE $where + AND COALESCE(resolution, '') = '' + EOF + + my $bugs = { + 'major' => [], + 'critical' => [], + 'blocker' => [], + 'any' => [], + }; + my $current_time = time; + + foreach my $bug (@{ $dbh->selectall_arrayref($sql, { Slice => {} }, @values) }) { + my $severity = $any_severity ? 'any' : $bug->{bug_severity}; + my $age = ($current_time - $bug->{ts}) / 3600; + + if ($age > $config->{"${severity}_alarm"}) { + $current_state = NAGIOS_CRITICAL; + push @{$bugs->{$severity}}, $bug->{bug_id}; + + } elsif ($age > $config->{"${severity}_warn"}) { + if ($current_state < NAGIOS_WARNING) { + $current_state = NAGIOS_WARNING; + } + push @{$bugs->{$severity}}, $bug->{bug_id}; - } elsif ($age > $config->{"${severity}_warn"}) { - if ($current_state < NAGIOS_WARNING) { - $current_state = NAGIOS_WARNING; } - push @{$bugs->{$severity}}, $bug->{bug_id}; - } -} -print "bugs " . NAGIOS_NAMES->[$current_state] . ": "; -if ($current_state == NAGIOS_OK) { - if ($config->{severity} eq 'any') { - print "No unassigned bugs found."; - } else { - print "No $config->{severity} bugs found." + print "bugs " . NAGIOS_NAMES->[$current_state] . ": "; + if ($current_state == NAGIOS_OK) { + if ($config->{severity} eq 'any') { + print "No unassigned bugs found."; + } else { + print "No $config->{severity} bugs found." + } } -} -foreach my $severity (qw( blocker critical major any )) { - my $list = $bugs->{$severity}; - if (@$list) { - printf - "%s %s %s found https://bugzil.la/" . join(',', @$list) . " ", - scalar(@$list), - ($any_severity ? 'unassigned' : $severity), - (scalar(@$list) == 1 ? 'bug' : 'bugs'); + foreach my $severity (qw( blocker critical major any )) { + my $list = $bugs->{$severity}; + if (@$list) { + printf + "%s %s %s found https://bugzil.la/" . join(',', @$list) . " ", + scalar(@$list), + ($any_severity ? 'unassigned' : $severity), + (scalar(@$list) == 1 ? 'bug' : 'bugs'); + } } -} -print "\n"; + print "\n"; + alarm(0); +} catch { + # Anything that trips an error, we're calling nagios-critical + $current_state = NAGIOS_CRITICAL; + # + # Templates often have linebreaks ; nagios really prefers a status + # to be on one line. Here we strip out breaks, and try to make sure + # there's spacing in place when we crunch those lines together. + s#\s?\r?\n# #g; + # + # Now, just print the status we got out. + # Keep in mind, depending on when 'try' blew out, we may have + # already printed SOMETHING. Can't help that without a much more + # thorough fix. Our majority case here is a blowout from BZ + # where a Product/Component went away, ala bug 1326233. + print "$_\n"; +}; exit $current_state; |