From 9a8f8232509ec61f798f4872ab38741762725a49 Mon Sep 17 00:00:00 2001 From: gcoxmoz Date: Tue, 2 Jan 2018 17:02:12 +0000 Subject: Bug 1330293 - Prevent nagios_blocker_checker.pl from running longer than 5 minutes (and log to sentry if it does) --- contrib/nagios_blocker_checker.pl | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'contrib') diff --git a/contrib/nagios_blocker_checker.pl b/contrib/nagios_blocker_checker.pl index 9aebe4ae6..b9168b085 100755 --- a/contrib/nagios_blocker_checker.pl +++ b/contrib/nagios_blocker_checker.pl @@ -15,6 +15,7 @@ use Bugzilla; use Bugzilla::Constants; use Bugzilla::Product; use Bugzilla::User; +use Bugzilla::Sentry; use Getopt::Long; Bugzilla->usage_mode(USAGE_MODE_CMDLINE); @@ -38,6 +39,12 @@ my $config = { blocker_warn => 0, any_alarm => 24, any_warn => 20, + # time in seconds before terminating this script + # 300 chosen as it is longer than the default NRPE timeout + # (meaning you should never need to tweak it upward) and + # shorter than what you are likely to do checking bugs + # (meaning you won't pile up too many instances before they die) + max_runtime => 300, }; my $usage = < (default: $config->{any_alarm}) --any_warn (default: $config->{any_warn}) +NAGIOS SELF-TERMINATION + + In case of a hung process, this script self-terminates. You can adjust: + + --max_runtime (default: $config->{max_runtime}) + EXAMPLES nagios_blocker_checker.pl --assignee server-ops\@mozilla-org.bugs @@ -101,6 +114,7 @@ die($usage) unless GetOptions( 'blocker_warn=i' => \$config->{blocker_warn}, 'any_alarm=i' => \$config->{any_alarm}, 'any_warn=i' => \$config->{any_warn}, + 'max_runtime=i' => \$config->{max_runtime}, 'help|?' => \$config->{help}, ); $config->{assignee} = $ARGV[0] if !$config->{assignee} && @ARGV; @@ -120,6 +134,16 @@ use constant NAGIOS_NAMES => [qw( OK WARNING CRITICAL )]; my $current_state = NAGIOS_OK; try { + # Per bug 1330293, the checker script can get confused/hung up + # if the DB rotates out from under it. Since a long-running + # nagios check does no good, we terminate if we stick around too long. + local $SIG{ALRM} = sub { + my $message = "$0 ran for longer than ".$config->{max_runtime}." seconds and was auto-terminated."; + sentry_handle_error('error', $message); + die "$message\n"; + }; + alarm($config->{max_runtime}); + my $dbh = Bugzilla->switch_to_shadow_db; my $any_severity = $config->{severity} eq 'any'; my ($where, @values); @@ -197,6 +221,7 @@ try { } } print "\n"; + alarm(0); } catch { # Anything that trips an error, we're calling nagios-critical $current_state = NAGIOS_CRITICAL; -- cgit v1.2.3-24-g4f1b