summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Niko Tyni <ntyni@iki.fi> 2005-09-14 21:44:00 +0200
committer Niko Tyni <ntyni@iki.fi> 2005-09-14 21:44:00 +0200
commit f0021857e52510645acf59997de6b5ee952186c3 (patch)
tree ff20064d52eebad9d83ca9d1cebeff62e9795c3a
parent d788fb4fa54adfb8cc2d8d0344bdb9a7af2d642d (diff)
download smokeping-f0021857e52510645acf59997de6b5ee952186c3.tar.gz
smokeping-f0021857e52510645acf59997de6b5ee952186c3.tar.xz
* (trunk)/
lib/Smokeping.pm, bin/smokeping.dist, CHANGES, TODO: + reload the config file on HUP signal and bail out if it's broken. After reloading, restart gracefully so that no measurement is interrupted. The signal can also be triggered by 'smokeping --reload'.
-rw-r--r-- CHANGES 5
-rw-r--r-- TODO 9
-rwxr-xr-x bin/smokeping.dist 3
-rw-r--r-- lib/Smokeping.pm 206
4 files changed, 174 insertions, 49 deletions
diff --git a/CHANGES b/CHANGES
index 86f19f2..405a623 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,8 @@
+* reload the config file on HUP signal and bail out if it's broken.
+ After reloading, restart gracefully so that no measurement is
+ interrupted. The signal can also be triggered by 'smokeping --reload'.
+ -- niko, suggested by Taisuke Yamada <tyamadajp *list.rakugaki.org>
+
2005/9/14 -- released version 20050914_trunk
* made some alert docs in smokeping_config visible --niko
diff --git a/TODO b/TODO
index cd6e866..a2cb5d0 100644
--- a/TODO
+++ b/TODO
@@ -20,15 +20,6 @@
this through) visible (eg. different colour) in the CGI menu
- suggested by Cornel Badea <cornel *sbhost.ro>
-* DAEMON
- reread config periodically or with SIGHUP so that
- no measurement is interrupted
- - suggested by Taisuke Yamada,
- <http://lists.ee.ethz.ch/smokeping-users/msg01445.html>
- - concurrent probe processes need a signal anyway,
- they have to exit and new ones started so that
- we don't have to compare to the old config
-
* REMOTE EXECUTION
generic remote probe
- a possibility for basefork-derived probes to reuse the same
diff --git a/bin/smokeping.dist b/bin/smokeping.dist
index 5c50cb8..9980369 100755
--- a/bin/smokeping.dist
+++ b/bin/smokeping.dist
@@ -38,6 +38,9 @@ B<smokeping> [ B<--email> | B<--makepod> | B<--version> | B<--restart> ]
--restart Restart SmokePing
+ --reload Reload configuration in the running process without interrupting
+ any probes
+
--nodaemon Do no daemonize the process (no fork)
--filter=x Only measure entries which pass the filter x
diff --git a/lib/Smokeping.pm b/lib/Smokeping.pm
index e1de056..9ae9c63 100644
--- a/lib/Smokeping.pm
+++ b/lib/Smokeping.pm
@@ -1776,6 +1776,7 @@ be appended to the process name as '[probe]', eg. '/usr/bin/smokeping
[FPing]'. If you don't like this behaviour, set this variable to 'no'.
If 'concurrentprobes' is not set to 'yes', this variable has no effect.
DOC
+ _default => 'yes',
},
tmail =>
{
@@ -2395,16 +2396,23 @@ sub get_config ($$){
return $parser->parse( $cfgfile ) || die "ERROR: $parser->{err}\n";
}
-sub kill_smoke ($) {
+sub kill_smoke ($$) {
my $pidfile = shift;
+ my $signal = shift;
if (defined $pidfile){
if ( -f $pidfile && open PIDFILE, "<$pidfile" ) {
<PIDFILE> =~ /(\d+)/;
my $pid = $1;
- kill 2, $pid if kill 0, $pid;
- sleep 3; # let it die
- die "ERROR: Can not stop running instance of SmokePing ($pid)\n"
- if kill 0, $pid;
+ if ($signal == SIGINT || $signal == SIGTERM) {
+ kill $signal, $pid if kill 0, $pid;
+ sleep 3; # let it die
+ die "ERROR: Can not stop running instance of SmokePing ($pid)\n"
+ if kill 0, $pid;
+ } else {
+ die "ERROR: no instance of SmokePing running (pid $pid)?\n"
+ unless kill 0, $pid;
+ kill $signal, $pid;
+ }
close PIDFILE;
} else {
die "ERROR: Can not read pid from $pidfile: $!\n";
@@ -2487,7 +2495,9 @@ sub daemonize_me ($) {
}
sub do_syslog ($){
- syslog("$syslog_facility|$syslog_priority", shift);
+ my $str = shift;
+ $str =~ s,%,%%,g;
+ syslog("$syslog_facility|$syslog_priority", $str);
}
sub do_cgilog ($){
@@ -2540,7 +2550,9 @@ sub load_cfg ($) {
$cfg->{__probes} = $probes;
init_alerts $cfg if $cfg->{Alerts};
init_target_tree $cfg, $probes, $cfg->{Targets}, $cfg->{General}{datadir};
- }
+ } else {
+ do_log("Config file unmodified, skipping reload") unless $cgimode;
+ }
}
@@ -2762,6 +2774,40 @@ sub verify_cfg {
print "Configuration file '$cfgfile' syntax OK.\n";
}
+sub make_kid { # Fork with retries; returns fork's result (child pid in the parent, 0 in the child).
+ my $sleep_count = 0; # number of failed fork attempts seen so far
+ my $pid;
+ do {
+ $pid = fork;
+ unless (defined $pid) { # fork returned undef, i.e. the fork failed
+ do_log("Fatal: cannot fork: $!");
+ die "bailing out"
+ if $sleep_count++ > 6; # dies on the eighth failed attempt
+ sleep 10; # wait before retrying the fork
+ }
+ } until defined $pid;
+ srand(); # executed by both parent and child once fork has succeeded
+ return $pid;
+}
+
+sub start_probes { # Fork one child per probe that has targets, recording pid => probe name in the caller's hash.
+ my $pids = shift; # hash ref that receives pid => probe name entries
+ my $pid;
+ my $myprobe; # NOTE(review): assigned below but never read inside this sub
+ for my $p (keys %$probes) {
+ if ($probes->{$p}->target_count == 0) {
+ do_log("No targets defined for probe $p, skipping.");
+ next;
+ }
+ $pid = make_kid();
+ $myprobe = $p;
+ $pids->{$pid} = $p; # in a child $pid is 0, so the child's own probe name is stored under key 0
+ last unless $pid; # a child (pid 0) stops here and does not fork further probes
+ do_log("Child process $pid started for probe $p.");
+ }
+ return $pid; # 0 in a child, the last forked pid in the parent; NOTE(review): undef if every probe was skipped
+}
+
sub main (;$) {
$cgimode = 0;
umask 022;
@@ -2769,7 +2815,7 @@ sub main (;$) {
$opt{filter}=[];
GetOptions(\%opt, 'version', 'email', 'man:s','help','logfile=s','static-pages:s', 'debug-daemon',
'nosleep', 'makepod:s','debug','restart', 'filter=s', 'nodaemon|nodemon',
- 'config=s', 'check', 'gen-examples') or pod2usage(2);
+ 'config=s', 'check', 'gen-examples', 'reload') or pod2usage(2);
if($opt{version}) { print "$RCS_VERSION\n"; exit(0) };
if(exists $opt{man}) {
if ($opt{man}) {
@@ -2802,7 +2848,12 @@ sub main (;$) {
load_cfg $cfgfile;
if(defined $opt{'static-pages'}) { makestaticpages $cfg, $opt{'static-pages'}; exit 0 };
if($opt{email}) { enable_dynamic $cfg, $cfg->{Targets},"",""; exit 0 };
- if($opt{restart}) { kill_smoke $cfg->{General}{piddir}."/smokeping.pid";};
+ if($opt{restart}) { kill_smoke $cfg->{General}{piddir}."/smokeping.pid", SIGINT;};
+ if($opt{reload}) {
+ kill_smoke $cfg->{General}{piddir}."/smokeping.pid", SIGHUP;
+ print "HUP signal sent to the running SmokePing process, exiting.\n";
+ exit 0;
+ };
if($opt{logfile}) { initialize_filelog($opt{logfile}) };
if (not keys %$probes) {
do_log("No probes defined, exiting.");
@@ -2817,35 +2868,22 @@ sub main (;$) {
}
do_log "Smokeping version $VERSION successfully launched.";
+RESTART:
my $myprobe;
+ my $multiprocessmode;
my $forkprobes = $cfg->{General}{concurrentprobes} || 'yes';
if ($forkprobes eq "yes" and keys %$probes > 1 and not $opt{debug}) {
+ $multiprocessmode = 1;
my %probepids;
my $pid;
do_log("Entering multiprocess mode.");
- for my $p (keys %$probes) {
- if ($probes->{$p}->target_count == 0) {
- do_log("No targets defined for probe $p, skipping.");
- next;
- }
- my $sleep_count = 0;
- do {
- $pid = fork;
- unless (defined $pid) {
- do_log("Fatal: cannot fork: $!");
- die "bailing out"
- if $sleep_count++ > 6;
- sleep 10;
- }
- } until defined $pid;
- $myprobe = $p;
- goto KID unless $pid; # child skips rest of loop
- do_log("Child process $pid started for probe $myprobe.");
- $probepids{$pid} = $myprobe;
- }
+ $pid = start_probes(\%probepids);
+ $myprobe = $probepids{$pid};
+ goto KID unless $pid; # child skips rest of loop
# parent
do_log("All probe processes started successfully.");
my $exiting = 0;
+ my $reloading = 0;
for my $sig (qw(INT TERM)) {
$SIG{$sig} = sub {
do_log("Got $sig signal, terminating child processes.");
@@ -2854,7 +2892,7 @@ sub main (;$) {
my $now = time;
while(keys %probepids) { # SIGCHLD handler below removes the keys
if (time - $now > 2) {
- do_log("Can't terminate all child processes, giving up.");
+ do_log("Fatal: can't terminate all child processes, giving up.");
exit 1;
}
sleep 1;
@@ -2868,14 +2906,64 @@ sub main (;$) {
my $p = $probepids{$dead};
$p = 'unknown' unless defined $p;
do_log("Child process $dead (probe $p) exited unexpectedly with status $?.")
- unless $exiting;
+ unless $exiting or $reloading;
delete $probepids{$dead};
}
};
- sleep while 1; # just wait for the signals
+ my $gothup = 0;
+ $SIG{HUP} = sub {
+ do_debuglog("Got HUP signal.");
+ $gothup = 1;
+ };
+ while (1) { # just wait for the signals
+ sleep;
+ next unless $gothup;
+ $reloading = 1;
+ $gothup = 0;
+ my $oldprobes = $probes;
+ $reloading = 0, next unless reload_cfg($cfgfile);
+ do_debuglog("Restarting probe processes " . join(",", keys %probepids) . ".");
+ kill SIGHUP, $_ for (keys %probepids);
+ my $i=0;
+ while (keys %probepids) {
+ sleep 1;
+ if ($i % 10 == 0) {
+ do_log("Waiting for child processes to terminate.");
+ }
+ $i++;
+ my %termsent;
+ for (keys %probepids) {
+ my $step = $oldprobes->{$probepids{$_}}->step;
+ if ($i > $step) {
+ do_log("Child process $_ took over its step value to terminate, killing it with SIGTERM");
+ if (kill SIGTERM, $_ == 0 and exists $probepids{$_}) {
+ do_log("Fatal: Child process $_ has disappeared? This shouldn't happen. Giving up.");
+ exit 1;
+ } else {
+ $termsent{$_} = time;
+ }
+ }
+ for (keys %termsent) {
+ if (exists $probepids{$_}) {
+ if (time() - $termsent{$_} > 2) {
+ do_log("Fatal: Child process $_ took over 2 seconds to exit on TERM signal. Giving up.");
+ exit 1;
+ }
+ } else {
+ delete $termsent{$_};
+ }
+ }
+ }
+ }
+ $reloading = 0;
+ do_log("Child processes terminated, restarting with new configuration.");
+ $SIG{CHLD} = 'DEFAULT'; # restore
+ goto RESTART;
+ }
do_log("Exiting abnormally - this should not happen.");
exit 1; # not reached
} else {
+ $multiprocessmode = 0;
if ($forkprobes ne "yes") {
do_log("Not entering multiprocess mode because the 'concurrentprobes' variable is not set.");
for my $p (keys %$probes) {
@@ -2890,21 +2978,26 @@ sub main (;$) {
do_log("Not entering multiprocess mode for just a single probe.");
$myprobe = (keys %$probes)[0]; # this way we won't ignore a probe-specific step parameter
}
- for my $sig (qw(INT TERM)) {
- $SIG{$sig} = sub {
- do_log("Got $sig signal, terminating.");
- exit 1;
- }
- }
}
KID:
my $offset;
my $step;
+ my $gothup = 0;
+ my $changeprocessnames = $cfg->{General}{changeprocessnames} ne "no";
+ $SIG{HUP} = sub {
+ do_log("Got HUP signal, " . ($multiprocessmode ? "exiting" : "restarting") . " gracefully.");
+ $gothup = 1;
+ };
+ for my $sig (qw(INT TERM)) {
+ $SIG{$sig} = sub {
+ do_log("got $sig signal, terminating.");
+ exit 1;
+ }
+ }
if (defined $myprobe) {
$offset = $probes->{$myprobe}->offset || 'random';
$step = $probes->{$myprobe}->step;
- $0 .= " [$myprobe]" unless defined $cfg->{General}{changeprocessnames}
- and $cfg->{General}{changeprocessnames} eq "no";
+ $0 .= " [$myprobe]" if $changeprocessnames;
} else {
$offset = $cfg->{General}{offset} || 'random';
$step = $cfg->{Database}{step};
@@ -2935,6 +3028,7 @@ KID:
do_debuglog("Sleeping $sleeptime seconds.");
}
sleep $sleeptime;
+ last if checkhup($multiprocessmode, $gothup) && reload_cfg($cfgfile);
}
my $now = time;
run_probes $probes, $myprobe; # $myprobe is undef if running without 'concurrentprobes'
@@ -2951,9 +3045,41 @@ KID:
do_log($warn);
}
}
+ last if checkhup($multiprocessmode, $gothup) && reload_cfg($cfgfile);
}
+ $0 =~ s/ \[$myprobe\]$// if $changeprocessnames;
+ goto RESTART;
+}
+
+sub checkhup ($$) { # Act on a pending HUP flag: returns 0 if none, 1 if a restart is due, or exits in multiprocess mode.
+ my $multiprocessmode = shift; # true selects the exit branch, false the restart branch
+ my $gothup = shift; # a copy of the caller's flag; nothing here resets the caller's variable
+ if ($gothup) {
+ if ($multiprocessmode) {
+ do_log("Exiting due to HUP signal.");
+ exit 0; # ends this process with status 0
+ } else {
+ do_log("Restarting due to HUP signal.");
+ return 1;
+ }
+ }
+ return 0;
}
+sub reload_cfg ($) { # Reload the configuration via load_cfg; returns 1 on success, 0 if load_cfg died.
+ my $cfgfile = shift; # path handed through to load_cfg
+ my ($oldcfg, $oldprobes) = ($cfg, $probes); # keep the current values so a failed reload can be rolled back
+ do_log("Reloading configuration.");
+ eval { load_cfg($cfgfile) }; # trap any die raised while loading
+ if ($@) {
+ do_log("Reloading configuration from $cfgfile failed: $@");
+ ($cfg, $probes) = ($oldcfg, $oldprobes); # restore the configuration that was active before the attempt
+ return 0;
+ }
+ return 1;
+}
+
+
sub gen_imgs ($){
my $cfg = shift;