Skip to content

Commit

Permalink
rework node control logging
Browse files Browse the repository at this point in the history
  • Loading branch information
sni committed Sep 24, 2024
1 parent f543705 commit 4a75882
Show file tree
Hide file tree
Showing 5 changed files with 329 additions and 183 deletions.
186 changes: 158 additions & 28 deletions lib/Thruk/Utils/External.pm
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ sub cmd {
my($is_parent, $parent_res) = _fork_twice($c, $id, $dir, $conf);
return $parent_res if $is_parent;

do_child_stuff($c, $dir, $id);
_init_child_process($c, $dir, $id, $conf);
$cmd = $cmd.'; echo $? > '.$dir."/rc" unless $conf->{'no_shell'};
exec($cmd) or exit(1); # just to be sure
}
Expand Down Expand Up @@ -136,7 +136,7 @@ sub perl {

my $err;
eval {
do_child_stuff($c, $dir, $id);
_init_child_process($c, $dir, $id, $conf);
$c->stats->profile(begin => 'External::perl');

do {
Expand All @@ -163,6 +163,7 @@ sub perl {
Thruk::Utils::IO::write($dir."/rc", ($rc ? 0 : 1));
Thruk::Utils::IO::write($dir."/perl_res", (defined $rc && ref $rc eq '') ? Thruk::Utils::Encode::encode_utf8($rc) : "", undef, 1);

wrap_prefix_output_stop();
open(*STDERR, '>>', '/dev/null') or _warn("cannot redirect stderr to /dev/null");
open(*STDOUT, '>>', '/dev/null') or _warn("cannot redirect stdout to /dev/null");
};
Expand Down Expand Up @@ -201,7 +202,11 @@ sub perl {
$err = $@ unless $err;
$c->stats->profile(end => 'External::perl');
save_profile($c, $dir);
_save_log_archive($c, $dir, $conf->{'log_archive'}) if $conf->{'log_archive'};
wrap_prefix_output_stop();
if($conf->{'log_archive'}) {
unlink($dir."/stdout");
Thruk::Utils::IO::write($dir."/stdout", Thruk::Utils::IO::read($conf->{'log_archive'}));
}
if($err) {
eval {
Thruk::Utils::IO::write($dir."/stderr", "ERROR: perl eval failed:\n".$err, undef, 1);
Expand Down Expand Up @@ -702,6 +707,52 @@ sub wait_for_job {
return $is_running;
}

##########################################################

=head2 wait_for_peer_job
wait_for_peer_job($c, $peer, $jobid, [$poll_interval = 3], [$max_wait = 60], [$print = 0])
waits $max_wait seconds for job on peer to complete and returns the job data.
=cut
sub wait_for_peer_job {
my($c, $peer, $job, $poll_interval, $max_wait, $print) = @_;
$max_wait = 60 unless $max_wait;
$poll_interval = 3 unless $poll_interval;
my $end = time() + $max_wait;
my $jobdata;

my($already_out, $already_err) = (0, 0);
my $print_remaining = sub {
return unless $jobdata;
return unless $print;
if($jobdata->{'stdout'}) {
my $out = substr($jobdata->{'stdout'}, $already_out);
print $out;
$already_out += length($out);
}
if($jobdata->{'stderr'}) {
my $out = substr($jobdata->{'stderr'}, $already_err);
print $out;
$already_err += length($out);
}
};

while(time() < $end) {
eval {
$jobdata = $peer->job_data($c, $job);
};
if($jobdata && !$jobdata->{'is_running'}) {
last;
}
&{$print_remaining}();
sleep($poll_interval);
}
&{$print_remaining}();
return($jobdata);
}

##############################################

=head2 update_status
Expand Down Expand Up @@ -779,28 +830,9 @@ sub save_profile {
}

##############################################
# save stdout and stderr into new logfile
sub _save_log_archive {
my($c, $job_dir, $logfile) = @_;

my $out = Thruk::Utils::IO::saferead($job_dir.'/stdout') // '';
my $err = Thruk::Utils::IO::saferead($job_dir.'/stderr') // '';
Thruk::Utils::IO::write($logfile, $out.$err);

return;
}

##############################################

=head2 do_child_stuff
do_child_stuff($c, $dir, $id)
do all child things after a fork
=cut
sub do_child_stuff {
my($c, $dir, $id, $keep_stdout_err) = @_;
# initialize children process
sub _init_child_process {
my($c, $dir, $id, $conf) = @_;

confess("no c") unless $c;

Expand Down Expand Up @@ -838,14 +870,19 @@ sub do_child_stuff {
$c->{'app'}->{'pool'}->shutdown_threads() if $c->{'app'}->{'pool'};

# now make sure stdout and stderr point to somewhere, otherwise we get sigpipes pretty soon
unless($keep_stdout_err) {
if($conf->{'log_archive'}) {
unlink($dir."/stdout");
symlink($conf->{'log_archive'}, $dir."/stdout");
wrap_prefix_output($conf->{'log_archive'});
} else {
wrap_prefix_output_stop();
my $fallback_log = '/dev/null';
$fallback_log = $c->config->{'log4perl_logfile_in_use'} if $c->config->{'log4perl_logfile_in_use'};
$fallback_log = $ENV{'OMD_ROOT'}.'/var/log/thruk.log' if $ENV{'OMD_ROOT'};
$fallback_log = $dir."/stderr" if $dir;
open(*STDERR, ">>", $fallback_log) || die "can't reopen stderr to $fallback_log: $!";
$fallback_log = $dir."/stdout" if $dir;
open(*STDOUT, ">>", $fallback_log) || die "can't reopen stdout to $fallback_log: $!";
$fallback_log = $dir."/stderr" if $dir;
open(*STDERR, ">>", $fallback_log) || die "can't reopen stderr to $fallback_log: $!";
}

## no critic
Expand Down Expand Up @@ -1233,4 +1270,97 @@ sub _reap_pending_childs {

##############################################

=head2 wrap_prefix_output
wrap stdout/stderr and add timestamp prefix.
=cut
sub wrap_prefix_output {
my($target_file) = @_;

open(my $fh, '>', $target_file) || die("cannot open log file: $target_file: $!");

## no critic
tie *STDOUT, 'Thruk::Utils::External', ($fh);
tie *STDERR, 'Thruk::Utils::External', ($fh);
## use critic

STDOUT->autoflush(1);
STDERR->autoflush(1);

return;
}

##############################################

=head2 wrap_prefix_output_stop
stop wrapping stdout/stderr
=cut
sub wrap_prefix_output_stop {
untie *STDOUT;
untie *STDERR;

return;
}

##############################################
sub TIEHANDLE {
my($class, $fh) = @_;
my $self = {
fh => $fh,
newline => 1,
};
bless $self, $class;
return($self);
}

##############################################
sub BINMODE {
my($self, $mode) = @_;
return binmode $self->{'fh'}, $mode;
}

##############################################
sub PRINTF {
my($self, $fmt, @data) = @_;
return($self->PRINT(sprintf($fmt, @data)));
}

##############################################
sub PRINT {
my($self, @data) = @_;

my $last_newline = $self->{'newline'};
$self->{'newline'} = (join("", @data) =~ m/\n$/mx) ? 1 : 0;

my $fh = $self->{'fh'};
if(!$last_newline && !$self->{'newline'}) {
# continue printing
CORE::print($fh @data);
}
elsif(!$self->{'newline'}) {
for my $msg (@data) {
CORE::print($fh Thruk::Utils::Log::time_prefix(), $msg);
}
}
else {
for my $msg (@data) {
for my $l (split/\n/mx, $msg) {
CORE::print($fh Thruk::Utils::Log::time_prefix(), $l, "\n");
}
}
}
return;
}

##############################################
sub CLOSE {
my($self) = @_;
return CORE::close($self->{'fh'});
}

##############################################

1;
70 changes: 33 additions & 37 deletions lib/Thruk/Utils/LMD.pm
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ use warnings;
use strict;
use Carp qw/confess/;
use File::Copy qw/copy move/;
use POSIX ();
use Time::HiRes ();

use Thruk::Timer qw/timing_breakpoint/;
Expand Down Expand Up @@ -365,53 +364,50 @@ sub kill_if_not_responding {
my $ctime = (stat($pid_file))[10];
return if($ctime > time() - 120);

my $data;
my $pid = fork();
if($pid == -1) { die("fork failed: $!"); }
# kill in background job
my $job = Thruk::Utils::External::perl($c, {
expr => 'Thruk::Utils::LMD::_kill_step2($c, '.$lmd_timeout.', '.$lmd_pid.')',
message => 'killing lmd',
timeout => $lmd_timeout+2,
background => 1,
});

if(!$pid) {
require Thruk::Utils::External;
Thruk::Utils::External::do_child_stuff($c, 0, 0);
alarm($lmd_timeout);
eval {
$data = $c->db->lmd_peer->_raw_query("GET sites\n");
};
my $err = $@;
alarm(0);
if($err) {
_error("lmd not responding, killing with force: err - ".$err);
_error($data);
kill('USR1', $lmd_pid);
sleep(1);
kill(2, $lmd_pid);
sleep(1);
kill(9, $lmd_pid);
}
exit 0;
}
my $extra = 5;
my $is_running = Thruk::Utils::External::wait_for_job($c, $job, $lmd_timeout+$extra);

my $waited = 0;
my $extra = 5;
my $rc = -1;
while($waited++ <= ($lmd_timeout+$extra) && $rc != 0) {
POSIX::waitpid($pid, POSIX::WNOHANG);
$rc = $?;
if($is_running) {
_error("lmd not responding, killing pid $lmd_pid");
kill('USR1', $lmd_pid);
kill(2, $lmd_pid);
sleep(1);
kill(9, $lmd_pid);
}
if($rc != 0) {
_error("lmd not responding, killing with force: rc - ".$rc." - ".($! || ""));

return;
}

########################################
sub _kill_step2 {
my($c, $lmd_timeout, $lmd_pid) = @_;

my $data;
alarm($lmd_timeout);
eval {
$data = $c->db->lmd_peer->_raw_query("GET sites\n");
};
my $err = $@;
alarm(0);
if($err) {
_error("lmd not responding, killing with force: err - ".$err);
_error($data);
kill('USR1', $lmd_pid);
kill(2, $pid);
sleep(1);
kill(2, $lmd_pid);
sleep(1);
kill(9, $lmd_pid);
kill(9, $pid);
sleep(1);
POSIX::waitpid(-1, POSIX::WNOHANG);
}

return;
return(1);
}

########################################
Expand Down
6 changes: 3 additions & 3 deletions lib/Thruk/Utils/Log.pm
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ sub time_prefix {
sub wrap_stdout2log {
my($capture, $tmp);
## no critic
open($capture, '>', \$tmp) or die("cannot open stdout capture: $!");
open($capture, '>', \$tmp) || die("cannot open stdout capture: $!");
tie *$capture, 'Thruk::Utils::Log', (*STDOUT);
select $capture;
STDOUT->autoflush(1);
Expand Down Expand Up @@ -351,7 +351,7 @@ sub wrap_stdout2log_stop {
sub wrap_stderr2log {
my($capture, $tmp);
## no critic
open($capture, '>', \$tmp) or die("cannot open stdout capture: $!");
open($capture, '>', \$tmp) || die("cannot open stdout capture: $!");
tie *$capture, 'Thruk::Utils::Log', (*STDERR);
select $capture;
STDERR->autoflush(1);
Expand All @@ -368,7 +368,7 @@ sub wrap_stderr2log {
=cut
sub wrap_stderr2log_stop {
## no critic
select *STERR;
select *STDERR;
## use critic
return;
}
Expand Down
Loading

0 comments on commit 4a75882

Please sign in to comment.