#!/usr/bin/perl
#
# The CLX watchdog - now evaluating the status file
#
# Last change: Wed Feb 17 18:13:12 GMT 1999

require 'getopts.pl';
require 'ctime.pl';

# Home directories of the CLX user and the PostgreSQL user, taken from
# the password database (field 7 of getpwnam's result list).
$home = (getpwnam('clx_us'))[7];
$pg_home = (getpwnam('postgres'))[7];

# Files below the CLX home directory.
$status = "$home/log/clx_stat";     # status file evaluated by the watchdog
$clx_lock = "$home/log/clx_lock";   # maintenance lock
$logfile = "$home/log/crash.log";   # target of the snapshot

# Other files and programs.
$lock = "/tmp/clx_watchdog.lock";
$pg_vlock = "$pg_home/data/base/clx_db/pg_vlock";
$wall = "/usr/bin/wall";

# Option flags, all off until the command line says otherwise.
$snapshot = $verbose = $force = $testmode = $quiet = 0;

$limit = 300; # 300 seconds max without a sign of life
$ENV{"TZ"} = "GMT"; # this is for syslog to make it always log in UTC

# Name under which the script was called: the last directory component
# and the file name of $0, joined by a slash.
$prog = join('/', (split(m{/}, $0))[-2, -1]);
if ( $debug ) {
	# NOTE(review): $debug is not assigned anywhere in the visible code.
	print "$prog\n";
}

# Parse the command line; Getopts stores the results in $opt_<letter>.
Getopts('shl:vftq');

############# Some control functions for the watchdog #############
#
# These functions are accessed by calling this script by different
# names. "$prog" contains the name under which it was called (actually
# only the last directory and the filename). This script is linked to
# exec/privileg/show/watchdog and other such locations.
#

# -q silences the informational output of the control functions.
$quiet = 1 if ( length($opt_q) > 0 );

# Control functions, tried in this order. Each entry holds the name the
# script may be called by, the equivalent first command line argument,
# and the subroutine to run. The first match is executed and the script
# exits afterwards.
my @control_actions = (
	[ "show/watchdog",  "show", \&showstatus   ],
	[ "set/watchdog",   "on",   \&watchdog_on  ],
	[ "set/nowatchdog", "off",  \&watchdog_off ],
);
foreach my $action (@control_actions) {
	my ($link_name, $keyword, $handler) = @$action;
	if ( $prog eq $link_name || $ARGV[0] eq $keyword ) {
		&$handler();
		exit;
	}
}

# -h: print the usage text and leave with exit status 1.
# NOTE(review): the option string also accepts -q, which the usage text
# does not mention.
if ( length($opt_h) > 0 ) {
	print <<"USAGE";
CLX Watchdog - watches the status file $status.\n

Usage: clx_watchdog [-fhsvt] [-l <seconds>] [on|off|show]

    -f  force: use clx -x to shut down CLX
    -h	display this help
    -s	snapshot mode - tries to turn on debug logging for 
        a short period before CLX is shut down and restarted 
        (not yet operational).
    -v  be verbose (for debugging)
    -t  test only, do not really shut down CLX
    -l  delay in seconds (default 300 seconds)

    on  turn on watch-dogging
   off  turn off watch-dogging
  show  show current status

These three commands can only work when clx_watchdog is called by root's
crontab regularly (10 minutes recommended).
USAGE
	exit(1);
}

############# This is the real watchdog #############
#
# Sequence of one watchdog run:
#   1. insist on root and take over the command line options
#   2. leave quietly while a VACUUM lock ($pg_vlock) or the maintenance
#      lock ($clx_lock) exists
#   3. take the run lock $lock, or clean up a stale one and re-exec
#   4. inspect $status: a timestamp older than $limit seconds or a
#      non-empty file means CLX has to be restarted
#   5. release the run lock
#

die "You must be root to run $0.\n" if ($< != 0);

if (length($opt_l) > 0) {
	$limit = $opt_l;
	print "Limit: $limit\n";
}

if (length($opt_t) > 0) {
	$testmode = 1;
}

if (length($opt_s) > 0) {
	$snapshot = 1;
	print "Snapshot will be made.\n";
}

if (length($opt_v) > 0) {
	$verbose = 1;
}

if (length($opt_f) > 0) {
	$force = 1;
}

if ( -e $pg_vlock ) {
	sys_log("clx_watchdog: VACUUM is running - exiting.");
	exit 0;
}

if ( -e $clx_lock ) {
	sys_log("clx_watchdog: Maintenance lock file $clx_lock found - exiting.");
	$msg = ">>> CLX watchdog is currently disabled, remember to enable it again later.";
	# Remind logged-in users; a missing wall binary must not go unnoticed.
	if ( open(WALL, '|-', $wall) ) {
		print WALL $msg;
		close(WALL);
	} else {
		sys_log("clx_watchdog: Cannot run $wall: $!");
	}
	exit 0;
}


if ( -e $lock ) {
	# The lock file holds the PID of the process that created it.
	$pid = '';
	if ( open(LCK, '<', $lock) ) {
		$pid = <LCK>;
		close(LCK);
	}
	$pid = '' unless ( defined($pid) );
	$pid =~ s/\s+$//;

	# Only a plain positive number is accepted as a PID. An empty value
	# would turn kill(9,$pid) into "kill 9, 0", which signals the whole
	# process group of this script, and "/proc/" always exists.
	my $pid_ok = ( $pid =~ /\A[1-9]\d*\z/ );

	($filedate) =  (stat($lock))[9];
	$now = time();
	$diff = $now - $filedate;

	# Without a usable PID the owner cannot be checked; assume it is
	# alive until the lock is older than $limit.
	my $owner_alive = $pid_ok ? ( -d "/proc/$pid" ) : 1;

	if ( $owner_alive && $diff < $limit ) {

		sys_log("clx_watchdog: Another watchdog is already running from $diff seconds ago - exiting.");
		exit 0;
	} else {

		# NOTE(review): the PID may have been reused by an unrelated
		# process; nothing here verifies that it is a watchdog.
		kill(9,$pid) if ( $pid_ok );
		sys_log("clx_watchdog: Removing stale lockfile $lock (PID=$pid)");
		# Parentheses are required: "unlink $lock || die" binds the
		# || to $lock and never reports a failed unlink.
		unlink($lock) || die "Cannot unlink $lock.\n";

		# Start over with -f (skip the graceful "clx -s"), but keep the
		# other options - above all -t, so that a test run never turns
		# into a real restart.
		my @restart_args = ('-f');
		push(@restart_args, '-t') if ( $testmode );
		push(@restart_args, '-s') if ( $snapshot );
		push(@restart_args, '-v') if ( $verbose );
		push(@restart_args, '-l', $limit) if ( length($opt_l) > 0 );
		exec($0, @restart_args);
		# Only reached when exec failed; do not continue without a lock.
		die "Cannot re-exec $0: $!\n";
	}
} else {

	# NOTE(review): existence check and creation are two separate steps
	# on a predictable name in /tmp - consider an exclusive create.
	open(LCK, '>', $lock) || die "Cannot open lock file $lock.\n";
	print LCK "$$\n";
	close(LCK);
}

if ( ! -e $status ) {
	sys_log("clx_watchdog: $status does not exist.");
	sys_log("clx_watchdog: CLX was probably shut down manually.");
	# The "exit 0" is disabled on purpose in the original: processing
	# continues and the missing timestamp below leads to a restart.
#	exit 0;
}

($size,$filedate) =  (stat($status))[7,9];
$now = time();

# A failed stat leaves $filedate undefined; count that as "no sign of
# life since the epoch", which is what the plain subtraction did too.
$diff = $now - ( defined($filedate) ? $filedate : 0 );
$restart = 0;
if ( $diff > $limit ) {

	sys_log("clx_watchdog: CLX is dead - probably clx_ctl has died.");
	$restart = 1;
}

# Numeric test on a defined size: the former string compare was also true
# for a failed stat and logged a bogus "processes have died" entry.
if ( defined($size) && $size != 0 ) {

	sys_log("clx_watchdog: Some CLX processes have died:");
	# The first line of the status file is logged as-is.
	if ( open(IN, '<', $status) ) {
		$which = <IN>;
		close(IN);
		$which = '' unless ( defined($which) );
		chomp($which);
	} else {
		$which = "(cannot read $status: $!)";
	}
	sys_log("clx_watchdog: $which");
	$restart = 1;

}

if ( $restart ) {


	if ( ! $testmode ) {

		&snapshot if ( $snapshot );
		if ( ! $force ) {
			sys_log("clx_watchdog: doing clx -s");
			my $rc = system("/bin/su","-","clx_us","-c","$home/bin/clx -s");
			sys_log("clx_watchdog: clx -s failed (status $rc)") if ( $rc != 0 );
		}
		sys_log("clx_watchdog: running ~clx_us/tools/startup");
		my $rc = system("$home/tools/startup");
		sys_log("clx_watchdog: startup failed (status $rc)") if ( $rc != 0 );

	} else {

		sys_log("clx_watchdog: testmode - SHUT DOWN.");
	}

} else {

	sys_log("clx_watchdog: last timestamp $diff seconds ago - OK.");

}

# Release the run lock taken above.
unlink($lock);

#
# sys_log($msg) - write an entry to the system log
#
# Hands $msg to /usr/bin/logger twice, once with priority local5.info
# and once with local5.debug.
#
# logger is called with an argument list, so no shell ever sees the
# message: quotes, backticks or "$" in $msg (part of it may come from
# the status file) are logged literally instead of being interpreted.
# The "--" keeps a message starting with "-" from being read as an option.
#
sub sys_log {
	my ($msg) = @_;
	$msg = '' unless ( defined($msg) );
	$msg =~ s/[\r\n]+$//;	# one log entry per call, no trailing newline

	foreach my $priority ('local5.info', 'local5.debug') {
		system('/usr/bin/logger', '-p', $priority, '--', $msg);
	}
}

# snapshot() - save the current process list to $logfile
#
# Prints "Snapshot" to STDOUT, truncates $logfile and fills it with the
# output of "/bin/ps auxw". Dies when $logfile cannot be opened. When ps
# cannot be started, a note is written to $logfile and to STDERR, and the
# sub returns normally so the caller can carry on.
sub snapshot {
	print "Snapshot\n";
	open(my $log, '>', $logfile) || die "Cannot open log file $logfile.\n";

	# List form: ps is started directly, without a shell.
	if ( open(my $ps, '-|', '/bin/ps', 'auxw') ) {
		while ( my $line = <$ps> ) {
			print $log $line;
		}
		close($ps);
	} else {
		my $problem = "Cannot run /bin/ps: $!";
		print $log "$problem\n";
		warn "$problem\n";
	}

	close($log);
}

# showstatus() - report whether the watchdog is enabled or disabled
#
# Prints nothing when $quiet is set. Otherwise a readable $lock file
# means "disabled", and its modification time is shown as the moment the
# watchdog was disabled; without it the watchdog is reported as enabled.
#
# NOTE(review): $lock is the same file the main watchdog run uses as its
# run lock, while the main run checks $clx_lock to decide that it is
# disabled - confirm which file is meant here.
sub showstatus() {
	return if ($quiet);

	# No readable lock file: nothing more to find out.
	return print "CLX watchdog is enabled.\n" unless (-r $lock);

	# ctime() ends its result with a newline, which chop removes.
	my $mtime = (stat($lock))[9];
	$filedate = &ctime($mtime);
	chop($filedate);
	return printf("CLX watchdog is disabled since %s.\n",$filedate);
}

# watchdog_off() - create the lock file $lock and show the new status
#
# An existing $lock is left untouched. Otherwise the file is created and
# the PID of this process is written into it. The progress messages are
# suppressed by $quiet; a failure to create the file is always reported
# on STDERR, because the watchdog then stays enabled. The sub ends by
# calling showstatus().
#
# NOTE(review): the main watchdog run treats $lock as its own run lock
# and removes it as stale once the recorded PID is gone, whereas it
# checks $clx_lock to decide that it is disabled - confirm that $lock is
# the intended file.
sub watchdog_off {
	if ( -e $lock ) {
		print "Lockfile $lock already exists.\n" if ( ! $quiet);
	} elsif ( open(LCK, '>', $lock) ) {
		print "Creating lockfile $lock\n" if ( ! $quiet);
		print LCK "$$\n";
		close(LCK);
	} else {
		warn "Cannot create lockfile $lock: $!\n";
	}
	showstatus();
}

# watchdog_on() - remove the lock file $lock and show the new status
#
# Announces the removal only when the file exists and $quiet is not set.
# The unlink is attempted in any case and its result is not checked.
# The sub ends by calling showstatus().
sub watchdog_on {
	my $announce = ( -e $lock && ! $quiet );
	if ( $announce ) {
		print "Removing lockfile $lock\n";
	}

	unlink($lock);
	showstatus();
}
