#! /usr/bin/env bash
#
# Cleanup tasks after Zeek termination:  move the node's working directory
# to a tmp dir and create a new working directory, create a crash report if
# the node crashed, wait for this node's archive-log processes to finish,
# try to archive any remaining logs (and send an email if this fails), and
# finally (if the node didn't crash) remove the tmp dir if all logs were
# successfully archived.
#
# post-terminate <type> <dir> [<crashflag>]
#
# <type> is the node's type ("manager", "worker", etc.).
# <dir> is the node's working directory.
#
# If <crashflag> is not set, then ZeekControl has stopped Zeek normally.
# If <crashflag> is "crash", then ZeekControl has determined that Zeek crashed
# and this script will return information about the crash on stdout which is
# suitable for mailing to the user.  If <crashflag> is "killed", then
# ZeekControl terminated Zeek forcefully (but intentionally) by SIGKILL while
# trying to stop Zeek.

# Notify the user by email that one or more logs could not be archived.
#
# Globals read:
#   mailarchivelogfail - when "0", no email is sent (presumably a zeekctl
#                        option provided by zeekctl-config.sh; confirm there)
#   scriptdir          - directory containing the send-mail helper
#   nodename           - node name placed in the subject line
#   postdir            - directory named in the message body
#
# The subject is passed to send-mail as its only argument and the message
# body is supplied on its stdin.
sendfailuremail()
{
    if [ "${mailarchivelogfail}" = "0" ]; then
        return
    fi

    # The helper path is quoted so that a script directory containing
    # whitespace does not get split into several words.
    "$scriptdir"/send-mail "archive log failure on node $nodename" <<_EOF_
Unable to archive one or more logs in directory:
${postdir}
Check the post-terminate.out file in that directory for any error messages.
_EOF_
}

# Validate the command line: <type> <dir> [<crashflag>].
if [ $# -lt 2 ] || [ $# -gt 3 ]; then
    echo "post-terminate: wrong usage: $*"
    exit 1
fi

nodetype=$1
dir=$2
# The node name is the last path component of the working directory.
nodename=$(basename "$dir")

if [ ! -d "$dir" ]; then
    echo "post-terminate: directory not found: $dir"
    exit 1
fi

# Translate the optional third argument into flags.  Any other value (or no
# value at all) means a normal stop.
crash=0
killed=0
case "${3:-}" in
    crash)
        crash=1
        ;;
    killed)
        killed=1
        ;;
esac

# Load the zeekctl configuration from the directory this script lives in.
# The variables tmpdir and scriptsdir used below are not set anywhere in this
# script, so they are presumably defined by zeekctl-config.sh.
scriptdir=$(dirname "$0")
. "$scriptdir/zeekctl-config.sh"

if [ -z "${tmpdir}" ]; then
    echo "post-terminate: zeekctl option tmpdir not set"
    exit 1
fi

if [ ! -d "${tmpdir}" ]; then
    mkdir "${tmpdir}"
fi

# Two renderings of "now": one for the tmp directory name and one in the
# YY-MM-DD_HH.MM.SS form that is later used as the end time of each log.
tmpdirtimestamp=$(date +%Y-%m-%d-%H-%M-%S)
postterminatetime=$(date +%y-%m-%d_%H.%M.%S)

# The PID of this script is appended to keep the directory name unique.
postdir=${tmpdir}/post-terminate-$nodetype-$tmpdirtimestamp-$$

if [ $crash -eq 1 ]; then
    postdir=$postdir-crash
fi

# Move the old working directory out of the way and create a fresh one.
mv "$dir" "$postdir" || exit 1

mkdir "$dir"

# Everything below uses paths relative to the moved directory, so continuing
# after a failed cd would operate on (and possibly archive logs from) the
# wrong directory.
if ! cd "$postdir"; then
    echo "post-terminate: cannot change to directory: $postdir"
    exit 1
fi

# The .state directory is carried over into the new working directory.
if [ -d .state ]; then
    mv .state "$dir"
fi

if [ $crash -eq 1 ]; then
    # Output the crash report and save it to disk in case the user doesn't
    # receive the email.
    "${scriptsdir}"/crash-diag -c "$postdir" > .crash-diag.out
    cat .crash-diag.out
fi

# The .startup file is needed later to determine the start time of logs.
if [ ! -f .startup ]; then
    echo "post-terminate: file not found: .startup"
    exit 1
fi

# Block until no archive-log PID files remain in the current directory.
#
# A PID file is a regular file named ".archive-log.*.tmp" directly inside the
# current directory.  Once per second, every non-empty PID file whose process
# no longer exists (according to "ps -p") is removed.  Empty PID files are
# left alone and checked again on the next pass.
#
# Globals written: pidfiles (left empty on return).
wait_for_archivelog()
{
    local pfile

    # Wait for any archive-log processes to finish, so that the caller can
    # either launch new ones or remove this directory.
    while :; do
        # Gather list of all archive-log PID files.
        pidfiles=$(find . -maxdepth 1 -type f -name '.archive-log.*.tmp')
        if [ -z "$pidfiles" ]; then
            break
        fi

        # Read one path per line instead of word-splitting the list, so a
        # file name containing whitespace is still handled (and removed).
        while IFS= read -r pfile; do
            # If PID file is empty, then check it again later.
            if [ -s "$pfile" ]; then
                # The file content is deliberately left unquoted so that
                # whitespace around the PID is ignored.
                # shellcheck disable=SC2046
                if ! ps -p $(cat "$pfile") > /dev/null 2>&1; then
                    # No such process exists, so remove PID file
                    rm -f "$pfile"
                fi
            fi
        done <<< "$pidfiles"

        sleep 1
    done
}

# Split a log file name (without its ".log" extension) into a base name and
# a start timestamp, if the name ends in a recognized timestamp.
#
# Arguments:
#   $1 - the name to examine
#
# Globals written (only when a timestamp suffix is recognized):
#   basename - the name with the timestamp suffix removed
#   strt     - the timestamp in YY-MM-DD_HH.MM.SS form
#
# When no recognized timestamp is present, neither variable is touched, so
# the caller's values stay in effect.
parse_filename()
{
    local filename=$1
    local tmp ts

    # All expansions are quoted and fed through printf so that names with
    # repeated whitespace, glob characters or a leading "-" reach sed
    # unmodified; otherwise such a name would compare unequal to its own
    # sed output and be mistaken for a timestamped name.

    # Try to remove suffix ".YYYY-MM-DD-HH-MM-SS" (this format is specified in
    # Log::default_rotation_date_format and is used by the ascii writer script
    # to rename a log immediately after Zeek rotates it).
    tmp=$(printf '%s\n' "$filename" | sed 's/[.][1-2][0-9][0-9][0-9]-[0-1][0-9]-[0-3][0-9]-[0-2][0-9]-[0-5][0-9]-[0-5][0-9]$//')
    if [ "$filename" != "$tmp" ]; then
        basename=$tmp
        # Remove the base name and '.' to get the timestamp.  The base name
        # is quoted so it is matched literally rather than as a pattern.
        ts=${filename#"$basename".}
        # Convert time from YYYY-MM-DD-HH-MM-SS to YY-MM-DD_HH.MM.SS
        strt=$(printf '%s\n' "$ts" | awk -F '-' '{ printf("%s-%s-%s_%s.%s.%s",substr($1,3,2),$2,$3,$4,$5,$6) }')
    else
        # Try to remove suffix "-YY-MM-DD_HH.MM.SS" (this format is hard-coded
        # in Zeek, and is the format used by Zeek when a log is rotated).
        tmp=$(printf '%s\n' "$filename" | sed 's/-[0-9][0-9]-[0-1][0-9]-[0-3][0-9]_[0-2][0-9][.][0-5][0-9][.][0-5][0-9]$//')
        if [ "$filename" != "$tmp" ]; then
            basename=$tmp
            # Remove the base name and '-' to get the timestamp.
            strt=${filename#"$basename"-}
        fi
    fi
}

# Run archive-log on every "*.log" file in the current directory.
#
# Globals read:
#   scriptsdir        - directory containing the archive-log helper
#   postterminatetime - used as the end time of every log
# Globals written:
#   failed              - set to 1 if any archive-log invocation fails
#   basename, strt      - per-log scratch values shared with parse_filename
#   ZEEK_ARG_LOG_SUFFIX - exported when a .log_suffix file exists
#
# Files read from the current directory: .startup (its last line is the
# fallback start time), .log_suffix, and .rotated.<basename>.
archivelogs()
{
    local startuptime logname end

    startuptime=$(tail -n 1 .startup)

    # Attempt to archive all log files.  Although stdout.log/stderr.log are
    # not really Zeek logs, we try to archive them anyway, because they might
    # contain useful info, especially if Zeek crashes.

    # If there's a .log_suffix file, set ZEEK_ARG_LOG_SUFFIX so that
    # archive-log and make-archive-name know about it.
    if [ -f .log_suffix ]; then
        ZEEK_ARG_LOG_SUFFIX=$(cat .log_suffix)
        export ZEEK_ARG_LOG_SUFFIX
    fi

    for logname in *.log; do
        # When nothing matches, the pattern itself is left in $logname.
        # Skip it, so that a directory without logs is not reported as an
        # archiving failure.  (-L keeps dangling symlinks in the loop.)
        if [ ! -e "$logname" ] && [ ! -L "$logname" ]; then
            continue
        fi

        # Get the base name (such as "conn") by removing the file extension.
        basename=$(basename "$logname" .log)

        # Start time of log.
        strt=

        # If the filename contains a timestamp (i.e., a log that was rotated
        # but not archived), then try to get the start time from the log
        # filename.  If a timestamp is found, then the base name is also
        # updated to not include the timestamp.
        parse_filename "$basename"

        # Assume the end time of the log is the time this script is run,
        # because Zeek stopped running before this script started.
        end=$postterminatetime

        if [ -z "$strt" ]; then
            # We couldn't extract the start time from the log filename, likely
            # because it isn't there (or possibly it's in an unrecognized
            # format).
            strt=$startuptime
            if [ -f ".rotated.$basename" ]; then
                # The time obtained here is always >= the startup time of Zeek,
                # so it's usually a more accurate guess of this log's start
                # time.
                strt=$(cat ".rotated.$basename")

                # However, if archive-log archived a log with the same base
                # name as this log, and if it did so after this script started,
                # then the start time that we computed will be later than the
                # end time.  If so, then reset the start time to equal the end
                # time.
                if expr "$strt" ">" "$end" >/dev/null; then
                    strt=$end
                fi
            fi
        fi

        # Note: here we assume the log writer type is "ascii".  Every
        # argument is quoted so that an empty or unusual value cannot shift
        # the positions of the arguments that follow it.
        if ! "${scriptsdir}"/archive-log "$logname" "$basename" "$strt" "$end" 1 ascii; then
            failed=1
        fi
    done
}

# Background portion of the cleanup: wait for running archive-log processes,
# archive whatever logs remain, report failures by email, and remove the
# post-terminate directory when that is appropriate.
#
# Globals read: crash, postdir.  Globals written: failed.
postterminate()
{
    # Let every archive-log process that is still running finish first.
    wait_for_archivelog

    # archivelogs raises this flag when at least one log can't be archived.
    failed=0
    archivelogs

    if [ $failed -eq 0 ]; then
        # Everything was archived.  After a crash the directory is kept on
        # purpose, so there is nothing more to do in that case.
        if [ $crash -eq 1 ]; then
            exit 0
        fi

        # Nothing of interest is left behind, so the directory can go.
        rm -rf "$postdir"
    else
        # One or more logs could not be archived: try to send an email that
        # points the user at this directory, and leave the directory alone.
        sendfailuremail

        # After a crash there is nothing else to do either.
        if [ $crash -eq 1 ]; then
            exit 0
        fi
    fi
}

# Hand the rest of the work to a background job, so that zeekctl is not held
# up while logs are archived.  Anything the job prints (stdout and stderr)
# is collected in post-terminate.out for later inspection.
postterminate > post-terminate.out 2>&1 &

# Optionally block until the background job is done; this is useful in
# situations such as testing, where the zeekctl stop command should not
# return before post-terminate has finished.
case "${stopwait}" in
    1)
        wait
        ;;
esac
