#!/usr/bin/perl -w
#
# monitor_file
#
# AUTHOR:
#   Dan Harkless                                                    <opensource@
#   http://harkless.org/dan/software/                              harkless.org>
# 
# COPYRIGHT:
#   This file is Copyright (C) 2008 by Dan Harkless, and is released under the
#   GNU General Public License <http://www.gnu.org/copyleft/gpl.html>.
#
# USAGE:
#   % monitor_file -m <monitored_file> 
#   [-d <diff_opts>] [-e <email>] [-p <prev_state_dir>] [-P] 
#   [-o <original_regexp> -r <replacement_string>]
#
# EXAMPLES:
#   % monitor_file -d "-u -T" -m myfile -P
#   % monitor_file -e me@domain.tld -m /proc/mdstat -p ~/logs -o '^.*?(recovery|
#   resync).*' -r '[...] $1 [...]'
#
# DESCRIPTION:
#   Monitors a file (or pseudo-file, like a /proc entry) to watch for changes.
#   If there are any, 'diff' output will be displayed, or sent to the email 
#   address specified.
#
#   monitor_file only checks the file once per invocation, then exits -- it's
#   best called from cron.  The way it determines if the file has changed since
#   last time is by comparing it to a saved "previous state" file.  If there's a
#   difference, then the 'diff' output will be presented, and then the current
#   file contents will be written to the previous state file.  With this system,
#   you'll only get bugged once each time a file changes (rather than being
#   bugged every time a cron check fires off and a file is not matching a static
#   "expected state" file), and the previous state will be remembered even
#   across reboots.
#   
#   Note that since we run the 'diff' command rather than checking for
#   differences manually within the script (to get the benefit of the nice 'diff
#   -u' output), there's a short window in between diffing and saving the
#   previous state file during which the contents of the monitored file could
#   change from what 'diff' saw.  Also note that when monitor_file creates the
#   previous state file (it does this if it doesn't exist already, like on the
#   first time it's called on a particular file), it doesn't try to duplicate
#   permissions of the monitored file -- permissions will be according to the
#   active umask.
#
#   If monitor_file encounters a difference or error, it exits with status 1.
#
#   -d, if specified, is a string of options to pass to the diff command.  If
#   not specified, it defaults to -u.  If you need to pass multiple options,
#   enclose the list of options in quotes so the shell passes them as a single
#   argument to monitor_file.
#
#   -e, if specified, will cause any file differences (or errors) to be sent to
#   the email address specified rather than printed on stderr.  You may think
#   that this option is only useful if emailing a remote account, since you get
#   emailing to a local account for free when a script called from cron has
#   output, but you may want to use it with local accounts as well, since the
#   email's Subject will be more succinct.  If you use this option, the
#   MailTools module collection is required; it's available from CPAN.
#
#   -m is required, and specifies the file to be monitored.
#
#   -o is optional and specifies an "original regular expression" to be replaced
#   by the "replacement string" specified by -r.  This is useful if you need to
#   filter the contents of the monitored file to exclude differences you don't
#   want reported.  For instance, I run a cron job to check /proc/mdstat once a
#   minute (as an alternative to running mdadm in daemon mode and having to deal
#   with its limited event emailing capability or else writing a custom emailing
#   script for it), but if a resync is occurring, I only want to be notified
#   when it starts and when it finishes -- I don't want an email shot at me
#   every minute reporting the resync progress.  Therefore, I use -o and -r
#   options like the ones in the EXAMPLES section above.  Note that, as shown in
#   that example, you can use substitution variables like '$1' in your
#   replacement string (thus if you want to use a literal '$', be sure to
#   backslash it).  Also note that using -o / -r will cause a
#   <monitored_file>.filt file to be created (and we do not delete it).
#
#   -p is optional, and specifies a "previous state directory".  Ordinarily,
#   .filt, .prev, and .prev.prev files are saved in the same directory as the
#   monitored file.  However, sometimes this is undesirable or impossible, e.g.
#   when monitoring a /proc pseudo-file.  In cases like these, you can specify a
#   directory to write these files to with -p.  Note that if -m specifies a file
#   using a relative path, this path will still be interpreted relative to the
#   working directory at the time monitor_file was called, not the -p directory.
#
#   -P (capital p), if specified, will back up <file>.prev to <file>.prev.prev
#   prior to wiping it out, so that you'll be able to check on the state of the
#   monitored file as of the last time monitor_file was run, AND as of the time
#   before last.
#
#   -r specifies a replacement string.  See -o for more information.
#
# DATE        MODIFICATION
# ==========  ==================================================================
# 2008-09-02  "use English qw(-no_match_vars)": avoid regex performance penalty.
# 2008-06-16  Changed the -o / -r replacement code from 's/$opt_o/$opt_r/g' to
#             's/$opt_o/eval qq{"$opt_r"}/eg' so that substitution variables 
#             like '$1' can be used in the replacement string.
# 2007-12-23  Added -o and -r options to allow excluding uninteresting 
#             differences.  Changed -p to specify a previous state directory
#             (which we chdir() to) rather than a previous state file.  -P was
#             unnecessarily copying .prev to .prev.prev on every invocation,
#             rather than only when there was a change.  Documented the mild
#             race condition in between running 'diff' and saving the state of
#             the monitored file.  Changed the message when we're creating the
#             .prev file for the first time to include the file contents (so
#             that when an email is being sent, it can be saved as a record of
#             what the normal expected state of the file is supposed to be, and
#             unified and context diffs can be cross-referenced to it).
# 2006-08-12  Added -P option to save the old .prev file as .prev.prev.
# 2006-08-12  Added -d option to allow passing options to diff.
# 2006-08-12  My original use for this script was to monitor RAID status /proc
#             files, so it was sufficient to output (both versions of) the first
#             differing line (this was also so the email would be as succinct as
#             possible and SMS versions sent by my sms_biff script would be
#             meaningful).  Later I started using it on files like nslookup
#             output where the first differing line wasn't necessarily enough,
#             so I added -w to also output the entire contents of the old and
#             new versions of the file.  However, I'm now also using it on long
#             files like web pages with differences potentially in multiple
#             places, so we really want proper diff output.  We could use CPAN
#             modules to diff our own slurped file contents buffers, and this
#             would retain the advantage of there not being a race condition in
#             between diffing and saving the previous state file, but since that
#             race condition may never matter, and for simplicity, we'll just
#             call out to the 'diff' command, and do a 'cp' immediately
#             afterwards to limit the race window. 
# 2006-08-12  When I first wrote this it was to monitor system files, so 
#             defaulting the previous state file to
#             /var/run/monitor_file<monitored_file> made sense.  However, I'm
#             now mostly using it for monitoring web pages pulled down with curl
#             (and then often mangled to remove non-meaningful changed parts),
#             so the default for the previous state file has been changed to be
#             <monitored_file>.prev.  If you still want to use  
#             /var/run/monitor_file files you'll have to specify them with -p.
# 2004-02-28  Search-and-replace error in my_die() caused it to not work.
# 2003-04-21  If the monitored file and previous state file differ in number of
#             lines, print totals using scalar(@<array>), not $#<array>.  Also,
#             only print an 's' at the end of "line" if the total isn't 1.
# 2002-11-21  Original.


## Modules used ################################################################
use Cwd;                         # for cwd()
use English qw(-no_match_vars);  # allow use of names like @ARG rather than @_
use File::Basename;              # for basename() and dirname()
use Getopt::Std;                 # for getopts()

# Use only while debugging (due to major performance hit):
#use diagnostics;     # turn on -w and output verbose versions of warnings


## Subroutines #################################################################
# email_or_print(LIST)
#
# Writes LIST to the report handle, creating that handle on the first call:
#
#   * If -e was given ($opt_e), Mail::Send is loaded on demand and a message
#     addressed to $opt_e, with a "<progname>: <file> changed state!" Subject,
#     is opened; LIST becomes part of the message body.
#   * Otherwise (or if opening the message returned a false value), STDERR is
#     used, and the same "changed state!" line is printed once as a heading.
#
# The handle is remembered in the package variable $print_fh, so later calls
# append to the same email / stream.  Nothing here closes the handle.
#
# Reads the package variables $opt_e, $opt_m and $progname; sets $print_fh and
# $email.
sub email_or_print {
    if (not $print_fh) {
        if ($opt_e) {
            require Mail::Send;

            # TBD: Reassign STDERR to be this handle for Perl errors?  Carp?
            # Direct method call rather than the ambiguous indirect object
            # syntax ("new Mail::Send").
            $email = Mail::Send->new;
            $email->to($opt_e);
            $email->subject("$progname: $opt_m changed state!");
            $print_fh = $email->open();
        }

        if (not $print_fh) {
            # We could come here if not $opt_e or if $email->open() failed.
            # Store a real glob reference instead of the bareword string
            # "STDERR", so the handle doesn't depend on symbolic lookup.
            $print_fh = \*STDERR;
            print {$print_fh} "$progname: $opt_m changed state!\n";
        }
    }

    print {$print_fh} @ARG;
}


## Main ########################################################################
# Name used as a prefix in the usage message and in report headings.
$progname = basename($PROGRAM_NAME);

# Process commandline arguments.  getopts() stores each option's value in the
# corresponding $opt_<letter> package variable.
use vars qw($opt_d $opt_e $opt_m $opt_o $opt_p $opt_P $opt_r);

# -m is mandatory, and -o / -r must be given together.  -m and -r are tested
# with defined() / length() rather than plain truth, so that a monitored file
# named "0" and a replacement string of "" or "0" (e.g. to delete whatever -o
# matches) are accepted instead of being mistaken for missing options.
if ((not getopts("d:e:m:o:p:Pr:"))
    or (not (defined($opt_m) and length($opt_m)))
    or ($opt_o xor defined($opt_r))) {
    print STDERR "Usage: $progname -m <monitored_file>\n                    ",
      "[-d <diff_opts>] [-e <email>] [-p <prev_state_dir>] [-P]\n",
      "                    [-o <original_regexp> -r <replacement_string>]\n";
    exit 1;
}

# Default to unified diff output.
if (not $opt_d) {
    $opt_d = "-u";
}

# Make the monitored file's path absolute now, so that it stays valid after
# the chdir() to the -p directory below.
if ($opt_m !~ m(^/)) {
    $opt_m = cwd() . "/$opt_m";
}

if ($opt_p) {
    if (not chdir($opt_p)) {
        # Newline-terminated like every other error message in this script.
        email_or_print "chdir($opt_p): $OS_ERROR.\n";
        exit 1;
    }

    # State files live in the -p directory (now the working directory), named
    # after the monitored file's last path component.
    $base_name = basename($opt_m);
}
else {
    # State files live alongside the monitored file.
    $base_name = $opt_m;
}

$prev_state_file = $base_name . ".prev";

# Check for the previous state file.  If there isn't one yet (e.g. the first
# run on this file), report that fact along with the (filtered) contents of the
# monitored file, save those contents as the previous state, and exit with
# status 1.
if (not -f $prev_state_file) {
    email_or_print "$prev_state_file did not exist.  Creating it";

    # Three-argument open() with lexical handles: characters in the file names
    # (leading '>', trailing '|' or whitespace) can't be taken for a mode or a
    # pipe, as they can with the two-argument form.
    my ($monitored_fh, $prev_fh);

    if (not open($monitored_fh, '<', $opt_m)) {
        email_or_print "...\n";
        email_or_print "$opt_m: $OS_ERROR.\n";
        exit 1;
    }

    if (not open($prev_fh, '>', $prev_state_file)) {
        email_or_print "...\n";
        email_or_print "$prev_state_file: $OS_ERROR.\n";
        exit 1;
    }

    email_or_print " with this content:\n";

    while (my $monitored_file_line = <$monitored_fh>) {
        if ($opt_o) {
            # NOTE(review): the replacement goes through a string eval so that
            # substitution variables like $1 in -r get expanded.  -r is treated
            # as trusted command-line input; a literal '"' in it must be
            # backslashed or the eval'd string won't parse.
            $monitored_file_line =~ s/$opt_o/eval qq{"$opt_r"}/eg;
        }
        email_or_print $monitored_file_line;
        print {$prev_fh} $monitored_file_line;
    }

    close $monitored_fh;

    # Buffered write errors (e.g. a full disk) only surface at close(), so
    # report them rather than silently leaving a truncated state file behind.
    if (not close $prev_fh) {
        email_or_print "$prev_state_file: $OS_ERROR.\n";
        exit 1;
    }

    exit 1;
}

# Decide which file gets diffed against the previous state: a filtered copy of
# the monitored file when -o / -r are in use, otherwise the monitored file
# itself.
if ($opt_o) {
    # Need to do filtering before diffing.
    $cur_file = $base_name . ".filt";

    # Three-argument open() with lexical handles: characters in the file names
    # (leading '>', trailing '|' or whitespace) can't be taken for a mode or a
    # pipe, as they can with the two-argument form.
    my ($monitored_fh, $filt_fh);

    if (not open($monitored_fh, '<', $opt_m)) {
        email_or_print "$opt_m: $OS_ERROR.\n";
        exit 1;
    }

    if (not open($filt_fh, '>', $cur_file)) {
        email_or_print "$cur_file: $OS_ERROR.\n";
        exit 1;
    }

    while (my $monitored_file_line = <$monitored_fh>) {
        # NOTE(review): the replacement goes through a string eval so that
        # substitution variables like $1 in -r get expanded.  -r is treated as
        # trusted command-line input; a literal '"' in it must be backslashed
        # or the eval'd string won't parse.
        $monitored_file_line =~ s/$opt_o/eval qq{"$opt_r"}/eg;
        print {$filt_fh} $monitored_file_line;
    }

    close $monitored_fh;

    # Buffered write errors (e.g. a full disk) only surface at close().  An
    # incomplete .filt file would be reported by diff as a bogus change, so
    # stop here instead.
    if (not close $filt_fh) {
        email_or_print "$cur_file: $OS_ERROR.\n";
        exit 1;
    }
}
else {
    # Simply diff the original files.
    $cur_file = $opt_m;
}

# Backslash-escape both paths once, for interpolation into the shell commands
# built below.
my $quoted_prev = quotemeta $prev_state_file;
my $quoted_cur  = quotemeta $cur_file;

# Shell command that records the current contents as the new previous state.
my $save_cmd = "cp -pf $quoted_cur $quoted_prev 2>&1";

# With -P, first move the old previous state aside as <file>.prev.prev.
if ($opt_P) {
    $save_cmd = "(mv -f $quoted_prev ${quoted_prev}.prev 2>&1; " . $save_cmd . ")";
}

# Run diff, and have the shell run the save command only when diff exits
# non-zero.  As described in the header comments, this leaves a small race
# window: the monitored file could change between diff reading it and cp/mv
# saving it.  Diffing inside the script with CPAN modules, or diffing a
# temporary copy of the live file, would close the window -- the latter at the
# cost of the diff output no longer naming the original path (it would still
# appear in the Subject when using -e).
my $diff_cmd = "diff $opt_d $quoted_prev $quoted_cur 2>&1               || $save_cmd";
my $report   = `$diff_cmd`;

# Any output (from diff or from the save command) is reported, and the script
# then exits with status 1.
if ($report) {
    email_or_print $report;
    exit 1;
}

# No explicit close of a possible email handle here; it is left to close
# itself (and auto-send).