#!/usr/bin/perl -w
#
# find_correspondents
#
# AUTHOR:
#   Dan Harkless                                                    <opensource@
#   http://harkless.org/dan/software/                              harkless.org>
#
# COPYRIGHT:
#   This file is Copyright (C) 2008 by Dan Harkless, and is released under the
#   GNU General Public License <http://www.gnu.org/copyleft/gpl.html>.
#
# USAGE:
#   % find_correspondents <your_email_address_regexp>
#
# EXAMPLES:
#   % cd ~/Mail
#   % find_correspondents 'dan@speed\.net'
#   % find_correspondents 'dan@speed\.net|nobody@harkless\.org'
#
# DESCRIPTION:
#   If you use MH/nmh or some other mail system that keeps one message per file,
#   named numerically, you can use this script to find everyone who has emailed
#   you at a particular address or addresses.  This is useful, for instance, if
#   you need to inform everyone of a new address.
#
#   cd to your Mail directory first, as find_correspondents does a "find ." 
#   looking for your saved emails.  Give it your email address on the
#   commandline.  This parameter is actually treated as a regexp, so you can
#   separate multiple addresses with '|' (no need for parens around the whole
#   thing as they've already been provided).  And technically, of course, you
#   need to backslash the periods if you want to be sure they can't resolve to
#   some other character besides a literal period.
#
#   The program will output each unique (to, from) address pair where the to
#   address matches your regexp.  The most recent email the program came across
#   with that pair will be given, so you can check it out if you don't recognize
#   the from address.
#
# DATE        MODIFICATION
# ==========  ==================================================================
# 2008-09-02  "use English qw(-no_match_vars)": avoid regex performance penalty.
# 2002-10-15  Original.


## Modules used ################################################################
use English qw(-no_match_vars);  # allow use of names like @ARG rather than @_
use File::Basename;              # for basename()
use File::Find;                  # for find()


## Subroutines #################################################################
sub email_address($) {
    # Also in my email_address and medit scripts -- should be in a module.
    $ARG = shift;
    if (/<([^ >]+)>/) {
	# "Dan Harkless <dan@unitech.com>" style.
	return $1;
    }
    elsif (/\(.*\) *([^ ]+)/) {
	# "(Dan Harkless) dan@unitech.com" style.
	return $1;
    }
    elsif (/([^ ]+) *\(.*\)/) {
	# "dan@unitech.com (Dan Harkless)" style.
	return $1;
    }
    elsif (/ *([^ ]+) */) {
	# Possible whitespace around bare address.
	return $1;
    }
    else {
	# Empty address.
	return $ARG;
    }
}

sub process_file {
    if (-f and /^[1-9]/) {
	# This is an [n]mh mail file.
	$to = "";
	$from = "";
	open MAIL, $ARG;
	while (<MAIL>) {
	    # TBD: Also look for cases where we sent an email _from_ one of
	    #      $my_addresses _to_ someone, but they never wrote back.  If we
	    #      do so, we'll need to be able to parse through multiple
	    #      comma-separated recipients (and be able to distinguish
	    #      between commas inside and outside quotes) on the To: and/or
	    #      Cc: lines.
	    if (/^(To|Cc): .*($my_addresses)/io) {
		$to = lc($2);
	    }
	    elsif (/^From:[ \t]+(.*)/i) {
		$from = lc(email_address($1));
	    }
	    elsif (/^$/) {
		last;  # end of mail header
	    }
	}
	close MAIL;
	if ($to and $from) {
	    $addresses{"to=$to, from=$from"} = $File::Find::name;
	}
    }
}


## Main ########################################################################
$progname = basename($PROGRAM_NAME);

if (@ARGV != 1) {
    print STDERR "Usage: $progname <your_email_address_regexp>\n";
    exit 1;
}

$my_addresses = shift;

find(\&process_file, ".");

foreach $key (sort keys %addresses) {
    print "$key, example=$addresses{$key}\n";
}