#!/usr/bin/perl
#
# site_spider
#
# AUTHOR:
#   Dan Harkless
#
# COPYRIGHT:
#   This file is Copyright (C) 2025 by Dan Harkless, and is released under the
#   GNU General Public License.
#
# USAGE:
#   % site_spider [-1clmnrvV] [-f <config_file>] [-s <site_map_file>]
#     [-t <template_file>] [-T <seconds>] [-u <user_agent>] <url> ...
#
# DESCRIPTION:
#   Spiders through a site, given a starting URL.  Reports links that don't
#   result in a 200 OK.  Links that are expected to be broken, or to redirect,
#   can be marked as such in a config file, to prevent their reporting.  Can
#   also generate a site map.
#
# REQUIREMENTS:
#   site_spider requires WWW::Mechanize, available from CPAN.  Note that
#   WWW::Mechanize may require you to upgrade your versions of (or will require
#   you to install, if you don't have them already) HTML::Parser, LWP, and URI.
#
# COMMANDLINE OPTIONS:
#   -1
#     Normally if there's a URL that's linked to from multiple pages on the
#     site, site_spider will visit it multiple times, so that if, say, the link
#     is 404, we can report all the pages where it appears.  If you'd like
#     site_spider to take less time, you can use the -1 option to visit each
#     link only once, rather than visiting the link once for each unique page
#     that links to it.  (After fixing the first page that contained the 404
#     link found with 'site_spider -1', you can either wait until next time
#     site_spider runs to find any others, or proactively search through the
#     rest of your pages with some other tool like grep.)
#
#   -c
#     Output non-200 links without the usual ":" after them, to make it easier
#     to copy & paste site_spider's output into the ~/.site_spider config file
#     (for expected 30x statuses).
#
#   -f <config_file>
#     By default, site_spider will look for its config file in
#     $HOME/.site_spider, if $HOME is set, or else ./.site_spider.  To override
#     this, specify a file with -f.
#
#   -l
#     Visit local URLs on the specified site only -- do not visit offsite URLs.
#
#   -m
#     Try to follow mailto: links.  You'll have to set the SENDMAIL environment
#     variable if your copy of sendmail (or a sendmail-mimicking wrapper) is
#     not in one of the standard locations.  In my testing WWW::Mechanize
#     always returned status 400, though.
#
#   -n
#     Try to follow news: links.  You may have to create a .libnetrc file (see
#     Net::Config) for this to work.  I have not been able to successfully test
#     WWW::Mechanize's support for this.
#
#   -r
#     Follow redirects.  This means that even if a URL gets a non-200 status --
#     specifically, a 30x status -- this will not cause site_spider to report
#     it unless the potential chain of redirects ends with a non-200 status.
#     Note that if you always use -r, in the case of redirects that are only in
#     place for awhile, followed later by a 404, you make it harder for
#     yourself to determine where the new home of the page is, since you don't
#     get a heads-up until the chain is broken.  A better practice is to keep
#     track in your .site_spider file of known redirects that you link to, and
#     their expected targets (see the CONFIG FILE section below).
#
#   -s <site_map_file>
#     Create a site map and write it to <site_map_file> (which will be
#     overwritten without warning if it already exists).  Special processing
#     occurs in the footers of pages.  If a line includes the HTML comment
#     fragment "FOOTER -->", we assume we're in the footer.  In there, if a
#     line includes the regular expression
#     /[Cc]reated: +([A-Za-z]+) +(\d+), (\d+)/, that "<month> <day>, <year>"
#     date will be included in the "Created" column of the site map,
#     reformatted to YYYY-MM-DD.
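#     For instance -- a hypothetical footer, since the exact markup will vary
#     from site to site -- a page whose footer contains:
#
#         <!-- FOOTER -->
#         Created: August 27, 2025
#
#     would get "2025-08-27" in its "Created" column.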
#     If a footer line has regexp
#     /[Ll]ast.*?modifi(cation|ed): +([A-Za-z]+) +(\d+), (\d+)/, that date will
#     be included for the URL in the site map's "Last Modified" column, again
#     reformatted to YYYY-MM-DD.
#
#   -t <template_file>
#     Must be specified together with -s.  <template_file> must be an HTML file
#     with all the content of the site map page except for the actual link
#     rows.  In place of the link rows must be the line "", which will be
#     replaced.  Also, in the lines following the site map table, any line
#     matching the regexp /^Last modified: / will be replaced by a line with
#     the current date, of the form "Last modified: August 27, 2025\n".
#
#   -T <seconds>
#     Timeout for connecting to a link.  Defaults to 180 seconds if not
#     specified.
#
#   -u <user_agent>
#     Specify the User-Agent header to use.  By default site_spider uses
#     "Mozilla/4.0 (compatible; MSIE 6.0; site_spider)".  This should allow
#     site_spider to check links on crappy sites that don't let you in unless
#     you're running Internet Explorer.  If that User-Agent string isn't
#     working for you, you can specify a different one with -u.
#
#   -v
#     Verbose.  Temporarily prints each URL that's being visited (and then
#     backspaces to print the next one, in-place).  Requires Term::Cap to be
#     installed (available from CPAN).
#
#   -V
#     More verbose.  Permanently prints each URL that's being visited.
#
# CONFIG FILE:
#   The .site_spider file consists of lines with two or three parameters,
#   separated by spaces or tabs.  Each line either begins with a directive
#   keyword (matched case-insensitively) followed by its argument(s), or begins
#   with a URL followed by the HTTP status expected for it:
#
#   EXCLUDE pattern
#     To avoid visiting, printing, or including URLs in the site map, specify
#     them with "EXCLUDE" as the 1st param., and either a prefix to match,
#     e.g.:
#
#       EXCLUDE PREFIX:https://www.site.tld/do_not_go_here/
#
#     or a regular expression, as in:
#
#       EXCLUDE REGEXP:^https?://(www\.)?site\.tld/do_not_include_files/.*?[^/]$
#
#   EXCLUDE_TITLE <regexp>
#     Pages can also be omitted from the site map (though they do get visited)
#     if their titles match the given regular expression, which must be
#     surrounded by angle brackets.  For example:
#
#       EXCLUDE_TITLE <ERROR>
#
#   FOLLOW_REDIRECTS (N|Y) pattern
#     This option overrides the setting of the -r commandline option for a
#     given URL or set of URLs matched by a prefix or regular expression.  For
#     example, if you want site_spider to follow redirects on the URL
#     "http://site.tld/" even when -r isn't specified, you would include the
#     following line in your .site_spider file:
#
#       FOLLOW_REDIRECTS Y http://site.tld/
#
#     If you wanted site_spider to avoid following redirects for all URLs on
#     https://www.site.tld/, you could specify the following:
#
#       FOLLOW_REDIRECTS N PREFIX:https://www.site.tld/
#
#     Regular expressions can also be used, as in:
#
#       FOLLOW_REDIRECTS N REGEXP:^https?://(www\.)?site\.tld/
#
#     If multiple FOLLOW_REDIRECTS setting lines can match a given URL, which
#     setting takes precedence is undefined.
#
#     The 'N' or 'Y' argument to this option can also be specified as
#     lower-case 'n' or 'y'.
#
#   LEVEL_ADJUST regexp adjustment
#     I have pages on my site that are generated by scripts using PATH_INFO,
#     which causes them to be indented an extra level due to the extra '/' in
#     the URLs.
#     This can be corrected by specifying a regexp (note prefixes are not
#     supported on this one; also note that the slashes in the regexp below are
#     directory delimiters, not a /.../ surrounding the regexp as in Perl
#     syntax), plus an adjustment amount, as in:
#
#       LEVEL_ADJUST /image_album\.pd/ -1
#
#   TITLE_MANGLE <from_regexp><to_regexp> [starting_level]
#     For use with -s, when constructing a site map.  Each TITLE_MANGLE line
#     can include a from_regexp and a to_regexp to transform it into (using the
#     Perl construct "s/from_regexp/to_regexp/g").  Note that the angle
#     brackets above are not to denote that "from_regexp" and "to_regexp" are
#     variables -- they must be typed explicitly (since the regexps can include
#     unencoded space characters).  If starting_level is specified, the
#     mangling will only be done on pages with that level of nesting in the
#     site map, and above, where 0 denotes the top page's level.  An example
#     would be:
#
#       TITLE_MANGLE <^Dan Harkless' ><> 1
#
#     The titles on my site generally start with "Dan Harkless' ", but
#     repeating that for every link in the site map would be silly.  It's
#     meaningful on the first link, "Dan Harkless' Home Page", though, so we
#     only apply the mangling to level 1 and above.
#
#     I also have pages whose titles start with "Dan Harkless' Concert
#     Reviews: ".  Because the TITLE_MANGLE lines are processed in the order
#     they appear, the following line:
#
#       TITLE_MANGLE <^Concert Reviews: ><>
#
#     would need to either come _after_ the one that removes "Dan Harkless' ",
#     or else come first, and include that text, as in:
#
#       TITLE_MANGLE <^Dan Harkless' Concert Reviews: ><>
#       TITLE_MANGLE <^Dan Harkless' ><>
#
#     Replacement regexps are not limited to blank.
#
#   url http_response_code [redirect_url]
#     Tells site_spider to expect an HTTP response code other than 200 for the
#     URL specified as the 1st parameter.  The 2nd parameter is what code is
#     expected, e.g. 404 or 301.  In the case of a 3xx redirect response code
#     like 301, a 3rd parameter can optionally be specified with the URL the
#     original URL is expected to redirect to (and site_spider will check
#     that).  For example:
#
#       https://site.tld/dir/known_broken.html 404
#       https://site.tld/photos/ 301 https://site.tld/image_album.pd/photos/
#
#   Comments are also possible in site_spider's config file -- any lines where
#   the first non-whitespace character is a '#' will be ignored.  (Comments
#   cannot be appended to lines with active configuration directives.)
#
# EXAMPLE:
#   % site_spider -1lv -s site_map.html -t template.html https://site.tld/
#
# TO DO:
#   Add a "cleanup" option for site_spider to report any URLs in the
#   .site_spider file that were never encountered while spidering the site.
#
#   Add an option to accept and present cookies, to deal with sites like
#   skweezer.com that give you an error response if you don't?
#
#   Add more flexible ways to get the Created and Last Modified dates for URLs?
#
#   Add an option to generate a Google Sitemap.
#
#   Add an option to generate an RSS feed.
#
# DATE       MODIFICATION
# ========== ==================================================================
# 2025-10-01 My image_album script outputs the HTTP header as early as it can,
#            so that if my code or module code outputs any error messages, they
#            won't cause non-conforming HTTP.  Unfortunately this means that
#            when it's passed a non-existent (e.g. deleted or renamed) album
#            directory, it reports the error with HTTP status 200.
#            This causes nonexistent image_album URLs to be propagated into the
#            new version of the site map by being seen in the existing version.
#            Can't EXCLUDE the site map URL, since we want the map link to
#            appear in the map.  Therefore, added the ability to EXCLUDE_TITLE,
#            which takes a regexp that can match "ERROR" pages, for instance.
# 2025-09-04 Added TO DOs for Google Sitemaps and RSS.
# 2025-09-03 Updated documentation header.
# 2025-08-30 Added row number comments to the start of the link <tr>s, so that
#            a simple alphasort can be done in JS.
# 2025-08-27 Added -t option and completed initial full implementation of -s.
# 2025-08-27 Added LEVEL_ADJUST config file option.
# 2025-08-26 Added EXCLUDE and TITLE_MANGLE config file options.
# 2025-08-25 At some point, $terminal->Tputs('cub1') stopped working to do
#            backspacing past a line-wrap, and now we need to simply use "\b".
# 2025-08-18 WWW::Mechanize->new() now needs autocheck => 0 to not die on 404s.
# 2009-02-27 Original.

## Modules used ################################################################

use English qw(-no_match_vars);  # allow use of names like @ARG rather than @_
use File::Basename;              # for basename()
use Getopt::Std;                 # for getopts()
use POSIX qw(strftime);
use WWW::Mechanize;              # for web-fetching and link-extracting routines

# Output warnings, but not the bogus "Deep recursion" one.  Pre-5.6.0 Perls
# don't understand this syntax, so we use if.pm (available from CPAN) here.
use if $] >= 5.006, warnings;
no if $] >= 5.006, warnings => 'recursion';

# Use UTF-8 for standard and user-opened filehandles.  May require installing a
# perl-open package, or similar:
use open qw(:std :encoding(UTF-8));

# TBD: Use one of the robots-parsing modules and optionally obey our own robots?

## Subroutines #################################################################

sub a_is_parent_of_b {
    my $a = shift;
    my $b = shift;

    if ($a eq $b) {
        # URL can't be a parent of itself -- use b_is_on_site_a() if you want
        # that case to return true.
        return 0;
    }
    elsif ($a =~ m</$>) {
        # URL $a ends with a slash.  For example:
        #     a_is_parent_of_b("http://cool.tld/",
        #                      "http://cool.tld/page.html") == 1
        #     a_is_parent_of_b("http://cool.tld/",
        #                      "http://cool.tld/images/cow.jpg") == 1
        return substr($b, 0, length($a)) eq $a;
    }
    else {
        # URL $a doesn't end with a slash.  For example:
        #     a_is_parent_of_b("http://lame.tld/home.asp",
        #                      "http://lame.tld/page.asp") == 0
        #     a_is_parent_of_b("http://lame.tld/home.asp",
        #                      "http://lame.tld/images/cow.jpg") == 1
        $num_slashes_in_a = ($a =~ tr</><>);
        $num_slashes_in_b = ($b =~ tr</><>);
        $a =~ s<[^/]*$><>;  # remove anything after final directory

        return substr($b, 0, length($a)) eq $a
               && $num_slashes_in_a < $num_slashes_in_b;  # 'and' would need return()
    }
}

sub b_is_on_site_a {
    my $a = shift;
    my $b = shift;

    if ($a eq $b) {
        # http://cool.tld/ is on site http://cool.tld/.
        return 1;
    }
    else {
        # http://lame.tld/page.asp is on site http://lame.tld/home.asp.
        $a =~ s<[^/]*$><>;  # remove anything after final directory
        return a_is_parent_of_b($a, $b);
    }
}

sub backspace {
    my $indent_level = shift;
    my $page         = shift;

    # The following used to be required to get backspacing across a line-wrap
    # to work, but now, using the Tputs() output strings fails to work, and
    # only "\b" works.  Not sure what changed, but will just go with it for
    # now.
    #
    #     if (defined($terminal->Tputs('cub1'))) {
    #         # Backspace when in the leftmost column is undefined behavior in
    #         # terminfo, so we can't just backspace here.
    #         # 'cub1' (cursor_left) is defined for terminals that have the
    #         # 'bw' (auto_left_margin) capability, though, so we'll use 'cub1'
    #         # if it's defined.
    #         $backspace_string = $terminal->Tputs('cub1');
    #     }
    #     else {
    #         # TBD: Surely there's something better to do than just
    #         # backspacing in this case, which will fail to wrap back around
    #         # past the leftmost column on, e.g., Mac OS X 10.5 xterm if its
    #         # reverseWrap resource isn't explicitly set to true.
    #         $backspace_string = $terminal->Tputs('kb');
    #     }

    for ($i = 1; $i <= $indent_level + length($page); $i++) {
        # print $backspace_string;
        print "\b";
    }

    $last_link_had_no_newline = 1;
}

sub my_die {
    print STDERR "$progname: @ARG Aborting.\n";
    exit 1;
}

sub indent {
    my $indent_level = shift;

    for ($i = 1; $i <= $indent_level; $i++) {
        print " ";
    }
}

sub number_of_slashes {
    my $url = shift;

    return $url =~ tr</><>;
}

sub visit_link {
    my $page   = shift;
    my $parent = shift;

    my $indent_level = 0;

    if (!visited($page, $parent)) {
        # TBD: Have 3rd level of verbosity outputting matching PREFIX / REGEXP?
        foreach $prefix_setting (@exclude_prefix) {
            if (substr($page, 0, length($prefix_setting)) eq $prefix_setting) {
                if ($opt_V) {
                    print "$page: PREFIX-excluded.\n";
                }
                return;
            }
        }
        foreach $regexp_setting (@exclude_regexp) {
            if ($page =~ /$regexp_setting/) {
                if ($opt_V) {
                    print "$page: REGEXP-excluded.\n";
                }
                return;
            }
        }

        visited_set($page, $parent, 1);

        if ($page =~ m<^([^:]+):>) {
            $protocol = $1;

            if ($mech->is_protocol_supported($protocol)) {
                if ($protocol eq "mailto" and !$opt_m) {
                    if ($opt_V) {
                        # $indent_level calculation explained below.
                        $indent_level = ($parent =~ tr</><>)
                                        + ($parent =~ m</$> ? 0 : 1)
                                        - $base_indent_level + 1;
                        indent($indent_level);
                        print "$page: Skipping since -m was not specified.\n";
                    }
                }
                elsif ($protocol eq "news" and !$opt_n) {
                    if ($opt_V) {
                        # $indent_level calculation explained below.
                        $indent_level = ($parent =~ tr</><>)
                                        + ($parent =~ m</$> ? 0 : 1)
                                        - $base_indent_level + 1;
                        indent($indent_level);
                        print "$page: Skipping since -n was not specified.\n";
                    }
                }
                else {
                    # $protocol is http[s]
                    if ($redir_URL{$page}) {
                        if ($redir_URL{$page} eq 'Y') {
                            $mech->requests_redirectable(\@redirs_y);
                        }
                        else {  # already checked at parse time for illegal vals
                            $mech->requests_redirectable(\@redirs_n);
                        }
                    }
                    else {
                        $specific_redir_setting = 0;

                        foreach $prefix_setting (@redir_prefix) {
                            ($URL_prefix, $redir_setting) =
                                split(/ /, $prefix_setting);
                            if (substr($page, 0, length($URL_prefix))
                                eq $URL_prefix) {
                                if ($redir_setting eq 'Y') {
                                    $mech->requests_redirectable(\@redirs_y);
                                }
                                else {
                                    $mech->requests_redirectable(\@redirs_n);
                                }
                                $specific_redir_setting = 1;
                                last;
                            }
                        }

                        if (not $specific_redir_setting) {
                            foreach $regexp_setting (@redir_regexp) {
                                ($URL_regexp, $redir_setting) =
                                    split(/ /, $regexp_setting);
                                if ($page =~ /$URL_regexp/) {
                                    if ($redir_setting eq 'Y') {
                                        $mech->requests_redirectable(\@redirs_y);
                                    }
                                    else {
                                        $mech->requests_redirectable(\@redirs_n);
                                    }
                                    $specific_redir_setting = 1;
                                    last;
                                }
                            }
                        }

                        if (not $specific_redir_setting) {
                            # Calling get() appears to create a copy of the
                            # object (for use with the next link) that remembers
                            # the setting of requests_redirectable at the time
                            # get() was called, making it ineffective for us to
                            # just remember the old setting prior to calling
                            # get() and restore that setting to the current
                            # object after the get().  Instead, we need to
                            # explicitly set requests_redirectable before each
                            # get().
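                            # (For reference, @redirs_y and @redirs_n are set
                            # up in Main below: ("GET", "HEAD") turns redirect
                            # following back on for GET and HEAD requests --
                            # LWP's default -- while the empty list turns it
                            # off.)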
                            if ($opt_r) {
                                $mech->requests_redirectable(\@redirs_y);
                            }
                            else {
                                $mech->requests_redirectable(\@redirs_n);
                            }
                        }
                    }

                    $mech->get($page);

                    if ($opt_v or $opt_V) {
                        if ($mech->is_html() and b_is_on_site_a($site, $page)
                            and not a_is_parent_of_b($parent, $page)) {
                            # Indent by the number of '/'s in this URL, plus 1
                            # if the URL doesn't end in '/', minus the
                            # indentation level the site's base URL would have
                            # if we didn't subtract here (e.g. 3 for
                            # "http://www.example.com/").
                            $indent_level = ($page =~ tr</><>)
                                            + ($page =~ m</$> ? 0 : 1)
                                            - $base_indent_level;
                            indent($indent_level);
                        }
                        else {
                            # This is a true child of the parent page (e.g. an
                            # image, or a file like http://site/dir/file.html
                            # where the parent was http://site/dir/), or a
                            # cross-link to a page that's above the directory
                            # of the specified $site (e.g. a cross-link to
                            # http://site/ where $site was specified as
                            # http://site/~user/), so indent by the number of
                            # '/'s in the parent URL, plus 1 if the parent URL
                            # doesn't end in '/', minus the site's base indent
                            # level, plus one.
                            $indent_level = ($parent =~ tr</><>)
                                            + ($parent =~ m</$> ? 0 : 1)
                                            - $base_indent_level + 1;
                            indent($indent_level);
                        }

                        # TBD: If the get of the current URL timed out or
                        # otherwise took a long time, then when using -v,
                        # waiting until here to print the page name and clear
                        # to EOS misleadingly makes it look like the previous
                        # URL is the one responsible for the delay.
                        print "$page";
                        $last_link_had_no_newline = 0;

                        if ($opt_v) {
                            # Clear to end of screen to wipe out remnants of
                            # previous URLs longer than this one.  'ce', clear
                            # to end of line, isn't enough since if the line
                            # wrapped it'll only clear to the end of the first
                            # physical line.
                            print $terminal->Tputs('cd');
                        }
                    }

                    if ($mech->status != 200) {
                        $expected_status          = 200;
                        $expected_new_URL         = "";
                        $expected_new_URL_matches = 0;

                        if ($expected{$page}) {
                            if ($expected{$page} =~ /(\S+) (\S+)/) {
                                $expected_status  = $1;
                                $expected_new_URL = $2;

                                if ($mech->response()->header("Location")) {
                                    if ($expected_new_URL =~ /^PREFIX:(.+)$/) {
                                        $expected_new_URL_prefix = $1;
                                        if (substr($mech->response()
                                                        ->header("Location"),
                                                   0,
                                                   length($expected_new_URL_prefix))
                                            eq $expected_new_URL_prefix) {
                                            $expected_new_URL_matches = 1;
                                        }
                                    }
                                    elsif ($expected_new_URL =~ /^REGEXP:(.+)$/) {
                                        $expected_new_URL_regexp = $1;
                                        if ($mech->response()->header("Location")
                                            =~ /$expected_new_URL_regexp/) {
                                            $expected_new_URL_matches = 1;
                                        }
                                    }
                                    else {
                                        if ($mech->response()->header("Location")
                                            eq $expected_new_URL) {
                                            $expected_new_URL_matches = 1;
                                        }
                                    }
                                }
                            }
                            else {
                                # In this case, $expected{$page} isn't two
                                # tokens separated by a space, so it's just a
                                # bare status code like 403.
                                $expected_status = $expected{$page};
                            }
                        }

                        if ($mech->status == $expected_status
                            and (not ($mech->status >= 300
                                      and $mech->status <= 399)
                                 or $expected_new_URL_matches)) {
                            # This non-200 was expected.
                            if ($opt_v) {
                                backspace($indent_level, $page);
                            }
                            elsif ($opt_V) {
                                print "\n";
                            }
                        }
                        else {
                            # This non-200 status was not expected.
                            if (!$opt_v and !$opt_V) {
                                print "$page";
                            }
                            print ":" if not ($opt_c);
                            print " ", $mech->status;
                            if ($mech->status >= 300 and $mech->status <= 399
                                and $mech->response()->header("Location")) {
                                print " ", $mech->response()->header("Location");
                            }
                            if (not $opt_V
                                and $parent ne $last_offending_parent) {
                                print " [linked from $parent]";
                                $last_offending_parent = $parent;
                            }
                            print "\n";
                        }
                    }
                    else {
                        # Status 200.
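                        # Erase the URL we just printed under -v (or finish its
                        # line under -V), then, if this is an HTML page on the
                        # site we're spidering, make its site map entry (under
                        # -s) and visit its images and links.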
                        if ($opt_v) {
                            backspace($indent_level, $page);
                        }
                        elsif ($opt_V) {
                            print "\n";
                        }

                        if ($mech->is_html() and b_is_on_site_a($site, $page)) {
                            # When we visit an image, $mech->images() and
                            # $mech->links() still return further images and
                            # links from the HTML page we were just at, so only
                            # call them if the current link is an HTML page
                            # (that's on the site we started out on).
                            if ($opt_s) {
                                # Make a site map entry.
                                @page_lines = split(/\n/, $mech->content());

                                $in_footer = 0;
                                undef $created;
                                undef $last_modified;

                                foreach $line (@page_lines) {
                                    if (not $in_footer) {
                                        if ($line =~ /FOOTER -->/) {
                                            $in_footer = 1;
                                        }
                                    }
                                    elsif ($line =~ /[Cc]reated: +([A-Za-z]+) +(\d+), (\d+)/) {
                                        $created = YYYY_MM_DD($1, $2, $3);
                                    }
                                    elsif ($line =~ /[Ll]ast.*?modifi(cation|ed): +([A-Za-z]+) +(\d+), (\d+)/) {
                                        $last_modified = YYYY_MM_DD($2, $3, $4);
                                    }
                                }

                                if (not $last_modified) {
                                    # TBD: Rewrite the timestamp to date format.
                                    $last_modified =
                                        $mech->response()->last_modified;
                                }

                                $title_excluded = 0;
                                foreach $exclude_title_RE (@exclude_title) {
                                    if ($mech->title() =~ /$exclude_title_RE/) {
                                        $title_excluded = 1;
                                        last;
                                    }
                                }
                                if (not $title_excluded) {
                                    $site_map{$page} = [$mech->title(),
                                                        $created,
                                                        $last_modified];
                                }
                            }

                            foreach $image ($mech->images()) {
                                $url = $image->url_abs();
                                visit_link($url, $page);
                            }
                            foreach $link ($mech->links()) {
                                $url = $link->url_abs();
                                $url =~ s/\#.*$//;  # remove anchor ref., if any
                                if (not $opt_l or b_is_on_site_a($site, $url)) {
                                    visit_link($url, $page);
                                }
                            }
                        }
                    }

                    $mech->back();
                }
            }
            elsif ($opt_V) {
                print ": $protocol protocol is unsupported.\n";
            }
        }
        else {
            print ": URL is malformed -- cannot determine protocol.\n";
        }
    }
}

sub visited {
    $page   = shift;
    $parent = shift;

    if ($opt_1) {
        return $visited{$page};
    }
    else {
        return $visited{$page}{$parent};
    }
}

sub visited_set {
    # Use a separate set function rather than using one function with Perl
    # 5.6+'s experimental lvalue function declaration feature.
    $page   = shift;
    $parent = shift;
    $value  = shift;

    if ($opt_1) {
        $visited{$page} = $value;
    }
    else {
        $visited{$page}{$parent} = $value;
    }
}

sub YYYY_MM_DD {
    my %month_names = qw(January   1  February  2  March     3  April     4
                         May       5  June      6  July      7  August    8
                         September 9  October  10  November 11  December 12);
    my $month_name = shift;
    my $day        = shift;
    my $year       = shift;

    return sprintf("%d-%02d-%02d", $year, $month_names{$month_name}, $day);
}

## Main ########################################################################

$progname = basename($PROGRAM_NAME);

# Make stdout autoflush, since we print the URL with no newline before we try
# to visit it.
$OUTPUT_AUTOFLUSH = 1;

# Eliminate "used only once" warnings.
use vars qw($opt_1 $opt_c $opt_f $opt_l $opt_m $opt_n $opt_r $opt_s $opt_t
            $opt_T $opt_u $opt_v $opt_V);

if (not getopts('1cf:lmnrs:t:T:u:vV') or scalar(@ARGV) == 0) {
    print STDERR "Usage:\n $progname",
                 " [-1clmnrvV] [-f <config_file>] [-s <site_map_file>]",
                 " [-t <template_file>] [-T <seconds>] [-u <user_agent>]",
                 " <url> ...\n";
    exit 1;
}

if (not $opt_u) {
    # TBD: Update the default User-Agent?
    $opt_u = "Mozilla/4.0 (compatible; MSIE 6.0; site_spider)";
}

$mech = WWW::Mechanize->new(agent => $opt_u, autocheck => 0);

@redirs_n = ();
@redirs_y = ("GET", "HEAD");

if ($opt_T) {
    $mech->timeout($opt_T);
}

if ($opt_v) {
    require POSIX;      # for getospeed(), etc.
    require Term::Cap;  # for Tputs(), etc.

    $termios = new POSIX::Termios;
    $termios->getattr();
    $ospeed = $termios->getospeed;

    $terminal = Term::Cap->Tgetent({TERM => undef, OSPEED => $ospeed});
    # TBD: Error handling if we aren't hooked up to a terminal.
}

if (not $opt_V) {
    $mech->quiet(1);
}

if ($opt_f) {
    $config_path = $opt_f;
}
else {
    $config_path = ".site_spider";
    if ($ENV{HOME}) {
        $config_path = "$ENV{HOME}/$config_path";
    }
}

if (open(CONFIG, $config_path)) {
    $line_num = 1;

    while (<CONFIG>) {
        chomp;

        if (/^\s*$/) {
            # Ignore all-whitespace lines.
        }
        elsif (/^\s*\#/) {
            # Ignore comment lines.
        }
        elsif (/^\s*EXCLUDE\s+(\S+)\s*$/i) {
            $pattern = $1;
            if ($pattern =~ /^PREFIX:(.+)$/) {
                push @exclude_prefix, $1;
            }
            elsif ($pattern =~ /^REGEXP:(.+)$/) {
                push @exclude_regexp, $1;
            }
            else {
                my_die "$config_path: Line $line_num is malformed: \"$ARG\".";
            }
        }
        elsif (/^\s*EXCLUDE_TITLE\s+<([^>]*)>\s*$/i) {
            push @exclude_title, $1;
        }
        elsif (/^\s*FOLLOW_REDIRECTS\s+(\S+)\s+(\S+)\s*$/i) {
            $Y_or_N = uc($1);  # normalize to upper-case
            $URL    = $2;

            if ($Y_or_N !~ /^[NY]$/) {
                my_die "$config_path: Line $line_num has illegal argument"
                     . " to FOLLOW_REDIRECTS: \"$ARG\".";
            }

            if ($URL =~ /^PREFIX:(.+)$/) {
                # TBD: Keep these in sorted order to allow faster lookup?
                push @redir_prefix, "$1 $Y_or_N";
            }
            elsif ($URL =~ /^REGEXP:(.+)$/) {
                # Can't speed up lookup by sorting these.
                push @redir_regexp, "$1 $Y_or_N";
            }
            else {
                $redir_URL{$URL} = $Y_or_N;
            }
        }
        elsif (/^\s*LEVEL_ADJUST\s+(\S+)\s*([-+0-9]+)\s*$/i) {
            push @level_adjust, [$1, $2];
        }
        elsif (/^\s*TITLE_MANGLE\s+<([^>]*)><([^}]*)>\s*$/i) {
            push @title_mangle, [$1, $2, 0];
        }
        elsif (/^\s*TITLE_MANGLE\s+<([^>]*)><([^}]*)>\s*(\d+)\s*$/i) {
            push @title_mangle, [$1, $2, $3];
        }
        elsif (/^\s*(\S+)(\s+(\S+)(\s+(\S+))?)$/) {
            # Expected non-200 status.
            $URL     = $1;
            $param_2 = $3;
            $param_3 = $5;

            if ($param_3) {
                $expected{$URL} = "$param_2 $param_3";
            }
            else {
                $expected{$URL} = $param_2;
            }
        }
        else {
            my_die "$config_path: Line $line_num is malformed: \"$ARG\".";
        }

        $line_num++;
    }
}

$last_offending_parent = "";

foreach $site (@ARGV) {
    $base_indent_level = ($site =~ tr</><>) + ($site =~ m</$> ? 0 : 1);
    visit_link($site, $site);
}

if ($opt_v and $last_link_had_no_newline) {
    print $terminal->Tputs('cd');  # clear to end of screen
}

if ($opt_s) {
    if (not $opt_t) {
        my_die "-s specified to create site map, but -t template not specified.";
    }
    if (not open(TEMPLATE, $opt_t)) {
        my_die "Can't open '$opt_t' for reading: $OS_ERROR";
    }
    if (not open(SITE_MAP, ">$opt_s")) {
        my_die "Can't open '$opt_s' for writing: $OS_ERROR";
    }

    while (<TEMPLATE>) {