#!/usr/bin/perl
#
# site_spider
#
# AUTHOR:
#   Dan Harkless
#
# COPYRIGHT:
#   This file is Copyright (C) 2025 by Dan Harkless, and is released under the
#   GNU General Public License.
#
# USAGE:
#   % site_spider [-1clmnrvV] [-f <config_file>] [-s <site_map_file>]
#     [-t <template_file>] [-T <seconds>] [-u <user_agent>] <url> ...
#
# DESCRIPTION:
#   Spiders through a site, given a starting URL.  Reports links that don't
#   result in a 200 OK.  Links that are expected to be broken, or to redirect,
#   can be marked as such in a config file, to prevent their reporting.  Can
#   also generate a site map.
#
# REQUIREMENTS:
#   site_spider requires WWW::Mechanize, available from CPAN.  Note that
#   WWW::Mechanize may require you to upgrade your versions of (or will require
#   you to install, if you don't have them already) HTML::Parser, LWP, and URI.
#
# COMMANDLINE OPTIONS:
#   -1
#     Normally if there's a URL that's linked to from multiple pages on the
#     site, site_spider will visit it multiple times, so that if, say, the link
#     is 404, we can report all the pages where it appears.  If you'd like
#     site_spider to take less time, you can use the -1 option to visit each
#     link only once, rather than visiting the link once for each unique page
#     that links to it.  (After fixing the first page that contained the 404
#     link found with 'site_spider -1', you can either wait until next time
#     site_spider runs to find any others, or proactively search through the
#     rest of your pages with some other tool like grep.)
#
#   -c
#     Output non-200 links without the usual ":" after them, to make it easier
#     to copy & paste site_spider's output into the ~/.site_spider config file
#     (for expected 30x statuses).
#
#   -f <config_file>
#     By default, site_spider will look for its config file in
#     $HOME/.site_spider, if $HOME is set, or else ./.site_spider.  To override
#     this, specify a file with -f.
#
#   -l
#     Visit local URLs on the specified site only -- do not visit offsite URLs.
#
#   -m
#     Try to follow mailto: links.  You'll have to set the SENDMAIL environment
#     variable if your copy of sendmail (or a sendmail-mimicking wrapper) is
#     not in one of the standard locations.  In my testing WWW::Mechanize
#     always returned status 400, though.
#
#   -n
#     Try to follow news: links.  You may have to create a .libnetrc file (see
#     Net::Config) for this to work.  I have not been able to successfully test
#     WWW::Mechanize's support for this.
#
#   -r
#     Follow redirects.  This means that even if a URL gets a non-200 status --
#     specifically, a 30x status -- this will not cause site_spider to report
#     it unless the potential chain of redirects ends with a non-200 status.
#     Note that if you always use -r, in the case of redirects that are only in
#     place for awhile, followed later by a 404, you make it harder for
#     yourself to determine where the new home of the page is, since you don't
#     get a heads-up until the chain is broken.  A better practice is to keep
#     track in your .site_spider file of known redirects that you link to, and
#     their expected targets (see the CONFIG FILE section below).
#
#   -s <site_map_file>
#     Create a site map and write it to <site_map_file> (which will be
#     overwritten without warning if it already exists).  Special processing
#     occurs in the footers of pages.  If a line includes the HTML comment
#     fragment "FOOTER -->", we assume we're in the footer.  In there, if a
#     line includes the regular expression
#     /[Cc]reated: +([A-Za-z]+) +(\d+), (\d+)/, that "<month> <day>, <year>"
#     date will be included in the "Created" column of the site map,
#     reformatted to YYYY-MM-DD.
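#     For instance -- a hypothetical footer, since the exact markup will vary
#     from site to site -- a page whose footer contains:
#
#         <!-- FOOTER -->
#         Created: August 27, 2025
#
#     would get "2025-08-27" in its "Created" column.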
#     If a footer line has regexp
#     /[Ll]ast.*?modifi(cation|ed): +([A-Za-z]+) +(\d+), (\d+)/, that date will
#     be included for the URL in the site map's "Last Modified" column, again
#     reformatted to YYYY-MM-DD.
#
#   -t <template_file>
#     Must be specified together with -s.  <template_file> must be an HTML file
#     with all the content of the site map page except for the actual link
#     rows.  In place of the link rows must be the line "", which will be
#     replaced.  Also, in the lines following the site map table, any line
#     matching the regexp /^Last modified: / will be replaced by a line with
#     the current date, of the form "Last modified: August 27, 2025\n".
#
#   -T <seconds>
#     Timeout for connecting to a link.  Defaults to 180 seconds if not
#     specified.
#
#   -u <user_agent>
#     Specify the User-Agent header to use.  By default site_spider uses
#     "Mozilla/4.0 (compatible; MSIE 6.0; site_spider)".  This should allow
#     site_spider to check links on crappy sites that don't let you in unless
#     you're running Internet Explorer.  If that User-Agent string isn't
#     working for you, you can specify a different one with -u.
#
#   -v
#     Verbose.  Temporarily prints each URL that's being visited (and then
#     backspaces to print the next one, in-place).  Requires Term::Cap to be
#     installed (available from CPAN).
#
#   -V
#     More verbose.  Permanently prints each URL that's being visited.
#
# CONFIG FILE:
#   The .site_spider file consists of lines with two or three parameters,
#   separated by spaces or tabs.  Each line either begins with a directive
#   keyword (matched case-insensitively) followed by its argument(s), or begins
#   with a URL followed by the HTTP status expected for it:
#
#   EXCLUDE pattern
#     To avoid visiting, printing, or including URLs in the site map, specify
#     them with "EXCLUDE" as the 1st param., and either a prefix to match,
#     e.g.:
#
#       EXCLUDE PREFIX:https://www.site.tld/do_not_go_here/
#
#     or a regular expression, as in:
#
#       EXCLUDE REGEXP:^https?://(www\.)?site\.tld/do_not_include_files/.*?[^/]$
#
#   EXCLUDE_TITLE <regexp>
#     Pages can also be omitted from the site map (though they do get visited)
#     if their titles match the given regular expression, which must be
#     surrounded by angle brackets.  For example:
#
#       EXCLUDE_TITLE <ERROR>
#
#   FOLLOW_REDIRECTS (N|Y) pattern
#     This option overrides the setting of the -r commandline option for a
#     given URL or set of URLs matched by a prefix or regular expression.  For
#     example, if you want site_spider to follow redirects on the URL
#     "http://site.tld/" even when -r isn't specified, you would include the
#     following line in your .site_spider file:
#
#       FOLLOW_REDIRECTS Y http://site.tld/
#
#     If you wanted site_spider to avoid following redirects for all URLs on
#     https://www.site.tld/, you could specify the following:
#
#       FOLLOW_REDIRECTS N PREFIX:https://www.site.tld/
#
#     Regular expressions can also be used, as in:
#
#       FOLLOW_REDIRECTS N REGEXP:^https?://(www\.)?site\.tld/
#
#     If multiple FOLLOW_REDIRECTS setting lines can match a given URL, which
#     setting takes precedence is undefined.
#
#     The 'N' or 'Y' argument to this option can also be specified as
#     lower-case 'n' or 'y'.
#
#   LEVEL_ADJUST regexp adjustment
#     I have pages on my site that are generated by scripts using PATH_INFO,
#     which causes them to be indented an extra level due to the extra '/' in
#     the URLs.
#     This can be corrected by specifying a regexp (note prefixes are not
#     supported on this one; also note that the slashes in the regexp below are
#     directory delimiters, not a /.../ surrounding the regexp as in Perl
#     syntax), plus an adjustment amount, as in:
#
#       LEVEL_ADJUST /image_album\.pd/ -1
#
#   TITLE_MANGLE <from_regexp><to_regexp> [starting_level]
#     For use with -s, when constructing a site map.  Each TITLE_MANGLE line
#     can include a from_regexp and a to_regexp to transform it into (using the
#     Perl construct "s/from_regexp/to_regexp/g").  Note that the angle
#     brackets above are not to denote that "from_regexp" and "to_regexp" are
#     variables -- they must be typed explicitly (since the regexps can include
#     unencoded space characters).  If starting_level is specified, the
#     mangling will only be done on pages with that level of nesting in the
#     site map, and above, where 0 denotes the top page's level.  An example
#     would be:
#
#       TITLE_MANGLE <^Dan Harkless' ><> 1
#
#     The titles on my site generally start with "Dan Harkless' ", but
#     repeating that for every link in the site map would be silly.  It's
#     meaningful on the first link, "Dan Harkless' Home Page", though, so we
#     only apply the mangling to level 1 and above.
#
#     I also have pages whose titles start with "Dan Harkless' Concert
#     Reviews: ".  Because the TITLE_MANGLE lines are processed in the order
#     they appear, the following line:
#
#       TITLE_MANGLE <^Concert Reviews: ><>
#
#     would need to either come _after_ the one that removes "Dan Harkless' ",
#     or else come first, and include that text, as in:
#
#       TITLE_MANGLE <^Dan Harkless' Concert Reviews: ><>
#       TITLE_MANGLE <^Dan Harkless' ><>
#
#     Replacement regexps are not limited to blank.
#
#   url http_response_code [redirect_url]
#     Tells site_spider to expect an HTTP response code other than 200 for the
#     URL specified as the 1st parameter.  The 2nd parameter is what code is
#     expected, e.g. 404 or 301.  In the case of a 3xx redirect response code
#     like 301, a 3rd parameter can optionally be specified with the URL the
#     original URL is expected to redirect to (and site_spider will check
#     that).  For example:
#
#       https://site.tld/dir/known_broken.html 404
#       https://site.tld/photos/ 301 https://site.tld/image_album.pd/photos/
#
#   Comments are also possible in site_spider's config file -- any lines where
#   the first non-whitespace character is a '#' will be ignored.  (Comments
#   cannot be appended to lines with active configuration directives.)
#
# EXAMPLE:
#   % site_spider -1lv -s site_map.html -t template.html https://site.tld/
#
# TO DO:
#   Add a "cleanup" option for site_spider to report any URLs in the
#   .site_spider file that were never encountered while spidering the site.
#
#   Add an option to accept and present cookies, to deal with sites like
#   skweezer.com that give you an error response if you don't?
#
#   Add more flexible ways to get the Created and Last Modified dates for URLs?
#
#   Add an option to generate a Google Sitemap.
#
#   Add an option to generate an RSS feed.
#
# DATE       MODIFICATION
# ========== ==================================================================
# 2025-10-01 My image_album script outputs the HTTP header as early as it can,
#            so that if my code or module code outputs any error messages, they
#            won't cause non-conforming HTTP.  Unfortunately this means that
#            when it's passed a non-existent (e.g. deleted or renamed) album
#            directory, it reports the error with HTTP status 200.
#            This causes nonexistent image_album URLs to be propagated into the
#            new version of the site map by being seen in the existing version.
#            Can't EXCLUDE the site map URL, since we want the map link to
#            appear in the map.  Therefore, added the ability to EXCLUDE_TITLE,
#            which takes a regexp that can match "ERROR" pages, for instance.
# 2025-09-04 Added TO DOs for Google Sitemaps and RSS.
# 2025-09-03 Updated documentation header.
# 2025-08-30 Added row number comments to the start of the link <tr>s, so that
#            a simple alphasort can be done in JS.
# 2025-08-27 Added -t option and completed initial full implementation of -s.
# 2025-08-27 Added LEVEL_ADJUST config file option.
# 2025-08-26 Added EXCLUDE and TITLE_MANGLE config file options.
# 2025-08-25 At some point, $terminal->Tputs('cub1') stopped working to do
#            backspacing past a line-wrap, and now we need to simply use "\b".
# 2025-08-18 WWW::Mechanize->new() now needs autocheck => 0 to not die on 404s.
# 2009-02-27 Original.

## Modules used ################################################################

use English qw(-no_match_vars);  # allow use of names like @ARG rather than @_
use File::Basename;              # for basename()
use Getopt::Std;                 # for getopts()
use POSIX qw(strftime);
use WWW::Mechanize;              # for web-fetching and link-extracting routines

# Output warnings, but not the bogus "Deep recursion" one.  Pre-5.6.0 Perls
# don't understand this syntax, so we use if.pm (available from CPAN) here.
use if $] >= 5.006, warnings;
no if $] >= 5.006, warnings => 'recursion';

# Use UTF-8 for standard and user-opened filehandles.  May require installing a
# perl-open package, or similar:
use open qw(:std :encoding(UTF-8));

# TBD: Use one of the robots-parsing modules and optionally obey our own robots?

## Subroutines #################################################################

sub a_is_parent_of_b {
    my $a = shift;
    my $b = shift;

    if ($a eq $b) {
        # URL can't be a parent of itself -- use b_is_on_site_a() if you want
        # that case to return true.
        return 0;
    }
    elsif ($a =~ m</$>) {
        # URL $a ends with a slash.  For example:
        #     a_is_parent_of_b("http://cool.tld/",
        #                      "http://cool.tld/page.html") == 1
        #     a_is_parent_of_b("http://cool.tld/",
        #                      "http://cool.tld/images/cow.jpg") == 1
        return substr($b, 0, length($a)) eq $a;
    }
    else {
        # URL $a doesn't end with a slash.  For example:
        #     a_is_parent_of_b("http://lame.tld/home.asp",
        #                      "http://lame.tld/page.asp") == 0
        #     a_is_parent_of_b("http://lame.tld/home.asp",
        #                      "http://lame.tld/images/cow.jpg") == 1
        $num_slashes_in_a = ($a =~ tr</><>);
        $num_slashes_in_b = ($b =~ tr</><>);
        $a =~ s<[^/]*$><>;  # remove anything after final directory

        return substr($b, 0, length($a)) eq $a
               && $num_slashes_in_a < $num_slashes_in_b;  # 'and' would need return()
    }
}

sub b_is_on_site_a {
    my $a = shift;
    my $b = shift;

    if ($a eq $b) {
        # http://cool.tld/ is on site http://cool.tld/.
        return 1;
    }
    else {
        # http://lame.tld/page.asp is on site http://lame.tld/home.asp.
        $a =~ s<[^/]*$><>;  # remove anything after final directory
        return a_is_parent_of_b($a, $b);
    }
}

sub backspace {
    my $indent_level = shift;
    my $page         = shift;

    # The following used to be required to get backspacing across a line-wrap
    # to work, but now, using the Tputs() output strings fails to work, and
    # only "\b" works.  Not sure what changed, but will just go with it for
    # now.
    #
    #     if (defined($terminal->Tputs('cub1'))) {
    #         # Backspace when in the leftmost column is undefined behavior in
    #         # terminfo, so we can't just backspace here.
    #         # 'cub1' (cursor_left) is defined for terminals that have the
    #         # 'bw' (auto_left_margin) capability, though, so we'll use 'cub1'
    #         # if it's defined.
    #         $backspace_string = $terminal->Tputs('cub1');
    #     }
    #     else {
    #         # TBD: Surely there's something better to do than just
    #         # backspacing in this case, which will fail to wrap back around
    #         # past the leftmost column on, e.g., Mac OS X 10.5 xterm if its
    #         # reverseWrap resource isn't explicitly set to true.
    #         $backspace_string = $terminal->Tputs('kb');
    #     }

    for ($i = 1; $i <= $indent_level + length($page); $i++) {
        # print $backspace_string;
        print "\b";
    }

    $last_link_had_no_newline = 1;
}

sub my_die {
    print STDERR "$progname: @ARG Aborting.\n";
    exit 1;
}

sub indent {
    my $indent_level = shift;

    for ($i = 1; $i <= $indent_level; $i++) {
        print " ";
    }
}

sub number_of_slashes {
    my $url = shift;

    return $url =~ tr</><>;
}

sub visit_link {
    my $page   = shift;
    my $parent = shift;

    my $indent_level = 0;

    if (!visited($page, $parent)) {
        # TBD: Have 3rd level of verbosity outputting matching PREFIX / REGEXP?
        foreach $prefix_setting (@exclude_prefix) {
            if (substr($page, 0, length($prefix_setting)) eq $prefix_setting) {
                if ($opt_V) {
                    print "$page: PREFIX-excluded.\n";
                }
                return;
            }
        }
        foreach $regexp_setting (@exclude_regexp) {
            if ($page =~ /$regexp_setting/) {
                if ($opt_V) {
                    print "$page: REGEXP-excluded.\n";
                }
                return;
            }
        }

        visited_set($page, $parent, 1);

        if ($page =~ m<^([^:]+):>) {
            $protocol = $1;

            if ($mech->is_protocol_supported($protocol)) {
                if ($protocol eq "mailto" and !$opt_m) {
                    if ($opt_V) {
                        # $indent_level calculation explained below.
                        $indent_level = ($parent =~ tr</><>)
                                        + ($parent =~ m</$> ? 0 : 1)
                                        - $base_indent_level + 1;
                        indent($indent_level);
                        print "$page: Skipping since -m was not specified.\n";
                    }
                }
                elsif ($protocol eq "news" and !$opt_n) {
                    if ($opt_V) {
                        # $indent_level calculation explained below.
                        $indent_level = ($parent =~ tr</><>)
                                        + ($parent =~ m</$> ? 0 : 1)
                                        - $base_indent_level + 1;
                        indent($indent_level);
                        print "$page: Skipping since -n was not specified.\n";
                    }
                }
                else {
                    # $protocol is http[s]
                    if ($redir_URL{$page}) {
                        if ($redir_URL{$page} eq 'Y') {
                            $mech->requests_redirectable(\@redirs_y);
                        }
                        else {  # already checked at parse time for illegal vals
                            $mech->requests_redirectable(\@redirs_n);
                        }
                    }
                    else {
                        $specific_redir_setting = 0;

                        foreach $prefix_setting (@redir_prefix) {
                            ($URL_prefix, $redir_setting) =
                                split(/ /, $prefix_setting);
                            if (substr($page, 0, length($URL_prefix))
                                eq $URL_prefix) {
                                if ($redir_setting eq 'Y') {
                                    $mech->requests_redirectable(\@redirs_y);
                                }
                                else {
                                    $mech->requests_redirectable(\@redirs_n);
                                }
                                $specific_redir_setting = 1;
                                last;
                            }
                        }

                        if (not $specific_redir_setting) {
                            foreach $regexp_setting (@redir_regexp) {
                                ($URL_regexp, $redir_setting) =
                                    split(/ /, $regexp_setting);
                                if ($page =~ /$URL_regexp/) {
                                    if ($redir_setting eq 'Y') {
                                        $mech->requests_redirectable(\@redirs_y);
                                    }
                                    else {
                                        $mech->requests_redirectable(\@redirs_n);
                                    }
                                    $specific_redir_setting = 1;
                                    last;
                                }
                            }
                        }

                        if (not $specific_redir_setting) {
                            # Calling get() appears to create a copy of the
                            # object (for use with the next link) that remembers
                            # the setting of requests_redirectable at the time
                            # get() was called, making it ineffective for us to
                            # just remember the old setting prior to calling
                            # get() and restore that setting to the current
                            # object after the get().  Instead, we need to
                            # explicitly set requests_redirectable before each
                            # get().
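                            # (For reference, @redirs_y and @redirs_n are set
                            # up in Main below: ("GET", "HEAD") turns redirect
                            # following back on for GET and HEAD requests --
                            # LWP's default -- while the empty list turns it
                            # off.)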
                            if ($opt_r) {
                                $mech->requests_redirectable(\@redirs_y);
                            }
                            else {
                                $mech->requests_redirectable(\@redirs_n);
                            }
                        }
                    }

                    $mech->get($page);

                    if ($opt_v or $opt_V) {
                        if ($mech->is_html() and b_is_on_site_a($site, $page)
                            and not a_is_parent_of_b($parent, $page)) {
                            # Indent by the number of '/'s in this URL, plus 1
                            # if the URL doesn't end in '/', minus the
                            # indentation level the site's base URL would have
                            # if we didn't subtract here (e.g. 3 for
                            # "http://www.example.com/").
                            $indent_level = ($page =~ tr</><>)
                                            + ($page =~ m</$> ? 0 : 1)
                                            - $base_indent_level;
                            indent($indent_level);
                        }
                        else {
                            # This is a true child of the parent page (e.g. an
                            # image, or a file like http://site/dir/file.html
                            # where the parent was http://site/dir/), or a
                            # cross-link to a page that's above the directory
                            # of the specified $site (e.g. a cross-link to
                            # http://site/ where $site was specified as
                            # http://site/~user/), so indent by the number of
                            # '/'s in the parent URL, plus 1 if the parent URL
                            # doesn't end in '/', minus the site's base indent
                            # level, plus one.
                            $indent_level = ($parent =~ tr</><>)
                                            + ($parent =~ m</$> ? 0 : 1)
                                            - $base_indent_level + 1;
                            indent($indent_level);
                        }

                        # TBD: If the get of the current URL timed out or
                        # otherwise took a long time, then when using -v,
                        # waiting until here to print the page name and clear
                        # to EOS misleadingly makes it look like the previous
                        # URL is the one responsible for the delay.
                        print "$page";
                        $last_link_had_no_newline = 0;

                        if ($opt_v) {
                            # Clear to end of screen to wipe out remnants of
                            # previous URLs longer than this one.  'ce', clear
                            # to end of line, isn't enough since if the line
                            # wrapped it'll only clear to the end of the first
                            # physical line.
                            print $terminal->Tputs('cd');
                        }
                    }

                    if ($mech->status != 200) {
                        $expected_status          = 200;
                        $expected_new_URL         = "";
                        $expected_new_URL_matches = 0;

                        if ($expected{$page}) {
                            if ($expected{$page} =~ /(\S+) (\S+)/) {
                                $expected_status  = $1;
                                $expected_new_URL = $2;

                                if ($mech->response()->header("Location")) {
                                    if ($expected_new_URL =~ /^PREFIX:(.+)$/) {
                                        $expected_new_URL_prefix = $1;
                                        if (substr($mech->response()
                                                        ->header("Location"),
                                                   0,
                                                   length($expected_new_URL_prefix))
                                            eq $expected_new_URL_prefix) {
                                            $expected_new_URL_matches = 1;
                                        }
                                    }
                                    elsif ($expected_new_URL =~ /^REGEXP:(.+)$/) {
                                        $expected_new_URL_regexp = $1;
                                        if ($mech->response()->header("Location")
                                            =~ /$expected_new_URL_regexp/) {
                                            $expected_new_URL_matches = 1;
                                        }
                                    }
                                    else {
                                        if ($mech->response()->header("Location")
                                            eq $expected_new_URL) {
                                            $expected_new_URL_matches = 1;
                                        }
                                    }
                                }
                            }
                            else {
                                # In this case, $expected{$page} isn't two
                                # tokens separated by a space, so it's just a
                                # bare status code like 403.
                                $expected_status = $expected{$page};
                            }
                        }

                        if ($mech->status == $expected_status
                            and (not ($mech->status >= 300
                                      and $mech->status <= 399)
                                 or $expected_new_URL_matches)) {
                            # This non-200 was expected.
                            if ($opt_v) {
                                backspace($indent_level, $page);
                            }
                            elsif ($opt_V) {
                                print "\n";
                            }
                        }
                        else {
                            # This non-200 status was not expected.
                            if (!$opt_v and !$opt_V) {
                                print "$page";
                            }
                            print ":" if not ($opt_c);
                            print " ", $mech->status;
                            if ($mech->status >= 300 and $mech->status <= 399
                                and $mech->response()->header("Location")) {
                                print " ", $mech->response()->header("Location");
                            }
                            if (not $opt_V
                                and $parent ne $last_offending_parent) {
                                print " [linked from $parent]";
                                $last_offending_parent = $parent;
                            }
                            print "\n";
                        }
                    }
                    else {
                        # Status 200.
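                        # Erase the URL we just printed under -v (or finish its
                        # line under -V), then, if this is an HTML page on the
                        # site we're spidering, make its site map entry (under
                        # -s) and visit its images and links.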
                        if ($opt_v) {
                            backspace($indent_level, $page);
                        }
                        elsif ($opt_V) {
                            print "\n";
                        }

                        if ($mech->is_html() and b_is_on_site_a($site, $page)) {
                            # When we visit an image, $mech->images() and
                            # $mech->links() still return further images and
                            # links from the HTML page we were just at, so only
                            # call them if the current link is an HTML page
                            # (that's on the site we started out on).
                            if ($opt_s) {
                                # Make a site map entry.
                                @page_lines = split(/\n/, $mech->content());

                                $in_footer = 0;
                                undef $created;
                                undef $last_modified;

                                foreach $line (@page_lines) {
                                    if (not $in_footer) {
                                        if ($line =~ /FOOTER -->/) {
                                            $in_footer = 1;
                                        }
                                    }
                                    elsif ($line =~ /[Cc]reated: +([A-Za-z]+) +(\d+), (\d+)/) {
                                        $created = YYYY_MM_DD($1, $2, $3);
                                    }
                                    elsif ($line =~ /[Ll]ast.*?modifi(cation|ed): +([A-Za-z]+) +(\d+), (\d+)/) {
                                        $last_modified = YYYY_MM_DD($2, $3, $4);
                                    }
                                }

                                if (not $last_modified) {
                                    # TBD: Rewrite the timestamp to date format.
                                    $last_modified =
                                        $mech->response()->last_modified;
                                }

                                $title_excluded = 0;
                                foreach $exclude_title_RE (@exclude_title) {
                                    if ($mech->title() =~ /$exclude_title_RE/) {
                                        $title_excluded = 1;
                                        last;
                                    }
                                }
                                if (not $title_excluded) {
                                    $site_map{$page} = [$mech->title(),
                                                        $created,
                                                        $last_modified];
                                }
                            }

                            foreach $image ($mech->images()) {
                                $url = $image->url_abs();
                                visit_link($url, $page);
                            }
                            foreach $link ($mech->links()) {
                                $url = $link->url_abs();
                                $url =~ s/\#.*$//;  # remove anchor ref., if any
                                if (not $opt_l or b_is_on_site_a($site, $url)) {
                                    visit_link($url, $page);
                                }
                            }
                        }
                    }

                    $mech->back();
                }
            }
            elsif ($opt_V) {
                print ": $protocol protocol is unsupported.\n";
            }
        }
        else {
            print ": URL is malformed -- cannot determine protocol.\n";
        }
    }
}

sub visited {
    $page   = shift;
    $parent = shift;

    if ($opt_1) {
        return $visited{$page};
    }
    else {
        return $visited{$page}{$parent};
    }
}

sub visited_set {
    # Use a separate set function rather than using one function with Perl
    # 5.6+'s experimental lvalue function declaration feature.
    $page   = shift;
    $parent = shift;
    $value  = shift;

    if ($opt_1) {
        $visited{$page} = $value;
    }
    else {
        $visited{$page}{$parent} = $value;
    }
}

sub YYYY_MM_DD {
    my %month_names = qw(January   1  February  2  March     3  April     4
                         May       5  June      6  July      7  August    8
                         September 9  October  10  November 11  December 12);
    my $month_name = shift;
    my $day        = shift;
    my $year       = shift;

    return sprintf("%d-%02d-%02d", $year, $month_names{$month_name}, $day);
}

## Main ########################################################################

$progname = basename($PROGRAM_NAME);

# Make stdout autoflush, since we print the URL with no newline before we try
# to visit it.
$OUTPUT_AUTOFLUSH = 1;

# Eliminate "used only once" warnings.
use vars qw($opt_1 $opt_c $opt_f $opt_l $opt_m $opt_n $opt_r $opt_s $opt_t
            $opt_T $opt_u $opt_v $opt_V);

if (not getopts('1cf:lmnrs:t:T:u:vV') or scalar(@ARGV) == 0) {
    print STDERR "Usage:\n $progname",
                 " [-1clmnrvV] [-f <config_file>] [-s <site_map_file>]",
                 " [-t <template_file>] [-T <seconds>] [-u <user_agent>]",
                 " <url> ...\n";
    exit 1;
}

if (not $opt_u) {
    # TBD: Update the default User-Agent?
    $opt_u = "Mozilla/4.0 (compatible; MSIE 6.0; site_spider)";
}

$mech = WWW::Mechanize->new(agent => $opt_u, autocheck => 0);

@redirs_n = ();
@redirs_y = ("GET", "HEAD");

if ($opt_T) {
    $mech->timeout($opt_T);
}

if ($opt_v) {
    require POSIX;      # for getospeed(), etc.
    require Term::Cap;  # for Tputs(), etc.

    $termios = new POSIX::Termios;
    $termios->getattr();
    $ospeed = $termios->getospeed;

    $terminal = Term::Cap->Tgetent({TERM => undef, OSPEED => $ospeed});
    # TBD: Error handling if we aren't hooked up to a terminal.
}

if (not $opt_V) {
    $mech->quiet(1);
}

if ($opt_f) {
    $config_path = $opt_f;
}
else {
    $config_path = ".site_spider";
    if ($ENV{HOME}) {
        $config_path = "$ENV{HOME}/$config_path";
    }
}

if (open(CONFIG, $config_path)) {
    $line_num = 1;

    while (<CONFIG>) {
        chomp;

        if (/^\s*$/) {
            # Ignore all-whitespace lines.
        }
        elsif (/^\s*\#/) {
            # Ignore comment lines.
        }
        elsif (/^\s*EXCLUDE\s+(\S+)\s*$/i) {
            $pattern = $1;
            if ($pattern =~ /^PREFIX:(.+)$/) {
                push @exclude_prefix, $1;
            }
            elsif ($pattern =~ /^REGEXP:(.+)$/) {
                push @exclude_regexp, $1;
            }
            else {
                my_die "$config_path: Line $line_num is malformed: \"$ARG\".";
            }
        }
        elsif (/^\s*EXCLUDE_TITLE\s+<([^>]*)>\s*$/i) {
            push @exclude_title, $1;
        }
        elsif (/^\s*FOLLOW_REDIRECTS\s+(\S+)\s+(\S+)\s*$/i) {
            $Y_or_N = uc($1);  # normalize to upper-case
            $URL    = $2;

            if ($Y_or_N !~ /^[NY]$/) {
                my_die "$config_path: Line $line_num has illegal argument"
                     . " to FOLLOW_REDIRECTS: \"$ARG\".";
            }

            if ($URL =~ /^PREFIX:(.+)$/) {
                # TBD: Keep these in sorted order to allow faster lookup?
                push @redir_prefix, "$1 $Y_or_N";
            }
            elsif ($URL =~ /^REGEXP:(.+)$/) {
                # Can't speed up lookup by sorting these.
                push @redir_regexp, "$1 $Y_or_N";
            }
            else {
                $redir_URL{$URL} = $Y_or_N;
            }
        }
        elsif (/^\s*LEVEL_ADJUST\s+(\S+)\s*([-+0-9]+)\s*$/i) {
            push @level_adjust, [$1, $2];
        }
        elsif (/^\s*TITLE_MANGLE\s+<([^>]*)><([^}]*)>\s*$/i) {
            push @title_mangle, [$1, $2, 0];
        }
        elsif (/^\s*TITLE_MANGLE\s+<([^>]*)><([^}]*)>\s*(\d+)\s*$/i) {
            push @title_mangle, [$1, $2, $3];
        }
        elsif (/^\s*(\S+)(\s+(\S+)(\s+(\S+))?)$/) {
            # Expected non-200 status.
            $URL     = $1;
            $param_2 = $3;
            $param_3 = $5;

            if ($param_3) {
                $expected{$URL} = "$param_2 $param_3";
            }
            else {
                $expected{$URL} = $param_2;
            }
        }
        else {
            my_die "$config_path: Line $line_num is malformed: \"$ARG\".";
        }

        $line_num++;
    }
}

$last_offending_parent = "";

foreach $site (@ARGV) {
    $base_indent_level = ($site =~ tr</><>) + ($site =~ m</$> ? 0 : 1);
    visit_link($site, $site);
}

if ($opt_v and $last_link_had_no_newline) {
    print $terminal->Tputs('cd');  # clear to end of screen
}

if ($opt_s) {
    if (not $opt_t) {
        my_die "-s specified to create site map, but -t template not specified.";
    }
    if (not open(TEMPLATE, $opt_t)) {
        my_die "Can't open '$opt_t' for reading: $OS_ERROR";
    }
    if (not open(SITE_MAP, ">$opt_s")) {
        my_die "Can't open '$opt_s' for writing: $OS_ERROR";
    }

    while (<TEMPLATE>) {