#!/home/merlyn/bin/perl -Tw
 use strict;
 
 use LWP::UserAgent;
 use HTML::Parser;
 use URI::URL;
 
 ## begin configure
 
 my @CHECK =                     # list of initial starting points
   qw(http://www.stonehenge.com/index.html);
 sub PARSE {                     # verify existence, parse for further URLs
   ## $_[0] is the absolute URL
   $_[0] =~ m!^http://www\.stonehenge\.com/! and not
     $_[0] =~ /refindex|col\d\d\.html|fors/;
 }
 sub PING {                      # verify existence, but don't parse
   ## $_[0] is the absolute URL
   $_[0] =~ m!^(http|ftp|gopher)://! and not
     $_[0] =~ m!perl\.com/CPAN/!; # presume all CPAN refs are good
 }
 
 ## end configure (no user-servicable parts below this line)
 
 BEGIN {
   package ParseLink;
 
   @ParseLink::ISA = qw(HTML::Parser);
 
   sub set_line {                # $instance->set_line(nnn)
     my $self = shift;
     $self->{Line} = shift;
   }
 
   ## $self->{Links} = {
   ##    "url" => { "line" => "count", "line" => "count" ... }, ...
   ## };
   sub start {                   # called by parse
     my $self = shift;
     my ($tag, $attr) = @_;
     my $link;
     $link = $attr->{href} if $tag eq "a";
     $link = $attr->{src} if $tag eq "img";
     if (defined $link) {
       $self->{Links}{$link}{$self->{Line}}++;
     }
   }
 
   sub get_links {               # $instance->get_links()
     my $self = shift;
     $self->{Links};
   }
 }                               # end of ParseLink
 
 my $ua = new LWP::UserAgent;
 $ua->agent("hverify/2.0");
 $ua->env_proxy;
 
 $| = 1;
 
 ## global database
 my %URL_LIST = ();
 ## format:
 ## $URL_LIST{"some url"} = {
 ##   Source => { "where" => "count", "where" => "count", ... },
 ##   Dest => { "where" => "count", "where" => "count", ... },
 ##   Base => "base", ## if base != url
 ##   Status => "Whatever",  ## undef if not checked yet
 ## }
 
 ## prime the pump
 for (@CHECK) {
   $URL_LIST{$_}{Source}{"[requested]"}++;
 }
 
 ## now walk it
 
 {
   my @this_time = grep !defined $URL_LIST{$_}{Status}, keys %URL_LIST;
   last unless @this_time;
  URL:
   for my $url (@this_time) {
     if (PARSE $url) {
       ## print "Fetching $url\n";
       my $request = new HTTP::Request('GET', $url);
       my $response = $ua->request($request); # fetch!
       unless ($response->is_success) {
         $URL_LIST{$url}{Status} =
           "NOT Verified (status = ".($response->code).")";
         next URL;
       }
       unless ($response->content_type =~ /text\/html/i) {
         $URL_LIST{$url}{Status} = "Verified (content not HTML)";
         next URL;
       }
       my $base = $response->base;
       $URL_LIST{$url}{Base} = $base if $base ne $url;
       my $p = ParseLink->new;
       {
         my @content = $response->content =~ /(.*\n?)/g;
         my $line = 1;
         {
           last unless @content;
           $p->set_line($line);  # tell it the line number
           $p->parse(shift @content); # and parse it
           $line++;
           redo;
         }
       }
       $p->parse(undef);         # signal the end of parse
       my $links = $p->get_links; # key is relative url, value is href
       for my $link (sort keys %$links) {
         my $abs = url($link, $base)->abs;
         ## requested url is used for forward relative xref links,
         ## but actual url after redirection is used for backwards links.
         my ($forward_rel, $backward_rel) = do {
           local ($^W) = 0;      # workaround for buglet
           map { $_ || "." } url($abs, $url)->rel, url($base, $abs)->rel;
         };
         my $where = $links->{$link}; # key is line number, val is count
         for my $line (sort keys %$where) {
           $URL_LIST{$abs}{Source}{"$backward_rel at line $line"} +=
             $where->{$line};
           $URL_LIST{$url}{Dest}{"$forward_rel at line $line"} +=
             $where->{$line};
         }
       }
       $URL_LIST{$url}{Status} = "Verified (and parsed)";
       next URL;
     }
     if (PING $url) {
       ## print "Verifying $url\n";
       my $response;
       for my $method (qw(HEAD GET)) {
         my $request = new HTTP::Request($method,$url);
         $response = $ua->request($request); # fetch!
         if ($response->is_success) {
           $URL_LIST{$url}{Status} = "Verified (contents not examined)";
           next URL;
         }
       }
       $URL_LIST{$url}{Status} =
         "NOT Verified (status = ".($response->code).")";
       next URL;
     }
     $URL_LIST{$url}{Status} = "Skipped";
     next URL;
   }
   redo;
 }
 
 for my $url (sort keys %URL_LIST) {
   my $entry = $URL_LIST{$url};  # href
   my $status = $entry->{Status};
   my $base = $entry->{Base};
   print "$url";
   print " (base $base)" if defined $base;
   print ":\n  status: $status\n";
   my $sources = $entry->{Source};
   for my $source (sort keys %$sources) {
     print "  from $source\n";
   }
   my $dests = $entry->{Dest};
   for my $dest (sort keys %$dests) {
     print "  to $dest\n";
   }
 }