Programming in Perl
By Randal Schwartz
Web Techniques, June 1997

Web Techniques grants permission to use these listings for private or 
commercial use provided that credit to Web Techniques and the author is 
maintained within the comments of the source. For questions, contact
editors@web-techniques.com.



Listing One 


 #!/home/merlyn/bin/perl -Tw
 use strict;
 
 use LWP::UserAgent;
 use HTML::Parser;
 use URI::URL;
 
 ## begin configure
 
 my @CHECK =                     # list of initial starting points
   qw(http://www.stonehenge.com/index.html);
 sub PARSE {                     # verify existence, parse for further URLs
   ## $_[0] is the absolute URL
   $_[0] =~ m!^http://www\.stonehenge\.com/! and not
     $_[0] =~ /refindex|col\d\d\.html|fors/;
 }
 sub PING {                      # verify existence, but don't parse
   ## $_[0] is the absolute URL
   $_[0] =~ m!^(http|ftp|gopher)://! and not
     $_[0] =~ m!perl\.com/CPAN/!; # presume all CPAN refs are good
 }
 
 ## end configure (no user-servicable parts below this line)
 
 BEGIN {
   package ParseLink;
 
   @ParseLink::ISA = qw(HTML::Parser);
 
   sub set_line {                # $instance->set_line(nnn)
     my $self = shift;
     $self->{Line} = shift;
   }
 
   ## $self->{Links} = {
   ##    "url" => { "line" => "count", "line" => "count" ... }, ...
   ## };
   sub start {                   # called by parse
     my $self = shift;
     my ($tag, $attr) = @_;
     my $link;
     $link = $attr->{href} if $tag eq "a";
     $link = $attr->{src} if $tag eq "img";
     if (defined $link) {
       $self->{Links}{$link}{$self->{Line}}++;
     }
   }
 
   sub get_links {               # $instance->get_links()
     my $self = shift;
     $self->{Links};
   }
 }                               # end of ParseLink
 
 my $ua = new LWP::UserAgent;
 $ua->agent("hverify/2.0");
 $ua->env_proxy;
 
 $| = 1;
 
 ## global database
 my %URL_LIST = ();
 ## format:
 ## $URL_LIST{"some url"} = {
 ##   Source => { "where" => "count", "where" => "count", ... },
 ##   Dest => { "where" => "count", "where" => "count", ... },
 ##   Base => "base", ## if base != url
 ##   Status => "Whatever",  ## undef if not checked yet
 ## }
 
 ## prime the pump
 for (@CHECK) {
   $URL_LIST{$_}{Source}{"[requested]"}++;
 }
 
 ## now walk it
 
 {
   my @this_time = grep !defined $URL_LIST{$_}{Status}, keys %URL_LIST;
   last unless @this_time;
  URL:
   for my $url (@this_time) {
     if (PARSE $url) {
       ## print "Fetching $url\n";
       my $request = new HTTP::Request('GET', $url);
       my $response = $ua->request($request); # fetch!
       unless ($response->is_success) {
         $URL_LIST{$url}{Status} =
           "NOT Verified (status = ".($response->code).")";
         next URL;
       }
       unless ($response->content_type =~ /text\/html/i) {
         $URL_LIST{$url}{Status} = "Verified (content not HTML)";
         next URL;
       }
       my $base = $response->base;
       $URL_LIST{$url}{Base} = $base if $base ne $url;
       my $p = ParseLink->new;
       {
         my @content = $response->content =~ /(.*\n?)/g;
         my $line = 1;
         {
           last unless @content;
           $p->set_line($line);  # tell it the line number
           $p->parse(shift @content); # and parse it
           $line++;
           redo;
         }
       }
       $p->parse(undef);         # signal the end of parse
       my $links = $p->get_links; # key is relative url, value is href
       for my $link (sort keys %$links) {
         my $abs = url($link, $base)->abs;
         ## requested url is used for forward relative xref links,
         ## but actual url after redirection is used for backwards links.
         my ($forward_rel, $backward_rel) = do {
           local ($^W) = 0;      # workaround for buglet
           map { $_ || "." } url($abs, $url)->rel, url($base, $abs)->rel;
         };
         my $where = $links->{$link}; # key is line number, val is count
         for my $line (sort keys %$where) {
           $URL_LIST{$abs}{Source}{"$backward_rel at line $line"} +=
             $where->{$line};
           $URL_LIST{$url}{Dest}{"$forward_rel at line $line"} +=
             $where->{$line};
         }
       }
       $URL_LIST{$url}{Status} = "Verified (and parsed)";
       next URL;
     }
     if (PING $url) {
       ## print "Verifying $url\n";
       my $response;
       for my $method (qw(HEAD GET)) {
         my $request = new HTTP::Request($method,$url);
         $response = $ua->request($request); # fetch!
         if ($response->is_success) {
           $URL_LIST{$url}{Status} = "Verified (contents not examined)";
           next URL;
         }
       }
       $URL_LIST{$url}{Status} =
         "NOT Verified (status = ".($response->code).")";
       next URL;
     }
     $URL_LIST{$url}{Status} = "Skipped";
     next URL;
   }
   redo;
 }
 
 for my $url (sort keys %URL_LIST) {
   my $entry = $URL_LIST{$url};  # href
   my $status = $entry->{Status};
   my $base = $entry->{Base};
   print "$url";
   print " (base $base)" if defined $base;
   print ":\n  status: $status\n";
   my $sources = $entry->{Source};
   for my $source (sort keys %$sources) {
     print "  from $source\n";
   }
   my $dests = $entry->{Dest};
   for my $dest (sort keys %$dests) {
     print "  to $dest\n";
   }
 }