Programming in Perl By Randal Schwartz Web Techniques, June 1997 Web Techniques grants permission to use these listings for private or commercial use provided that credit to Web Techniques and the author is maintained within the comments of the source. For questions, contact editors@web-techniques.com. Listing One #!/home/merlyn/bin/perl -Tw use strict; use LWP::UserAgent; use HTML::Parser; use URI::URL; ## begin configure my @CHECK = # list of initial starting points qw(http://www.stonehenge.com/index.html); sub PARSE { # verify existence, parse for further URLs ## $_[0] is the absolute URL $_[0] =~ m!^http://www\.stonehenge\.com/! and not $_[0] =~ /refindex|col\d\d\.html|fors/; } sub PING { # verify existence, but don't parse ## $_[0] is the absolute URL $_[0] =~ m!^(http|ftp|gopher)://! and not $_[0] =~ m!perl\.com/CPAN/!; # presume all CPAN refs are good } ## end configure (no user-servicable parts below this line) BEGIN { package ParseLink; @ParseLink::ISA = qw(HTML::Parser); sub set_line { # $instance->set_line(nnn) my $self = shift; $self->{Line} = shift; } ## $self->{Links} = { ## "url" => { "line" => "count", "line" => "count" ... }, ... ## }; sub start { # called by parse my $self = shift; my ($tag, $attr) = @_; my $link; $link = $attr->{href} if $tag eq "a"; $link = $attr->{src} if $tag eq "img"; if (defined $link) { $self->{Links}{$link}{$self->{Line}}++; } } sub get_links { # $instance->get_links() my $self = shift; $self->{Links}; } } # end of ParseLink my $ua = new LWP::UserAgent; $ua->agent("hverify/2.0"); $ua->env_proxy; $| = 1; ## global database my %URL_LIST = (); ## format: ## $URL_LIST{"some url"} = { ## Source => { "where" => "count", "where" => "count", ... }, ## Dest => { "where" => "count", "where" => "count", ... }, ## Base => "base", ## if base != url ## Status => "Whatever", ## undef if not checked yet ## } ## prime the pump for (@CHECK) { $URL_LIST{$_}{Source}{"[requested]"}++; } ## now walk it { my @this_time = grep !defined $URL_LIST{$_}{Status}, keys %URL_LIST; last unless @this_time; URL: for my $url (@this_time) { if (PARSE $url) { ## print "Fetching $url\n"; my $request = new HTTP::Request('GET', $url); my $response = $ua->request($request); # fetch! unless ($response->is_success) { $URL_LIST{$url}{Status} = "NOT Verified (status = ".($response->code).")"; next URL; } unless ($response->content_type =~ /text\/html/i) { $URL_LIST{$url}{Status} = "Verified (content not HTML)"; next URL; } my $base = $response->base; $URL_LIST{$url}{Base} = $base if $base ne $url; my $p = ParseLink->new; { my @content = $response->content =~ /(.*\n?)/g; my $line = 1; { last unless @content; $p->set_line($line); # tell it the line number $p->parse(shift @content); # and parse it $line++; redo; } } $p->parse(undef); # signal the end of parse my $links = $p->get_links; # key is relative url, value is href for my $link (sort keys %$links) { my $abs = url($link, $base)->abs; ## requested url is used for forward relative xref links, ## but actual url after redirection is used for backwards links. my ($forward_rel, $backward_rel) = do { local ($^W) = 0; # workaround for buglet map { $_ || "." } url($abs, $url)->rel, url($base, $abs)->rel; }; my $where = $links->{$link}; # key is line number, val is count for my $line (sort keys %$where) { $URL_LIST{$abs}{Source}{"$backward_rel at line $line"} += $where->{$line}; $URL_LIST{$url}{Dest}{"$forward_rel at line $line"} += $where->{$line}; } } $URL_LIST{$url}{Status} = "Verified (and parsed)"; next URL; } if (PING $url) { ## print "Verifying $url\n"; my $response; for my $method (qw(HEAD GET)) { my $request = new HTTP::Request($method,$url); $response = $ua->request($request); # fetch! if ($response->is_success) { $URL_LIST{$url}{Status} = "Verified (contents not examined)"; next URL; } } $URL_LIST{$url}{Status} = "NOT Verified (status = ".($response->code).")"; next URL; } $URL_LIST{$url}{Status} = "Skipped"; next URL; } redo; } for my $url (sort keys %URL_LIST) { my $entry = $URL_LIST{$url}; # href my $status = $entry->{Status}; my $base = $entry->{Base}; print "$url"; print " (base $base)" if defined $base; print ":\n status: $status\n"; my $sources = $entry->{Source}; for my $source (sort keys %$sources) { print " from $source\n"; } my $dests = $entry->{Dest}; for my $dest (sort keys %$dests) { print " to $dest\n"; } }