1     #!/usr/bin/perl
2     
3     use lib "/home/merlyn/CPAN/lib";
4     
5     use LWP::UserAgent;
6     use HTML::Parser;
7     use URI::URL;
8     
9     ## begin configure
10    
11    @CHECK =                        # list of initial starting points
12      qw(http://www.teleport.com/~merlyn/);
13    sub PARSE {                     # verify existance, parse for further URLs
14      ## $_[0] is the absolute URL
15      $_[0] =~ m!^http://www\.(teleport|stonehenge)\.com/~merlyn! and not
16        $_[0] =~ /refindex/;
17    }
18    sub PING {                      # verify existence, but don't parse
19      ## $_[0] is the absolute URL
20      $_[0] =~ m!^(http|ftp|gopher):!;
21    }
22    
23    ## end configure
24    
25    {
26      package ParseLink;
27      @ISA = qw(HTML::Parser);
28    
29      sub start {                   # called by parse
30        my $this = shift;
31        my ($tag, $attr) = @_;
32        if ($tag eq "a") {
33          $this->{links}{$attr->{href}}++;
34        } elsif ($tag eq "img") {
35          $this->{links}{$attr->{src}}++;
36        }
37      }
38    
39      sub get_links {
40        my $this = shift;
41        sort keys %{$this->{links}};
42      }
43    }                               # end of ParseLink
44    
45    $ua = new LWP::UserAgent;
46    $ua->agent("hverify/1.0");
47    $ua->env_proxy;
48    
49    $| = 1;
50    
51    MAINLOOP:
52      while ($thisurl = shift @CHECK) {
53        $thisurl =~ s/%7e/~/ig;     # ugh :-)
54        next if $did{$thisurl}++;
55        if (PARSE $thisurl) {
56          warn "fetching $thisurl\n";
57          $request = new HTTP::Request('GET',$thisurl);
58          $response = $ua->request($request); # fetch!
59          
60          unless ($response->is_success) {
61            warn
62              "Cannot fetch $thisurl (status ",
63              $response->code, " ", $response->message,")\n";
64            next MAINLOOP;
65          }
66          next MAINLOOP unless $response->content_type =~ /text\/html/i;
67          $base = $response->base;
68          my $p = ParseLink->new;
69          $p->parse($response->content);
70          $p->parse(undef);
71          for $link ($p->get_links) {
72            $abs = url($link, $base)->abs;
73            warn "... $link => $abs\n";
74            push(@CHECK, $abs);
75          }
76          next MAINLOOP;
77        }
78        if (PING $thisurl) {
79          warn "verifying $thisurl\n";
80          for $method (qw(HEAD GET)) {
81            $request = new HTTP::Request($method,$thisurl);
82            $response = $ua->request($request); # fetch!
83            next MAINLOOP if $response->is_success; # ok
84          }
85          warn
86            "Cannot fetch $thisurl (status ",
87            $response->code, " ", $response->message,")\n";
88          next MAINLOOP;
89        }
90        warn "[skipping $thisurl]\n";
91      }