1 #!/usr/bin/perl 2 3 use lib "/home/merlyn/CPAN/lib"; 4 5 use LWP::UserAgent; 6 use HTML::Parser; 7 use URI::URL; 8 9 ## begin configure 10 11 @CHECK = # list of initial starting points 12 qw(http://www.teleport.com/~merlyn/); 13 sub PARSE { # verify existance, parse for further URLs 14 ## $_[0] is the absolute URL 15 $_[0] =~ m!^http://www\.(teleport|stonehenge)\.com/~merlyn! and not 16 $_[0] =~ /refindex/; 17 } 18 sub PING { # verify existence, but don't parse 19 ## $_[0] is the absolute URL 20 $_[0] =~ m!^(http|ftp|gopher):!; 21 } 22 23 ## end configure 24 25 { 26 package ParseLink; 27 @ISA = qw(HTML::Parser); 28 29 sub start { # called by parse 30 my $this = shift; 31 my ($tag, $attr) = @_; 32 if ($tag eq "a") { 33 $this->{links}{$attr->{href}}++; 34 } elsif ($tag eq "img") { 35 $this->{links}{$attr->{src}}++; 36 } 37 } 38 39 sub get_links { 40 my $this = shift; 41 sort keys %{$this->{links}}; 42 } 43 } # end of ParseLink 44 45 $ua = new LWP::UserAgent; 46 $ua->agent("hverify/1.0"); 47 $ua->env_proxy; 48 49 $| = 1; 50 51 MAINLOOP: 52 while ($thisurl = shift @CHECK) { 53 $thisurl =~ s/%7e/~/ig; # ugh :-) 54 next if $did{$thisurl}++; 55 if (PARSE $thisurl) { 56 warn "fetching $thisurl\n"; 57 $request = new HTTP::Request('GET',$thisurl); 58 $response = $ua->request($request); # fetch! 59 60 unless ($response->is_success) { 61 warn 62 "Cannot fetch $thisurl (status ", 63 $response->code, " ", $response->message,")\n"; 64 next MAINLOOP; 65 } 66 next MAINLOOP unless $response->content_type =~ /text\/html/i; 67 $base = $response->base; 68 my $p = ParseLink->new; 69 $p->parse($response->content); 70 $p->parse(undef); 71 for $link ($p->get_links) { 72 $abs = url($link, $base)->abs; 73 warn "... $link => $abs\n"; 74 push(@CHECK, $abs); 75 } 76 next MAINLOOP; 77 } 78 if (PING $thisurl) { 79 warn "verifying $thisurl\n"; 80 for $method (qw(HEAD GET)) { 81 $request = new HTTP::Request($method,$thisurl); 82 $response = $ua->request($request); # fetch! 83 next MAINLOOP if $response->is_success; # ok 84 } 85 warn 86 "Cannot fetch $thisurl (status ", 87 $response->code, " ", $response->message,")\n"; 88 next MAINLOOP; 89 } 90 warn "[skipping $thisurl]\n"; 91 }