1 #!/usr/bin/perl -w 2 use strict; 3 $|=1; 4 5 use LWP::UserAgent; 6 use HTTP::Cookies; 7 use HTTP::Request::Common; 8 use HTML::LinkExtor; 9 10 my %LINKS # subset of %HTML::Tagset::linkElements 11 ( 12 'applet' => ['archive', 'codebase', 'code'], 13 'bgsound' => ['src'], 14 'body' => ['background'], 15 'embed' => ['src'], 16 'frame' => ['src'], 17 'iframe' => ['src'], 18 'ilayer' => ['background'], 19 'img' => ['src', 'lowsrc'], 20 'input' => ['src'], 21 'layer' => ['background', 'src'], 22 ## 'link' => ['href'], ## durn, some of these are stylesheets 23 'script' => ['src'], 24 'table' => ['background'], 25 'td' => ['background'], 26 'th' => ['background'], 27 'tr' => ['background'], 28 ); 29 30 my $ua = LWP::UserAgent->new; 31 $ua->env_proxy; 32 $ua->agent("dltime/1.00 ".$ua->agent); # identify ourselves 33 $ua->cookie_jar(HTTP::Cookies->new); # capture cookies if needed 34 35 report($_) for @ARGV; 36 37 exit 0; 38 39 sub report { 40 my $start = shift; 41 42 my @todo = ["", $start]; 43 my %done; 44 45 while (@todo) { 46 my ($refer, $url) = @{shift @todo}; 47 next if exists $done{$url}; 48 49 my $request = GET $url, [referer => $refer]; 50 my $response = $ua->simple_request($request); 51 52 if ($response->is_success) { 53 $done{$url} = length (my $content = $response->content); 54 55 next if $response->content_type ne "text/html"; 56 57 my $base = $response->base; # relative URLs measured # relative to here 58 my $p = HTML::LinkExtor->new(undef, $base) or die; 59 $p->parse($content); 60 $p->eof; 61 for my $link ($p->links) { 62 my ($tag, %attr) = @$link; 63 if ($LINKS{$tag}) { 64 for (@{$LINKS{$tag}}) { 65 next unless exists $attr{$_}; 66 next unless length (my $a = $attr{$_}); 67 ## print "$base $tag $_ => $a\n"; ## debug 68 push @todo, [$base, $a]; 69 } 70 } 71 } 72 73 } elsif ($response->is_redirect) { 74 $done{$url} = length $response->content; # this counts 75 my $location = $response->header('location') or next; 76 push @todo, [$url, $location]; # but get this too 77 } elsif ($response->is_error) { 78 print "$url ERROR: ", $response->status_line, "\n"; 79 } 80 81 } # end of outer loop 82 83 { 84 my $total = 0; 85 86 print "$start =>\n"; 87 for my $url (sort { $done{$b} <=> $done{$a} } keys %done) { 88 $total += $done{$url}; 89 printf " %10d %s\n", $done{$url}, $url; 90 } 91 printf " %10d TOTAL\n", $total; 92 printf " %10.0f seconds at 28.8\n\n", $total/2000; 93 } 94 95 }