Webmaster's Domain
By Lincoln Stein
Web Techniques, May 1998

Web Techniques grants permission to use these listings for private or 
commercial use provided that credit to Web Techniques and the author is 
maintained within the comments of the source. For questions, contact
editors@web-techniques.com.


#!/usr/local/bin/perl

use Time::ParseDate;
use strict 'vars';
# Two hits from the same host that are 30 minutes or more apart are treated
# as belonging to different sessions, so the gap between them is not averaged.
use constant MAX_INTERVAL => 30 * 60;    # seconds

my $HITS;               # total number of counted hits, all hosts together
my %HITS;               # host => number of counted hits
my %INT_NUMERATOR;      # host => sum of the within-session intervals
my %INT_DENOMINATOR;    # host => how many intervals went into that sum
my %LAST;               # host => time of that host's most recent hit
my %POLITE;             # host => number of requests for /robots.txt

# Read the access log(s) named on the command line (or STDIN) one line at a
# time and accumulate per-host statistics:
#   $HITS / %HITS     total and per-host count of non-image requests
#   %INT_NUMERATOR    per-host sum of intervals between consecutive hits
#   %INT_DENOMINATOR  per-host number of intervals in that sum
#   %LAST             per-host time of the most recent hit
#   %POLITE           per-host number of requests for /robots.txt
while (<>) {
    # Keep the host, the timestamp and the URL.  The two fields after the
    # host and the request method must be present, but their values are
    # not needed, hence the undef placeholders.
    my ($host, undef, undef, $date, undef, $URL)
        = /^(\S+) (\S+) (\S+) \[([^\]]+)\] "(\w+) (\S+).*"/;

    # A line that does not match the expected format would otherwise be
    # tallied as a hit from an empty host name, so skip it.
    next unless defined $host;

    # Requests for images are not counted as hits.
    next if $URL =~ /\.(jpg|jpeg|gif|xbm)$/i;

    $HITS++;
    $HITS{$host}++;

    # parsedate() gives undef for a timestamp it cannot parse.  Such a hit is
    # still counted above, but it contributes no interval and does not
    # overwrite the last known time for the host.
    my $seconds = parsedate($date);
    if (defined $seconds) {
        if (defined $LAST{$host}) {
            my $interval = $seconds - $LAST{$host};
            # Only gaps inside one session go into the mean.  A negative gap
            # means the entries are out of chronological order; averaging it
            # in would distort (or even negate) the mean, so it is ignored.
            if ($interval >= 0 && $interval < MAX_INTERVAL) {
                $INT_NUMERATOR{$host} += $interval;
                $INT_DENOMINATOR{$host}++;
            }
        }
        $LAST{$host} = $seconds;
    }

    $POLITE{$host}++ if $URL eq '/robots.txt';

    # Progress indicator on STDERR after every 1000 counted hits.
    print STDERR $HITS, "\n" if ($HITS % 1000) == 0;
}

# Emit a tab-separated report, one row per client, busiest clients first.
# Columns: client host, whether it ever asked for /robots.txt, hit count,
# mean interval between hits, share of all hits (percent), and the index
# (hit percentage divided by mean interval).
my @columns = qw/Client Robot Hits Interval Hit_Percent Index/;
print join("\t", @columns), "\n";

my @clients_by_hits = sort { $HITS{$b} <=> $HITS{$a} } keys %HITS;
foreach my $client (@clients_by_hits) {
    # Too few hits, or too few measured intervals, to mean much.
    next if $HITS{$client} < 4 || $INT_DENOMINATOR{$client} < 4;

    my $mean_interval = $INT_NUMERATOR{$client} / $INT_DENOMINATOR{$client};
    # A zero mean would make the division below blow up; use a tiny value.
    $mean_interval = 0.0001 unless $mean_interval;

    my $percent_hits = 100 * ($HITS{$client} / $HITS);
    my $index        = $percent_hits / $mean_interval;

    my $robot = 'no';
    $robot = 'yes' if $POLITE{$client} > 0;

    my @row = ($client, $robot, $HITS{$client}, $mean_interval, $percent_hits, $index);
    print join("\t", @row), "\n";
}