#!/usr/local/bin/perl

use Time::ParseDate;
use strict 'vars';
# Two hits from the same host that are this many seconds (30 minutes) or
# more apart are treated as belonging to different sessions, so the gap
# between them is not counted toward that host's mean hit interval.
use constant MAX_INTERVAL => 60*30;

# Tallies built while reading the log and consumed by the report below.
# All hashes are keyed by the first field of the log line (the client host).
#   $HITS             total number of counted (non-image) requests
#   %HITS             counted requests per host
#   %INT_NUMERATOR    sum of the within-session gaps, in seconds, per host
#   %INT_DENOMINATOR  number of within-session gaps per host
#   %POLITE           number of requests for /robots.txt per host
#   %LAST             timestamp of the host's most recent counted request
my (%HITS,%INT_NUMERATOR,%INT_DENOMINATOR,%POLITE,%LAST,$HITS);

# Read access-log lines from the files named on the command line (or STDIN).
# Each line is expected to look like:
#   host field field [date] "METHOD url ..."
# (this appears to be the Common Log Format -- confirm against the server).
while (<>) {
    # The second and third fields and the request method are matched but
    # not needed, so they are discarded with undef placeholders.
    my ($host, undef, undef, $date, undef, $URL)
        = /^(\S+) (\S+) (\S+) \[([^\]]+)\] "(\w+) (\S+).*"/;

    # A line that does not match leaves every field undefined; skip it
    # rather than counting it as a hit from an empty-named host.
    next unless defined $host;

    # Requests for inline images are not counted as hits.
    next if $URL=~/\.(jpg|jpeg|gif|xbm)$/i;

    $HITS++;
    $HITS{$host}++;

    # parsedate is expected to return undef for a date it cannot parse
    # (see the Time::ParseDate documentation).  In that case the hit is
    # still counted, but it takes no part in the interval bookkeeping so
    # that a bogus timestamp cannot distort the host's mean interval.
    my $seconds = parsedate($date);
    if (defined $seconds) {
        if (defined $LAST{$host}) {
            my $interval = $seconds - $LAST{$host};
            if ($interval < MAX_INTERVAL) {
                $INT_NUMERATOR{$host} += $interval;
                $INT_DENOMINATOR{$host}++;
            }
        }
        $LAST{$host} = $seconds;
    }

    $POLITE{$host}++ if $URL eq '/robots.txt';

    # Progress indicator: print the running total every 1000 counted hits.
    print STDERR $HITS,"\n" if ($HITS % 1000) == 0;
}

# Report: a header row, then one tab-separated row per client, ordered
# from the most hits to the fewest.
print join("\t", qw/Client Robot Hits Interval Hit_Percent Index/), "\n";

for my $client (sort { $HITS{$b} <=> $HITS{$a} } keys %HITS) {
    my $hits      = $HITS{$client};
    my $gap_count = $INT_DENOMINATOR{$client};

    # Clients with fewer than 4 hits, or fewer than 4 measured gaps
    # between consecutive hits, give too little data to mean much.
    next if $hits < 4;
    next if $gap_count < 4;

    # Mean gap between consecutive hits.  A mean of exactly zero is
    # replaced by 0.0001 so the division for the index cannot fail.
    my $mean_interval = ($INT_NUMERATOR{$client} / $gap_count) || 0.0001;

    # This client's share of all counted hits, as a percentage.
    my $percent_hits = 100 * ($hits / $HITS);

    # Share of traffic divided by mean gap: larger for clients that
    # account for more hits and request more rapidly.
    my $index = $percent_hits / $mean_interval;

    # "Robot" column: 'yes' when the client requested /robots.txt.
    my $robot = $POLITE{$client} > 0 ? 'yes' : 'no';

    print join("\t", $client, $robot, $hits,
               $mean_interval, $percent_hits, $index), "\n";
}