#!/usr/local/bin/perl

# Scan web server access log lines (from the files named on the command
# line, or from STDIN) and print a tab-separated, per-client report sorted
# by hit count.  For each client the report shows:
#
#   Client       the first whitespace-delimited field of the log line
#   Robot        'yes' if the client ever requested /robots.txt, else 'no'
#   Hits         number of counted (non-image) requests from the client
#   Interval     mean number of seconds between consecutive requests that
#                fall within the same session (see MAX_INTERVAL)
#   Hit_Percent  the client's share of all counted requests, in percent
#   Index        Hit_Percent divided by Interval
#
# A running count of processed hits is written to STDERR every 1000 hits.

use Time::ParseDate;
use strict;
use warnings;

# after 30 minutes, we consider this a new session
use constant MAX_INTERVAL => 60*30;

my (%HITS, %INT_NUMERATOR, %INT_DENOMINATOR, %POLITE, %LAST);
my $HITS = 0;    # total counted hits across all clients

while (my $line = <>) {

    # Pull out host, timestamp and requested URL.  The two fields after the
    # host and the request method are matched but not captured because
    # nothing below uses them.
    my ($host, $date, $URL) =
        $line =~ /^(\S+) \S+ \S+ \[([^\]]+)\] "\w+ (\S+).*"/;

    # Lines that do not look like a log entry must not be counted; otherwise
    # they would be tallied under an empty client name and inflate the total.
    next unless defined $host;

    # Image requests are ignored entirely.
    next if $URL =~ /\.(?:jpg|jpeg|gif|xbm)$/i;

    $HITS++;
    $HITS{$host}++;

    # parsedate returns undef when it cannot understand the date; in that
    # case the hit is still counted but contributes no interval data.
    my $seconds = parsedate($date);
    if (defined $seconds) {
        if (defined $LAST{$host}) {
            my $interval = $seconds - $LAST{$host};

            # Only gaps shorter than a session timeout feed the mean.
            if ($interval < MAX_INTERVAL) {
                $INT_NUMERATOR{$host} += $interval;
                $INT_DENOMINATOR{$host}++;
            }
        }
        $LAST{$host} = $seconds;
    }

    $POLITE{$host}++ if $URL eq '/robots.txt';

    # progress indicator
    print STDERR $HITS, "\n" if ($HITS % 1000) == 0;
}

# print out, sorted by hits
print join("\t", qw/Client Robot Hits Interval Hit_Percent Index/), "\n";

foreach my $client (sort { $HITS{$b} <=> $HITS{$a} } keys %HITS) {

    # not enough total hits to mean much
    next unless $HITS{$client} >= 4;

    # not enough consecutive hits to mean much
    next unless ($INT_DENOMINATOR{$client} || 0) >= 4;

    # A mean of exactly zero is replaced by a tiny value
    # to avoid divide by zero when computing the index.
    my $mean_interval =
        $INT_NUMERATOR{$client} / $INT_DENOMINATOR{$client} || 0.0001;
    my $percent_hits = 100 * ($HITS{$client} / $HITS);
    my $index        = $percent_hits / $mean_interval;

    print join("\t",
               $client,
               $POLITE{$client} ? 'yes' : 'no',
               $HITS{$client},
               $mean_interval,
               $percent_hits,
               $index
              ), "\n";
}