Mailing List Archive

svn commit: r330188 - in /spamassassin/trunk: MANIFEST build/automc/buildbot_ready build/automc/run_preflight masses/hit-frequencies masses/plugins/ masses/plugins/01_rule_timing.cf masses/plugins/HitFreqsRuleTiming.pm masses/rule-qa/corpus-hourly
Author: jm
Date: Tue Nov 1 20:16:37 2005
New Revision: 330188

URL: http://svn.apache.org/viewcvs?rev=330188&view=rev
Log:
add rule timing to hit-frequencies, -T switch. Thanks to John Gardiner Myers for the code

Added:
spamassassin/trunk/masses/plugins/
spamassassin/trunk/masses/plugins/01_rule_timing.cf
spamassassin/trunk/masses/plugins/HitFreqsRuleTiming.pm
Modified:
spamassassin/trunk/MANIFEST
spamassassin/trunk/build/automc/buildbot_ready
spamassassin/trunk/build/automc/run_preflight
spamassassin/trunk/masses/hit-frequencies
spamassassin/trunk/masses/rule-qa/corpus-hourly

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/MANIFEST?rev=330188&r1=330187&r2=330188&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Tue Nov 1 20:16:37 2005
@@ -140,6 +140,8 @@
masses/overlap
masses/parse-rules-for-masses
masses/perceptron.c
+masses/plugins/01_rule_timing.cf
+masses/plugins/HitFreqsRuleTiming.pm
masses/post-ga-analysis.pl
masses/remove-ids-from-mclog
masses/rewrite-cf-with-new-scores

Modified: spamassassin/trunk/build/automc/buildbot_ready
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/build/automc/buildbot_ready?rev=330188&r1=330187&r2=330188&view=diff
==============================================================================
--- spamassassin/trunk/build/automc/buildbot_ready (original)
+++ spamassassin/trunk/build/automc/buildbot_ready Tue Nov 1 20:16:37 2005
@@ -84,7 +84,7 @@
chdir("masses") or die;

print "FAST FREQS REPORT:\n\n";
- system ("$perl hit-frequencies -c tstrules -x -p -s 0");
+ system ("$perl hit-frequencies -c tstrules -x -p -T -s 0");

print "\n\nBUILDING SLOW FREQS REPORT:\n\n";


Modified: spamassassin/trunk/build/automc/run_preflight
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/build/automc/run_preflight?rev=330188&r1=330187&r2=330188&view=diff
==============================================================================
--- spamassassin/trunk/build/automc/run_preflight (original)
+++ spamassassin/trunk/build/automc/run_preflight Tue Nov 1 20:16:37 2005
@@ -8,8 +8,6 @@
die "no perl path found in ARGV!";
}

-chdir "masses" or die;
-
my $slavename = "generic";

my $pwd = `pwd`;
@@ -22,11 +20,16 @@
#
system ("renice +19 $$");

-# just the sandbox rules
+# cd to masses
+#
+chdir "masses" or die;
+
+# just the sandbox rules and the timing plugin
#
system ("rm -rf tstrules");
run "mkdir tstrules";
run "cp ../rules/70_sandbox.cf tstrules";
+run "cp plugins/*.* tstrules";

# this is run in a chroot jail, just in case there's hostile
# rule code in there...

Modified: spamassassin/trunk/masses/hit-frequencies
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/hit-frequencies?rev=330188&r1=330187&r2=330188&view=diff
==============================================================================
--- spamassassin/trunk/masses/hit-frequencies (original)
+++ spamassassin/trunk/masses/hit-frequencies Tue Nov 1 20:16:37 2005
@@ -19,16 +19,16 @@
use strict;
use FindBin;
use Getopt::Std;
-getopts("fm:M:X:l:L:pxhc:at:s:io");
+getopts("fm:M:X:l:L:pxhc:at:s:ioT");

use vars qw {
$opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
- $opt_a $opt_t $opt_s $opt_i $sorting $opt_o
+ $opt_a $opt_t $opt_s $opt_i $sorting $opt_o $opt_T
};

sub usage {
die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]
- [-s SC] [-a] [-p] [-x] [-i] [-o] [spam log] [ham log]
+ [-s SC] [-a] [-p] [-x] [-i] [-T] [-o] [spam log] [ham log]

-c p use p as the rules directory
-f falses. count only false-negative or false-positive matches
@@ -43,6 +43,7 @@
-x extended output, with S/O ratio and scores
-s SC which scoreset to use
-i use IG (information gain) for ranking
+ -T display rule times. implies -x, -p
-o display hit overlaps against all other rules

options -l and -L are mutually exclusive.
@@ -80,6 +81,8 @@
my %ranking = ();
my $ok_lang = '';

+my %rule_times = ();
+
readscores($cffile);

$ok_lang = lc ($opt_l || $opt_L || '');
@@ -111,13 +114,29 @@
my $sorting = $opt_i ? "IG" : "RANK";

if ($opt_p) {
- if ($opt_f) {
- printf "%7s %7s %7s %6s %6s %6s %s\n",
- "OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
- } else {
- printf "%7s %7s %7s %6s %6s %6s %s\n",
- "OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
- }
+ printf "%7s %7s %7s %6s %6s %6s %s\n",
+ "MSECS", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%",
+ "S/O", $sorting, "SCORE", "NAME";
+
+ printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
+ 0, $hdr_spam, $hdr_ham,
+ soratio ($num_spam,$num_ham), 0, 0;
+
+ $hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements
+ $hdr_spam = ($num_spam / $hdr_all) * 100.0;
+ $hdr_ham = ($num_ham / $hdr_all) * 100.0;
+ $hdr_all = 100.0; # this is obvious
+
+ printf "%7.5f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n",
+ 0, $hdr_spam, $hdr_ham,
+ soratio ($num_spam,$num_ham), 0, 0;
+
+}
+elsif ($opt_p) {
+ printf "%8s %7s %7s %6s %6s %6s %s\n",
+ "OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%",
+ "S/O", $sorting, "SCORE", "NAME";
+
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
$hdr_all, $hdr_spam, $hdr_ham,
soratio ($num_spam,$num_ham), 0, 0;
@@ -131,7 +150,8 @@
$hdr_all, $hdr_spam, $hdr_ham,
soratio ($num_spam,$num_ham), 0, 0;

-} elsif ($opt_x) {
+}
+elsif ($opt_x) {
printf "%7s %7s %7s %6s %6s %6s %s\n",
"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
@@ -291,6 +311,10 @@
}
}

+if ($opt_T) {
+ read_timings();
+}
+
foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {
next unless (exists $rules{$test}); # only valid tests
next if (!$opt_a && $rules{$test}->{issubrule});
@@ -333,7 +357,13 @@
$soratio{$test} = soratio ($fsadj, $fnadj);
}

- if ($opt_p) {
+ if ($opt_T) {
+ printf "%7.5f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n",
+ $rule_times{$test}||0, $fs, $fn, $soratio, $ranking{$test},
+ $scores{$test}||0,
+ $test;
+
+ } elsif ($opt_p) {
printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n",
$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0, $test;

@@ -570,5 +600,25 @@
} else {
return 0.5; # no results -> not effective
}
+}
+
+# Load per-rule timings from 'timing.log' in the current directory into
+# %rule_times, keyed by rule name.
+#
+# The log must begin with a version line starting with "v1", followed by
+# lines of the form:
+#
+#   T <rulename> <duration> <max> <runs>
+#
+# Lines that do not match that form are ignored.  Each stored value is
+# duration / runs * 1000 (presumably seconds-per-run converted to msecs,
+# matching the MSECS report column -- confirm against the log writer).
+#
+# Takes no arguments.  If the file cannot be opened, is empty, or has an
+# unrecognised version line, a warning is issued and %rule_times is left
+# untouched.
+sub read_timings {
+  my $fh;
+  if (!open ($fh, '<', 'timing.log')) {
+    warn "hit-frequencies: cannot read 'timing.log', timings will be 0: $!";
+    return;
+  }
+
+  # an empty file yields undef here; treat that as an unknown version
+  # rather than matching a regexp against an undefined value
+  my $ver = <$fh>;
+  if (!defined $ver || $ver !~ /^v1/) {
+    warn "hit-frequencies: unknown version in 'timing.log', timings will be 0";
+    close $fh;
+    return;
+  }
+
+  # read into a lexical so the caller's $_ is not clobbered
+  while (my $line = <$fh>) {
+    next unless $line =~ /^T\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/;
+    my ($name, $duration, $runs) = ($1, $2, $4);   # $3 (max) is not used
+
+    # the tiny fallback divisor avoids a division by zero when a rule
+    # is recorded with 0 runs
+    $rule_times{$name} = ($duration / ($runs||0.00001)) * 1000;
+  }
+  close $fh;
+}


Added: spamassassin/trunk/masses/plugins/01_rule_timing.cf
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/plugins/01_rule_timing.cf?rev=330188&view=auto
==============================================================================
--- spamassassin/trunk/masses/plugins/01_rule_timing.cf (added)
+++ spamassassin/trunk/masses/plugins/01_rule_timing.cf Tue Nov 1 20:16:37 2005
@@ -0,0 +1,22 @@
+# config file to load the timing plugin
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+###########################################################################
+
+loadplugin HitFreqsRuleTiming HitFreqsRuleTiming.pm
+

Added: spamassassin/trunk/masses/plugins/HitFreqsRuleTiming.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/plugins/HitFreqsRuleTiming.pm?rev=330188&view=auto
==============================================================================
--- spamassassin/trunk/masses/plugins/HitFreqsRuleTiming.pm (added)
+++ spamassassin/trunk/masses/plugins/HitFreqsRuleTiming.pm Tue Nov 1 20:16:37 2005
@@ -0,0 +1,102 @@
+# HitFreqsRuleTiming - SpamAssassin rule timing plugin
+# (derived from attachment 3055 on bug 4517)
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+package HitFreqsRuleTiming;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Logger;
+use strict;
+use warnings;
+
+use Time::HiRes qw(gettimeofday tv_interval);
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# Constructor.  Builds the plugin via the parent class, then attaches three
+# empty per-rule tables (duration, runs, max) to the object passed in,
+# under its 'rule_timing' key.
+#
+# Arguments: the class name (or an existing instance) and the object that
+# is handed on to the parent constructor.  Returns the blessed plugin.
+sub new {
+  my ($class, $mailsaobject) = @_;
+  $class = ref($class) || $class;
+
+  my $self = $class->SUPER::new($mailsaobject);
+
+  my %tables = (
+    duration => { },
+    runs => { },
+    max => { },
+  );
+  $mailsaobject->{rule_timing} = \%tables;
+
+  return bless ($self, $class);
+}
+
+# Hook that stamps the current time of day, as returned by gettimeofday()
+# in list context, onto the per-message status object found in the options
+# hash.  ran_rule() measures elapsed time from this stamp.
+sub start_rules {
+  my $self    = shift;
+  my $options = shift;
+
+  my @started = gettimeofday();
+  return $options->{permsgstatus}->{RuleTimingStart} = \@started;
+}
+
+# Hook called after a rule has run.  Charges the time elapsed since the
+# stored RuleTimingStart stamp to the rule named in the options hash,
+# updating that rule's run count, accumulated duration and longest single
+# duration on the main object, and then resets the stamp to "now" so the
+# next rule is measured from this point.
+sub ran_rule {
+  # take the timestamp before anything else so that the bookkeeping
+  # below is not charged to the rule
+  my @now = gettimeofday();
+  my ($self, $options) = @_;
+
+  my $permsg = $options->{permsgstatus};
+  my $name = $options->{rulename};
+  my $mailsa = $permsg->{main};
+
+  my $elapsed = tv_interval($permsg->{RuleTimingStart}, \@now);
+  @{$permsg->{RuleTimingStart}} = @now;
+
+  # nothing accumulated for this rule yet: start its totals at zero
+  if (!$mailsa->{rule_timing}{duration}{$name}) {
+    $mailsa->{rule_timing}{duration}{$name} = 0;
+    $mailsa->{rule_timing}{max}{$name} = 0;
+  }
+
+  # TODO: record all runs and compute std dev
+
+  $mailsa->{rule_timing}{runs}{$name}++;
+  $mailsa->{rule_timing}{duration}{$name} += $elapsed;
+  $mailsa->{rule_timing}{max}{$name} = $elapsed
+    if $elapsed > $mailsa->{rule_timing}{max}{$name};
+}
+
+# Shutdown hook: write the accumulated rule timings to 'timing.log' in the
+# current directory, then chain to the parent class's finish().
+#
+# The output is a "v1" version line followed by one line per rule, ordered
+# by accumulated duration (largest first):
+#
+#   T <rulename> <duration> <max> <runs>
+#
+# A failure to open or write the log produces a warning only; the parent
+# finish() is still called, and its result is returned.
+sub finish {
+  my $self = shift;
+  my $mailsa = $self->{main};
+
+  # take refs once to speed up the sort and the formatting loop
+  my $dur_ref = $mailsa->{rule_timing}{duration};
+  my $max_ref = $mailsa->{rule_timing}{max};
+  my $runs_ref = $mailsa->{rule_timing}{runs};
+
+  my $s = '';
+  foreach my $rule (sort {
+      $dur_ref->{$b} <=> $dur_ref->{$a}
+    } keys %{$dur_ref})
+  {
+    $s .= sprintf "T %30s %8.3f %8.3f %4d\n", $rule,
+        $dur_ref->{$rule},
+        $max_ref->{$rule},
+        $runs_ref->{$rule};
+  }
+
+  # skip the writes entirely when the open fails, rather than printing
+  # to (and closing) a handle that was never opened
+  if (open (my $out, '>', 'timing.log')) {
+    print $out "v1\n";            # forward compatibility
+    print $out $s;
+    # buffered write errors surface at close
+    close $out or warn "cannot write to timing.log: $!";
+  }
+  else {
+    warn "cannot write to timing.log: $!";
+  }
+
+  $self->SUPER::finish();
+}
+
+1;

Modified: spamassassin/trunk/masses/rule-qa/corpus-hourly
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/rule-qa/corpus-hourly?rev=330188&r1=330187&r2=330188&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-qa/corpus-hourly (original)
+++ spamassassin/trunk/masses/rule-qa/corpus-hourly Tue Nov 1 20:16:37 2005
@@ -386,14 +386,14 @@
next unless $ham{$user};
system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");
- open(IN, "./hit-frequencies -xpa $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
+ open(IN, "./hit-frequencies -Txpa $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
while(<IN>) {
chomp;
push @output, "$_:$user\n";
}
close(IN);
}
- open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+ open(IN, "./hit-frequencies -Txpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
while(<IN>) {
push @output, $_;
}
@@ -421,7 +421,7 @@
}
# print out by age
chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
- open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+ open(IN, "./hit-frequencies -Txpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
while(<IN>) {
chomp;
push @output, "$_:$which\n";
@@ -436,7 +436,7 @@
system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");

chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
- open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+ open(IN, "./hit-frequencies -Txpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
while(<IN>) { print(OUT); }
close(IN);
}