Mailing List Archive

svn commit: r171037 - in /spamassassin/trunk/masses/bayes-testing: README bayes-10pcv-driver bayes-static-thresholds bayes-thresholds draw-bayes-histogram graph-bayes-histogram
Author: jm
Date: Thu May 19 19:30:40 2005
New Revision: 171037

URL: http://svn.apache.org/viewcvs?rev=171037&view=rev
Log:
updated bayes-testing code to work again; add GNUplot graphing script

Added:
spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram (with props)
Modified:
spamassassin/trunk/masses/bayes-testing/README
spamassassin/trunk/masses/bayes-testing/bayes-10pcv-driver
spamassassin/trunk/masses/bayes-testing/bayes-static-thresholds
spamassassin/trunk/masses/bayes-testing/bayes-thresholds
spamassassin/trunk/masses/bayes-testing/draw-bayes-histogram

Modified: spamassassin/trunk/masses/bayes-testing/README
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/README?rev=171037&r1=171036&r2=171037&view=diff
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/README (original)
+++ spamassassin/trunk/masses/bayes-testing/README Thu May 19 19:30:40 2005
@@ -49,6 +49,7 @@

Then split the test corpus into folds:

+ mkdir -p cor/ham cor/spam
$SADIR/tools/split_corpora -n 10 -p cor/ham/bucket ch
$SADIR/tools/split_corpora -n 10 -p cor/spam/bucket cs


Modified: spamassassin/trunk/masses/bayes-testing/bayes-10pcv-driver
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/bayes-10pcv-driver?rev=171037&r1=171036&r2=171037&view=diff
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/bayes-10pcv-driver (original)
+++ spamassassin/trunk/masses/bayes-testing/bayes-10pcv-driver Thu May 19 19:30:40 2005
@@ -28,9 +28,6 @@
# this, since bayes will not be activated without 200 messages in the db,
# and each fold is run using 10% of the corpus -- and 2000/10 = 200.

-# CHANGE ME: the path to the version of SpamAssassin you are testing.
-SADIR=/home/jm/ftp/spamassassin
-
###########################################################################

testdir=`pwd`
@@ -58,12 +55,11 @@
echo "

bayes_path $tmpdir/dbs/bayes
-bayes_use_chi2_combining 1
bayes_auto_learn 0
+bayes_min_ham_num 10
+bayes_min_spam_num 10

" > $tmpdir/rules/30bayes_path.cf
-# bayes_expiry_use_scan_count 0
-# bayes_expiry_scan_count 500
mkdir $tmpdir/dbs

INTERLEAVE_TESTS=0
@@ -89,14 +85,14 @@

(
echo -n "Learning from all ham buckets..." ; date
- time sa-learn --ham --randseed=1 --no-rebuild $learnargs \
+ time sa-learn --ham --randseed=1 --no-sync $learnargs \
--showdots --mbox --config-file=$tmpdir/rules $testdir/cor/ham/*

echo -n "Learning from all spam buckets..." ; date
- time sa-learn --spam --randseed=1 --no-rebuild $learnargs \
+ time sa-learn --spam --randseed=1 --no-sync $learnargs \
--showdots --mbox --config-file=$tmpdir/rules $testdir/cor/spam/*

- time sa-learn --rebuild $learnargs --config-file=$tmpdir/rules
+ time sa-learn --sync $learnargs --config-file=$tmpdir/rules

echo -n "Done learning. " ; date
) 2>&1 | tee $results/learn.log
@@ -151,21 +147,21 @@

else
echo "Learning contents of learn ham bucket..."
- time sa-learn --ham --randseed=1 --no-rebuild $learnargs \
+ time sa-learn --ham --randseed=1 --no-sync $learnargs \
--showdots --mbox --config-file=$tmpdir/rules $rdir/hbucketlearn

echo "Learning contents of learn spam bucket..."
- time sa-learn --spam --randseed=1 --no-rebuild $learnargs \
+ time sa-learn --spam --randseed=1 --no-sync $learnargs \
--showdots --mbox --config-file=$tmpdir/rules $rdir/sbucketlearn

- time sa-learn --rebuild $learnargs --config-file=$tmpdir/rules
+ time sa-learn --sync $learnargs --config-file=$tmpdir/rules

echo "Dumping bayes DB..."
( cd .. ; sa-learn --dump --dbpath=$tmpdir/dbs/bayes ) \
> $rdir/bayes_db.dump
fi

- time sa-learn --rebuild --config-file=$tmpdir/rules
+ time sa-learn --sync --config-file=$tmpdir/rules

if [ $INTERLEAVE_TESTS = 1 ] ; then
# now split the ham and spam test bucket into 10 sub-buckets,

Modified: spamassassin/trunk/masses/bayes-testing/bayes-static-thresholds
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/bayes-static-thresholds?rev=171037&r1=171036&r2=171037&view=diff
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/bayes-static-thresholds (original)
+++ spamassassin/trunk/masses/bayes-testing/bayes-static-thresholds Thu May 19 19:30:40 2005
@@ -9,8 +9,8 @@
my $spam = $ARGV[0] || "spam.log";
my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");

-my $hamcutoff = 0.30;
-my $spamcutoff = 0.70;
+my $hamcutoff = 0.20;
+my $spamcutoff = 0.80;

my $nbuckets = 50;
my $range_lo = 0.00;
@@ -46,7 +46,7 @@
my $isspam = 0; ($file eq $spam) and $isspam = 1;

while (<IN>) {
- /^(\.|Y)\s.+bayes=(\S+)$/ or next;
+ /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
my $score = $2+0;
if ($score == 1) { $score = 0.9999999999999; }


Modified: spamassassin/trunk/masses/bayes-testing/bayes-thresholds
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/bayes-thresholds?rev=171037&r1=171036&r2=171037&view=diff
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/bayes-thresholds (original)
+++ spamassassin/trunk/masses/bayes-testing/bayes-thresholds Thu May 19 19:30:40 2005
@@ -42,7 +42,7 @@
my $isspam = 0; ($file eq $spam) and $isspam = 1;

while (<IN>) {
- /^(\.|Y)\s.+bayes=(\S+)$/ or next;
+ /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
my $score = $2+0;

my $bucket_id;

Modified: spamassassin/trunk/masses/bayes-testing/draw-bayes-histogram
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/draw-bayes-histogram?rev=171037&r1=171036&r2=171037&view=diff
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/draw-bayes-histogram (original)
+++ spamassassin/trunk/masses/bayes-testing/draw-bayes-histogram Thu May 19 19:30:40 2005
@@ -48,7 +48,7 @@
my $isspam = 0; ($file eq $spam) and $isspam = 1;

while (<IN>) {
- /^(\.|Y)\s.+bayes=(\S+)$/ or next;
+ /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
my $score = $2+0;

my $bucket_id;

Added: spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram?rev=171037&view=auto
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram (added)
+++ spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram Thu May 19 19:30:40 2005
@@ -0,0 +1,125 @@
+#!/usr/bin/perl -w
+#
+# Given a 'results' dir from a bayes-10pcv-driver run,
+# graph a histogram of the score ranges using GNUPlot.
+#
+# usage: graph-bayes-histogram [--buckets=100] ...dir/results .../dir2/results ...
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+
+use Getopt::Long;
+use vars qw($opt_buckets);
+
+GetOptions("buckets=i");
+
+# At least one results directory is needed; without one the script would
+# send an empty 'plot' command to gnuplot.
+@ARGV or die
+  "usage: graph-bayes-histogram [--buckets=100] dir/results [dir2/results ...]\n";
+
+# Number of histogram buckets; 0 or unset falls back to the default of 100.
+my $buckets = $opt_buckets || 100;
+
+# "=i" also accepts negative integers.  A negative count would make $step
+# negative, and the bucket loops below would then never terminate.
+$buckets > 0 or die "--buckets must be a positive integer\n";
+
+my $range_lo = 0.0;
+my $range_hi = 1.0;
+
+# Per-bucket counters, keyed by the bucket's lower bound.  These (and
+# @buckets) are package globals because dofile() reads and updates them.
+%bux_sp = ();
+%bux_ns = ();
+
+# Lower bound of every bucket, in ascending order.
+my $step = ($range_hi - $range_lo) / $buckets;
+my $i;
+for ($i = $range_lo; $i <= $range_hi; $i += $step) {
+  push (@buckets, $i);
+}
+
+# dofile() prints one data set per results directory to this handle.
+open(DATA, ">plot.data") or die "cannot write plot.data: $!";
+my $setcount = 0;
+my %tag = ();
+my @dirs = ();
+foreach my $dir (@ARGV) {
+  # Reset the counters for this data set.  This loop must stay identical to
+  # the one that fills @buckets so that the hash keys match its elements.
+  for ($i = $range_lo; $i <= $range_hi; $i += $step) {
+    $bux_ns{$i} = $bux_sp{$i} = 0;
+  }
+
+  dofile($setcount, "$dir/spam_all.log", "$dir/nonspam_all.log");
+  push (@dirs, $dir);
+  $tag{$dir} = $setcount;       # index of this directory's set in plot.data
+  $setcount++;
+}
+# Buffered write errors only show up when the handle is closed.
+close DATA or die "error writing plot.data: $!";
+
+open (OUT, "| gnuplot -") or die "cannot run gnuplot";
+select(OUT);                    # every print below goes to gnuplot
+
+print "
+set xlabel 'P(spam)'
+set ylabel 'Frequency'
+set logscale y 2
+set xrange [0.0:1.01]
+set yrange []
+set xtics 0,0.1,0.99
+set terminal png crop
+set out 'graph.png'
+
+plot ";
+
+# Two curves per directory: column 2 of plot.data is ham, column 3 is spam.
+# $t gives each curve its own line type and point type.
+my @text = ();
+my $t = 0;
+foreach my $dir (@dirs) {
+  my $s = $tag{$dir};
+  $t++; push (@text, " 'plot.data' using 1:2 index $s with linesp lt $t pt $t t 'ham, $dir'");
+  $t++; push (@text, " 'plot.data' using 1:3 index $s with linesp lt $t pt $t t 'spam, $dir'");
+}
+
+print join(", \\\n", @text);
+print "\n";
+
+# Closing a pipe waits for the child and reports its exit status in $?.
+close OUT or die "gnuplot failed: " . ($! ? $! : "exit status $?") . "\n";
+exit;
+
+
+# dofile($setcount, $spam, $nonspam)
+#
+# Reads the log files $spam and $nonspam, counts the "bayes=" score of every
+# matching line into the global %bux_sp / %bux_ns histograms (keyed by the
+# bucket lower bounds listed in the global @buckets), then prints one
+# "xpos ham-count spam-count" line per bucket to the DATA handle, followed
+# by two blank lines.
+#
+#   $setcount  index of this data set; shifts the x positions by
+#              0.001 per set
+#   $spam      path of the spam log
+#   $nonspam   path of the nonspam (ham) log
+#
+# Dies if either log file cannot be opened.
+sub dofile {
+  my ($setcount, $spam, $nonspam) = @_;
+
+  foreach my $file ($spam, $nonspam) {
+    open (my $in, '<', $file) or die "Could not open file '$file': $!";
+
+    my $isspam = ($file eq $spam) ? 1 : 0;
+
+    while (<$in>) {
+      # Only lines starting with "." or "Y" that carry a bayes= value count.
+      /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
+      my $score = $2+0;
+
+      # Find the bucket whose [lower, lower+step) interval holds the score.
+      my $bucket_id;
+      foreach my $bucket (@buckets) {
+        if ($score >= $bucket && $score < $bucket+$step) {
+          $bucket_id = $bucket; last;
+        }
+      }
+
+      # A score outside every bucket would otherwise be counted under an
+      # undefined hash key that is never printed; skip it explicitly.
+      if (!defined $bucket_id) {
+        warn "score $score in '$file' is outside all buckets, ignored\n";
+        next;
+      }
+
+      if ($isspam) {
+        $bux_sp{$bucket_id}++;
+      } else {
+        $bux_ns{$bucket_id}++;
+      }
+    }
+
+    close $in;
+  }
+
+  # Offset each data set slightly along x so that the curves of different
+  # sets are not drawn at exactly the same positions.
+  my $sideoffset = 0.001*$setcount;
+  foreach my $bucket (@buckets) {
+    my $ns = $bux_ns{$bucket};
+    my $sp = $bux_sp{$bucket};
+    my $xpos = $bucket + $sideoffset;
+    print DATA "$xpos $ns $sp\n";
+  }
+  print DATA "\n\n";
+}
+

Propchange: spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram
------------------------------------------------------------------------------
svn:executable = *