Mailing List Archive

svn commit: rev 36119 - spamassassin/trunk/masses
Author: hstern
Date: Mon Aug 9 10:17:47 2004
New Revision: 36119

Added:
spamassassin/trunk/masses/model-statistics (contents, props changed)
Modified:
spamassassin/trunk/masses/validate-model
Log:
* validate-model
* model-statistics
Added a display of some statistics about the generated model (mean error
rates, etc.)



Added: spamassassin/trunk/masses/model-statistics
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/model-statistics Mon Aug 9 10:17:47 2004
@@ -0,0 +1,63 @@
+#!/usr/bin/perl
+
+# This script is used to print some statistics about classification accuracy
+# with a k-fold cross validation.  Usage: model-statistics <validate-file>
+use strict;
+
+my $lambda = 50; # desired lambda for TCR calculation
+
+if ( scalar(@ARGV) < 1 ) {
+  print STDERR "Usage: model-statistics [validate]\n";
+  exit 1;
+}
+
+my (@fp1, @fn1, @tcr1);
+# Three-arg open with a lexical handle, so the name is never parsed as a mode.
+open (my $fh, '<', $ARGV[0]) or die "Cannot open $ARGV[0]: $!";
+while (my $line = <$fh>) {
+  my @x = split(' ', $line); # four counts per line; ' ' ignores leading blanks
+  next if @x < 4;            # blank or truncated line: nothing to divide
+  push (@fp1, $x[2] / ($x[0] + $x[2]));
+  push (@fn1, $x[3] / ($x[1] + $x[3]));
+  push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
+}
+close ($fh);
+
+stat_analysis ("False positives", "pct", \@fp1);
+stat_analysis ("False negatives", "pct", \@fn1);
+stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1);
+
+# Print the mean and sample standard deviation of one per-fold metric.
+#   $title - label printed in front of the numbers
+#   $pct   - "pct" prints both values times 100 (mean with a % sign)
+#   $s1    - reference to the array of samples
+sub stat_analysis {
+  my ($title, $pct, $s1) = @_;
+
+  # Number of samples; nothing can be averaged when there are none.
+  my $dof = scalar(@{$s1});
+  if ( $dof < 1 ) {
+    print STDERR "$title: no samples\n";
+    return;
+  }
+
+  # mean = 1/n * sum(s[i]).  Iterate over the elements themselves: indexing
+  # 1..n would skip s[0] and read one slot past the end of the array.
+  my $mean_s1 = 0;
+  $mean_s1 += $_ foreach @{$s1};
+  $mean_s1 /= $dof;
+
+  # var = 1/(n-1) * sum((mean - s[i])^2); reported as 0 for a single sample.
+  my $var_s1 = 0;
+  $var_s1 += ($mean_s1 - $_)**2 foreach @{$s1};
+  $var_s1 = $dof > 1 ? $var_s1 / ($dof - 1) : 0;
+  # std = sqrt(var)
+  my $std_s1 = sqrt($var_s1);
+
+  # SA developers like percentage points instead of probabilities.
+  if ( $pct eq "pct" ) {
+    printf "%s: mean=%0.4f%% std=%0.4f\n",$title,100*$mean_s1,100*$std_s1;
+  } else {
+    printf "%s: mean=%0.4f std=%0.4f\n",$title,$mean_s1,$std_s1;
+  }
+}

Modified: spamassassin/trunk/masses/validate-model
==============================================================================
--- spamassassin/trunk/masses/validate-model (original)
+++ spamassassin/trunk/masses/validate-model Mon Aug 9 10:17:47 2004
@@ -123,4 +123,6 @@

./extract-results $LOGDIR/validate.* > $LOGDIR/validate

+./model-statistics $LOGDIR/validate
+
exit 0