Mailing List Archive

svn commit: r160803 - in spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/Bayes.pm lib/Mail/SpamAssassin/Bayes/ lib/Mail/SpamAssassin/Bayes/CombineChi.pm lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm lib/Mail/SpamAssassin/Conf.pm
Author: jm
Date: Sun Apr 10 13:44:04 2005
New Revision: 160803

URL: http://svn.apache.org/viewcvs?view=rev&rev=160803
Log:
bug 3842: inactivate support for naive-Bayes probability combining, by abstracting into a new separate implementation class

Added:
spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/
spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineChi.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm
Modified:
spamassassin/trunk/MANIFEST
spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/MANIFEST?view=diff&r1=160802&r2=160803
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Sun Apr 10 13:44:04 2005
@@ -28,6 +28,8 @@
lib/Mail/SpamAssassin/ArchiveIterator.pm
lib/Mail/SpamAssassin/AutoWhitelist.pm
lib/Mail/SpamAssassin/Bayes.pm
+lib/Mail/SpamAssassin/Bayes/CombineChi.pm
+lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm
lib/Mail/SpamAssassin/BayesStore.pm
lib/Mail/SpamAssassin/BayesStore/DBM.pm
lib/Mail/SpamAssassin/BayesStore/MySQL.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm?view=diff&r1=160802&r2=160803
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm Sun Apr 10 13:44:04 2005
@@ -56,6 +56,11 @@

use Mail::SpamAssassin;
use Mail::SpamAssassin::PerMsgStatus;
+
+# pick ONLY ONE of these combining implementations.
+use Mail::SpamAssassin::Bayes::CombineChi;
+# use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
+
use Digest::SHA1 qw(sha1 sha1_hex);

use vars qw{
@@ -206,23 +211,6 @@
# into the <0.5 range for nonspam and >0.5 for spam.
use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;

-# Value for 'x' in the f(w) equation.
-# "Let x = the number used when n [hits] is 0."
-use constant CHI_ROBINSON_X_CONSTANT => 0.538;
-use constant GARY_ROBINSON_X_CONSTANT => 0.600;
-
-# Value for 's' in the f(w) equation. "We can see s as the "strength" (hence
-# the use of "s") of an original assumed expectation ... relative to how
-# strongly we want to consider our actual collected data." Low 's' means
-# trust collected data more strongly.
-use constant CHI_ROBINSON_S_CONSTANT => 0.100;
-use constant GARY_ROBINSON_S_CONSTANT => 0.160;
-
-# Should we ignore tokens with probs very close to the middle ground (.5)?
-# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used.
-use constant CHI_ROBINSON_MIN_PROB_STRENGTH => 0.346;
-use constant GARY_ROBINSON_MIN_PROB_STRENGTH => 0.430;
-
# How many of the most significant tokens should we use for the p(w)
# calculation?
use constant N_SIGNIFICANT_TOKENS => 150;
@@ -270,8 +258,6 @@
$self;
}

-###########################################################################
-
sub finish {
my $self = shift;
#if (!$self->{conf}->{use_bayes}) { return; }
@@ -282,6 +268,8 @@
$self->{store}->untie_db();
}

+sub sa_die { Mail::SpamAssassin::sa_die(@_); }
+
###########################################################################

sub sanity_check_is_untied {
@@ -306,25 +294,6 @@
# use of hapaxes. Set on bayes object, since it controls prob
# computation.
$self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
-
- # Use chi-squared combining instead of Gary-combining (Robinson/Graham-style
- # naive-Bayesian)?
- $self->{use_chi_sq_combining} = $self->{conf}->{bayes_use_chi2_combining};
-
- # Use the appropriate set of constants; the different systems have different
- # optimum settings for these. (TODO: should these be exposed through Conf?)
- if ($self->{use_chi_sq_combining}) {
- $self->{robinson_x_constant} = CHI_ROBINSON_X_CONSTANT;
- $self->{robinson_s_constant} = CHI_ROBINSON_S_CONSTANT;
- $self->{robinson_min_prob_strength} = CHI_ROBINSON_MIN_PROB_STRENGTH;
- } else {
- $self->{robinson_x_constant} = GARY_ROBINSON_X_CONSTANT;
- $self->{robinson_s_constant} = GARY_ROBINSON_S_CONSTANT;
- $self->{robinson_min_prob_strength} = GARY_ROBINSON_MIN_PROB_STRENGTH;
- }
-
- $self->{robinson_s_times_x} =
- ($self->{robinson_x_constant} * $self->{robinson_s_constant});
}

###########################################################################
@@ -1089,9 +1058,9 @@
# use Robinson's f(x) equation for low-n tokens, instead of just
# ignoring them
my $robn = $s+$n;
- $prob = ($self->{robinson_s_times_x} + ($robn * $prob))
+ $prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
/
- ($self->{robinson_s_constant} + $robn);
+ ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
}

if ($self->{log_raw_counts}) {
@@ -1121,16 +1090,17 @@
if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}

return 0 if $Ns == 0 || $Nn == 0;
- return 0 if abs( $prob - 0.5 ) < $self->{robinson_min_prob_strength};
+ return 0 if abs( $prob - 0.5 ) <
+ $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;

my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
- my $p = 0.5 - $self->{robinson_min_prob_strength};
+ my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;

return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;

- my $s = $self->{robinson_s_constant};
- my $sx = $self->{robinson_s_times_x};
+ my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
+ my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
my $a = $Nb * ( 1 - $p );
my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
@@ -1261,7 +1231,8 @@
{
if ($count-- < 0) { last; }
my $pw = $pw{$_}->{prob};
- next if (abs($pw - 0.5) < $self->{robinson_min_prob_strength});
+ next if (abs($pw - 0.5) <
+ $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);

# What's more expensive, scanning headers for HAMMYTOKENS and
# SPAMMYTOKENS tags that aren't there or collecting data that
@@ -1289,11 +1260,7 @@
goto skip;
}

- if ($self->{use_chi_sq_combining}) {
- $score = chi_squared_probs_combine ($ns, $nn, @sorted);
- } else {
- $score = robinson_naive_bayes_probs_combine (@sorted);
- }
+ $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);

# Couldn't come up with a probability?
goto skip unless defined $score;
@@ -1391,92 +1358,6 @@
}

return;
-}
-
-###########################################################################
-
-sub sa_die { Mail::SpamAssassin::sa_die(@_); }
-
-###########################################################################
-
-sub robinson_naive_bayes_probs_combine {
- my (@sorted) = @_;
-
- my $wc = scalar @sorted;
- return unless $wc;
-
- my $P = 1;
- my $Q = 1;
-
- foreach my $pw (@sorted) {
- $P *= (1-$pw);
- $Q *= $pw;
- }
- $P = 1 - ($P ** (1 / $wc));
- $Q = 1 - ($Q ** (1 / $wc));
- return (1 + ($P - $Q) / ($P + $Q)) / 2.0;
-}
-
-###########################################################################
-
-# Chi-squared function
-sub chi2q {
- my ($x2, $v) = @_;
-
- die "bayes: v must be even in chi2q(x2, v)" if $v & 1;
- my $m = $x2 / 2.0;
- my ($sum, $term);
- $sum = $term = exp(0 - $m);
- for my $i (1 .. (($v/2)-1)) {
- $term *= $m / $i;
- $sum += $term;
- }
- return $sum < 1.0 ? $sum : 1.0;
-}
-
-# Chi-Squared method. Produces mostly boolean $result,
-# but with a grey area.
-sub chi_squared_probs_combine {
- my ($ns, $nn, @sorted) = @_;
- # @sorted contains an array of the probabilities
- my $wc = scalar @sorted;
- return unless $wc;
-
- my ($H, $S);
- my ($Hexp, $Sexp);
- $Hexp = $Sexp = 0;
-
- # see bug 3118
- my $totmsgs = ($ns + $nn);
- if ($totmsgs == 0) { return; }
- $S = ($ns / $totmsgs);
- $H = ($nn / $totmsgs);
-
- use POSIX qw(frexp);
-
- foreach my $prob (@sorted) {
- $S *= 1.0 - $prob;
- $H *= $prob;
- if ($S < 1e-200) {
- my $e;
- ($S, $e) = frexp($S);
- $Sexp += $e;
- }
- if ($H < 1e-200) {
- my $e;
- ($H, $e) = frexp($H);
- $Hexp += $e;
- }
- }
-
- use constant LN2 => log(2);
-
- $S = log($S) + $Sexp * LN2;
- $H = log($H) + $Hexp * LN2;
-
- $S = 1.0 - chi2q(-2.0 * $S, 2 * $wc);
- $H = 1.0 - chi2q(-2.0 * $H, 2 * $wc);
- return (($S - $H) + 1.0) / 2.0;
}

###########################################################################

Added: spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineChi.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineChi.pm?view=auto&rev=160803
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineChi.pm (added)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineChi.pm Sun Apr 10 13:44:04 2005
@@ -0,0 +1,120 @@
+# Chi-square probability combining and related constants.
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+# this package is a no-op; the real impl code is in another pkg.
+package Mail::SpamAssassin::Bayes::CombineChi; 1;
+
+# Force into another package, so our symbols will appear in that namespace with
+# no indirection, for speed. Other combiners must do the same, since Bayes.pm
+# uses this namespace directly. This means only one combiner can be loaded at
+# any time.
+package Mail::SpamAssassin::Bayes::Combine;
+
+use strict;
+use warnings;
+use bytes;
+
+use POSIX qw(frexp);
+use constant LN2 => log(2);
+
+# Value for 'x' in Gary Robinson's f(w) equation.
+# "Let x = the number used when n [hits] is 0."
+our $FW_X_CONSTANT = 0.538;
+
+# Value for 's' in the f(w) equation. "We can see s as the "strength" (hence
+# the use of "s") of an original assumed expectation ... relative to how
+# strongly we want to consider our actual collected data." Low 's' means
+# trust collected data more strongly.
+our $FW_S_CONSTANT = 0.100;
+
+# (s . x) for the f(w) equation.
+our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT);
+
+# Should we ignore tokens with probs very close to the middle ground (.5)?
+# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used.
+our $MIN_PROB_STRENGTH = 0.346;
+
+###########################################################################
+
+# Chi-Squared method. Produces mostly boolean $result,
+# but with a grey area.
+sub combine {
+ my ($ns, $nn, $sortedref) = @_;
+
+ # @$sortedref contains an array of the probabilities
+ my $wc = scalar @$sortedref;
+ return unless $wc;
+
+ my ($H, $S);
+ my ($Hexp, $Sexp);
+ $Hexp = $Sexp = 0;
+
+ # see bug 3118
+ my $totmsgs = ($ns + $nn);
+ if ($totmsgs == 0) { return; }
+ $S = ($ns / $totmsgs);
+ $H = ($nn / $totmsgs);
+
+ foreach my $prob (@$sortedref) {
+ $S *= 1.0 - $prob;
+ $H *= $prob;
+ if ($S < 1e-200) {
+ my $e;
+ ($S, $e) = frexp($S);
+ $Sexp += $e;
+ }
+ if ($H < 1e-200) {
+ my $e;
+ ($H, $e) = frexp($H);
+ $Hexp += $e;
+ }
+ }
+
+ $S = log($S) + $Sexp * LN2;
+ $H = log($H) + $Hexp * LN2;
+
+ # note: previous versions used (2 * $wc) as second arg ($v), but the chi2q()
+ # fn then just used ($v/2) internally! changed to simply supply $wc as
+ # ($halfv) directly instead to avoid redundant doubling and halving. The
+ # side-effect is that chi2q() uses a different API now, but it's only used
+ # here anyway.
+
+ $S = 1.0 - chi2q(-2.0 * $S, $wc);
+ $H = 1.0 - chi2q(-2.0 * $H, $wc);
+ return (($S - $H) + 1.0) / 2.0;
+}
+
+# Chi-squared function (API changed; see comment above)
+sub chi2q {
+ my ($x2, $halfv) = @_;
+
+ my $m = $x2 / 2.0;
+ my ($sum, $term);
+ $sum = $term = exp(0 - $m);
+
+ # replace 'for my $i (1 .. (($v/2)-1))' idiom, which creates a temp
+ # array, with a plain C-style for loop
+ my $i;
+ for ($i = 1; $i < $halfv; $i++) {
+ $term *= $m / $i;
+ $sum += $term;
+ }
+ return $sum < 1.0 ? $sum : 1.0;
+}
+
+1;

Added: spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm?view=auto&rev=160803
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm (added)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm Sun Apr 10 13:44:04 2005
@@ -0,0 +1,73 @@
+# Naive-Bayesian-style probability combining and related constants.
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+# this package is a no-op; the real impl code is in another pkg.
+package Mail::SpamAssassin::Bayes::CombineNaiveBayes; 1;
+
+# Force into another package, so our symbols will appear in that namespace with
+# no indirection, for speed. Other combiners must do the same, since Bayes.pm
+# uses this namespace directly. This means only one combiner can be loaded at
+# any time.
+package Mail::SpamAssassin::Bayes::Combine;
+
+use strict;
+use warnings;
+use bytes;
+
+###########################################################################
+
+# Value for 'x' in Gary Robinson's f(w) equation.
+# "Let x = the number used when n [hits] is 0."
+our $FW_X_CONSTANT = 0.600;
+
+# Value for 's' in the f(w) equation. "We can see s as the "strength" (hence
+# the use of "s") of an original assumed expectation ... relative to how
+# strongly we want to consider our actual collected data." Low 's' means
+# trust collected data more strongly.
+our $FW_S_CONSTANT = 0.160;
+
+# (s . x) for the f(w) equation.
+our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT);
+
+# Should we ignore tokens with probs very close to the middle ground (.5)?
+# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used.
+our $MIN_PROB_STRENGTH = 0.430;
+
+###########################################################################
+
+# Combine probabilities using Gary Robinson's naive-Bayesian-style
+# combiner
+sub combine {
+ my ($ns, $nn, $sortedref) = @_;
+
+ my $wc = scalar @$sortedref;
+ return unless $wc;
+
+ my $P = 1;
+ my $Q = 1;
+
+ foreach my $pw (@$sortedref) {
+ $P *= (1-$pw);
+ $Q *= $pw;
+ }
+ $P = 1 - ($P ** (1 / $wc));
+ $Q = 1 - ($Q ** (1 / $wc));
+ return (1 + ($P - $Q) / ($P + $Q)) / 2.0;
+}
+
+1;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?view=diff&r1=160802&r2=160803
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Sun Apr 10 13:44:04 2005
@@ -1208,21 +1208,6 @@
type => $CONF_TYPE_BOOL
});

-=item bayes_use_chi2_combining (default: 1)
-
-Should the Bayesian classifier use chi-squared combining, instead of
-Robinson/Graham-style naive Bayesian combining? Chi-squared produces
-more 'extreme' output results, but may be more resistant to changes
-in corpus size etc.
-
-=cut
-
- push (@cmds, {
- setting => 'bayes_use_chi2_combining',
- default => 1,
- type => $CONF_TYPE_BOOL
- });
-
=item bayes_journal_max_size (default: 102400)

SpamAssassin will opportunistically sync the journal and the database.