Mailing List Archive

svn commit: rev 6353 - in incubator/spamassassin/trunk: . lib/Mail/SpamAssassin rules sql t tools
Author: jm
Date: Thu Jan 29 18:54:33 2004
New Revision: 6353

Added:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreDBM.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreSQL.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm
incubator/spamassassin/trunk/sql/README.awl
incubator/spamassassin/trunk/sql/README.bayes
incubator/spamassassin/trunk/sql/awl_mysql.sql
incubator/spamassassin/trunk/sql/bayes_mysql.sql
incubator/spamassassin/trunk/sql/bayes_pg.sql
incubator/spamassassin/trunk/sql/bayes_sqlite.sql
incubator/spamassassin/trunk/t/bayesdbm.t (contents, props changed)
incubator/spamassassin/trunk/t/bayessql.t (contents, props changed)
incubator/spamassassin/trunk/t/sql_based_whitelist.t (contents, props changed)
incubator/spamassassin/trunk/tools/convert_awl_dbm_to_sql
incubator/spamassassin/trunk/tools/convert_bayes_dbm_to_sql
Modified:
incubator/spamassassin/trunk/INSTALL
incubator/spamassassin/trunk/MANIFEST
incubator/spamassassin/trunk/MANIFEST.SKIP
incubator/spamassassin/trunk/Makefile.PL
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
incubator/spamassassin/trunk/sql/README
incubator/spamassassin/trunk/t/SATest.pm
Log:
bug 195: SQL support for AWL and Bayes storage, thanks to Michael Parker

Modified: incubator/spamassassin/trunk/INSTALL
==============================================================================
--- incubator/spamassassin/trunk/INSTALL (original)
+++ incubator/spamassassin/trunk/INSTALL Thu Jan 29 18:54:33 2004
@@ -400,8 +400,15 @@
- Time::HiRes (from CPAN)

If this module is installed, the processing times are logged more
- exactly by spamd.
+ precisely by spamd.

+
+ - DBI *and* DBD driver/modules for your database (from CPAN)
+
+ If you intend to use SpamAssassin with an SQL database backend for
+ user configuration data, Bayes storage, or AWL storage, you will need
+ to have these installed; both the basic DBI module and the driver for
+ your database.


What Next?

Modified: incubator/spamassassin/trunk/MANIFEST
==============================================================================
--- incubator/spamassassin/trunk/MANIFEST (original)
+++ incubator/spamassassin/trunk/MANIFEST Thu Jan 29 18:54:33 2004
@@ -27,6 +27,8 @@
lib/Mail/SpamAssassin/AutoWhitelist.pm
lib/Mail/SpamAssassin/Bayes.pm
lib/Mail/SpamAssassin/BayesStore.pm
+lib/Mail/SpamAssassin/BayesStoreDBM.pm
+lib/Mail/SpamAssassin/BayesStoreSQL.pm
lib/Mail/SpamAssassin/CmdLearn.pm
lib/Mail/SpamAssassin/Conf.pm
lib/Mail/SpamAssassin/ConfSourceSQL.pm
@@ -46,6 +48,7 @@
lib/Mail/SpamAssassin/Received.pm
lib/Mail/SpamAssassin/Reporter.pm
lib/Mail/SpamAssassin/SHA1.pm
+lib/Mail/SpamAssassin/SQLBasedAddrList.pm
lib/Mail/SpamAssassin/TextCat.pm
lib/Mail/SpamAssassin/UnixLocker.pm
lib/Mail/SpamAssassin/Util.pm
@@ -226,6 +229,7 @@
t/spamd_report.t
t/spamd_report_ifspam.t
t/spamd_stop.t
+t/sql_based_whitelist.t
t/spamd_symbols.t
t/spamd_unix.t
t/spamd_utf8.t
@@ -238,6 +242,8 @@
t/zz_cleanup.t
tools/README.speedtest
tools/check_whitelist
+tools/convert_awl_dbm_to_sql
+tools/convert_bayes_dbm_to_sql
tools/mboxsplit
tools/sa-stats.pl
tools/speedtest
@@ -246,3 +252,38 @@
tools/triplets.pl
lib/Mail/SpamAssassin/Plugin.pm
lib/Mail/SpamAssassin/PluginHandler.pm
+lib/Mail/SpamAssassin/BayesStoreDBM.pm
+lib/Mail/SpamAssassin/BayesStoreSQL.pm
+lib/Mail/SpamAssassin/SQLBasedAddrList.pm
+sql/README.awl
+sql/README.bayes
+sql/awl_mysql.sql
+sql/bayes_mysql.sql
+sql/bayes_pg.sql
+sql/bayes_sqlite.sql
+t/bayesdbm.t
+t/bayessql.t
+t/sql_based_whitelist.t
+tools/convert_awl_dbm_to_sql
+tools/convert_bayes_dbm_to_sql
+t/data/whitelists/action.eff.org
+t/data/whitelists/mlist_mailman_message
+t/data/whitelists/amazon_co_uk_ship
+t/data/whitelists/amazon_com_ship
+t/data/whitelists/cert.org
+t/data/whitelists/debian_bts_reassign
+t/data/whitelists/linuxplanet
+t/data/whitelists/lp.org
+t/data/whitelists/media_unspun
+t/data/whitelists/mlist_yahoo_groups_message
+t/data/whitelists/mypoints
+t/data/whitelists/neat_net_tricks
+t/data/whitelists/netcenter-direct_de
+t/data/whitelists/oracle_net_techblast
+t/data/whitelists/orbitz.com
+t/data/whitelists/paypal.com
+t/data/whitelists/register.com_password
+t/data/whitelists/ryanairmail.com
+t/data/whitelists/sf.net
+t/data/whitelists/winxpnews.com
+t/data/whitelists/yahoo-inc.com

Modified: incubator/spamassassin/trunk/MANIFEST.SKIP
==============================================================================
--- incubator/spamassassin/trunk/MANIFEST.SKIP (original)
+++ incubator/spamassassin/trunk/MANIFEST.SKIP Thu Jan 29 18:54:33 2004
@@ -112,3 +112,5 @@
build/2.60_change_summary
build/replace_license_blocks
sa-learn
+t/bayessql.cf
+t/sql_based_whitelist.cf

Modified: incubator/spamassassin/trunk/Makefile.PL
==============================================================================
--- incubator/spamassassin/trunk/Makefile.PL (original)
+++ incubator/spamassassin/trunk/Makefile.PL Thu Jan 29 18:54:33 2004
@@ -214,8 +214,9 @@

'doc', 'pod2htm*',

- 't/do_net', 't/log',
+ 't/bayessql.cf', 't/do_net', 't/log', 't/sql_based_whitelist.cf',
)
+
},

'AUTHOR' => 'Justin Mason <jm@jmason.org>',
@@ -339,6 +340,54 @@
}
$makefile{'macro'}{'RUN_NET_TESTS'} = yesno($opt{'run_net_tests'});

+$opt{'run_awl_sql_tests'} = prompt('Run SQL Based AutoWhitelist Tests (additional information required) (y/n)', "n");
+print "\n";
+
+$opt{'run_awl_sql_tests'} = bool($opt{'run_awl_sql_tests'});
+if ($opt{'run_awl_sql_tests'}) {
+ my $user_awl_dsn = prompt("SQL AWL DSN (user_awl_dsn): ", "dbi:mysql:spamassassin:localhost");
+ my $user_awl_sql_username = prompt("SQL AWL DB username (user_awl_sql_username): ", "");
+ my $user_awl_sql_password = prompt("SQL AWL DB password (user_awl_sql_password): ", "");
+ my $user_awl_sql_table = prompt("SQL AWL tablename (user_awl_sql_table): ", "awl") || 'awl';
+ print "\n";
+
+ open(FILE, ">t/sql_based_whitelist.cf");
+ print FILE "user_awl_dsn $user_awl_dsn\n";
+ # These two can be blank and the conf parser doesn't really like
+ # blank variables, so do not print them if blank
+ print FILE "user_awl_sql_username $user_awl_sql_username\n" if ($user_awl_sql_username);
+ print FILE "user_awl_sql_password $user_awl_sql_password\n" if ($user_awl_sql_password);
+ print FILE "user_awl_sql_table $user_awl_sql_table\n";
+
+ close(FILE);
+}
+else {
+ unlink("t/sql_based_whitelist.cf");
+}
+
+$opt{'run_bayes_sql_tests'} = prompt("Run Bayes SQL storage tests (additional information required)? (y/n)", 'n');
+print "\n";
+
+$opt{'run_bayes_sql_tests'} = bool($opt{'run_bayes_sql_tests'});
+if ($opt{'run_bayes_sql_tests'}) {
+ my $bayes_sql_dsn = prompt("Bayes SQL DSN (bayes_sql_dsn): ", "dbi:mysql:spamassassin:localhost");
+ my $bayes_sql_username = prompt("Bayes SQL DB username (bayes_sql_username): ", "");
+ my $bayes_sql_password = prompt("Bayes SQL DB password (bayes_sql_password): ", "");
+ print "\n";
+
+ open(FILE, ">t/bayessql.cf");
+ print FILE "bayes_sql_dsn $bayes_sql_dsn\n";
+ # These two can be blank and the conf parser doesn't really like
+ # blank variables, so do not print them if blank
+ print FILE "bayes_sql_username $bayes_sql_username\n" if ($bayes_sql_username);
+ print FILE "bayes_sql_password $bayes_sql_password\n" if ($bayes_sql_password);
+ close(FILE);
+}
+else {
+ unlink("t/bayessql.cf");
+}
+
+#######################################################################

# Now dump the Makefile
WriteMakefile(%makefile);

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm Thu Jan 29 18:54:33 2004
@@ -108,8 +108,11 @@
if (defined $noipent->{count} && $noipent->{count} > 0) {
dbg ("AWL: found entry w/o IP address for $addr: replacing with $origip");
$self->{checker}->remove_entry($noipent);
- $self->{entry} = $noipent;
- $self->{entry}->{addr} = $fulladdr;
+ # Now assign proper entry the count and totscore values of the no ip entry
+ # instead of assigning the whole value to avoid wiping out any information added
+ # to the previous entry.
+ $self->{entry}->{count} = $noipent->{count};
+ $self->{entry}->{totscore} = $noipent->{totscore};
}
}
}

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm Thu Jan 29 18:54:33 2004
@@ -47,7 +47,6 @@
use bytes;

use Mail::SpamAssassin;
-use Mail::SpamAssassin::BayesStore;
use Mail::SpamAssassin::PerMsgStatus;
use Mail::SpamAssassin::SHA1 qw(sha1);

@@ -220,6 +219,7 @@
sub new {
my $class = shift;
$class = ref($class) || $class;
+
my ($main) = @_;
my $self = {
'main' => $main,
@@ -235,7 +235,21 @@
};
bless ($self, $class);

- $self->{store} = new Mail::SpamAssassin::BayesStore ($self);
+ if ($self->{conf}->{bayes_store_module}) {
+ my $module = $self->{conf}->{bayes_store_module};
+ my $store;
+
+ eval '
+ require '.$module.';
+ $store = '.$module.'->new($self);
+ ';
+ if ($@) { die $@; }
+ $self->{store} = $store;
+ }
+ else {
+ require Mail::SpamAssassin::BayesStoreDBM;
+ $self->{store} = Mail::SpamAssassin::BayesStoreDBM->new($self);
+ }

$self;
}
@@ -326,9 +340,6 @@

my $in_headers = ($tokprefix ne '');

- my($bv) = ($self->{store}->get_magic_tokens())[6];
- my $magic_re = $self->{store}->get_magic_re($bv);
-
# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
@@ -354,7 +365,7 @@
$token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end
$token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens

- next if ( $token =~ /$magic_re/ ); # skip false magic tokens
+ next if ( $self->{store}->is_magic_token($token) ); # skip false magic tokens

# *do* keep 3-byte tokens; there's some solid signs in there
my $len = length($token);
@@ -714,9 +725,9 @@
}

$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
- $self->{store}->add_touches_to_journal();
-
+ $self->{store}->cleanup();
dbg("bayes: Learned '$msgid'");
+
1;
}

@@ -800,7 +811,7 @@
}

$self->{store}->seen_delete ($msgid);
- $self->{store}->add_touches_to_journal();
+ $self->{store}->cleanup();
1;
}

@@ -872,8 +883,8 @@
my ($self, $sync, $expire, $opts) = @_;
if (!$self->{conf}->{use_bayes}) { return 0; }

- dbg("Syncing Bayes journal and expiring old tokens...");
- $self->{store}->sync_journal($opts) if ( $sync );
+ dbg("Syncing Bayes and expiring old tokens...");
+ $self->{store}->sync($opts) if ( $sync );
$self->{store}->expire_old_tokens($opts) if ( $expire );
dbg("Syncing complete.");

@@ -884,9 +895,13 @@

# compute the probability that that token is spammish
sub compute_prob_for_token {
- my ($self, $token, $ns, $nn) = @_;
+ my ($self, $token, $ns, $nn, $s, $n, $atime) = @_;

- my ($s, $n, $atime) = $self->{store}->tok_get ($token);
+ # we allow the caller to give us the token information, just
+ # to save a potentially expensive lookup
+ if (!defined($s) || !defined($n) || !defined($atime)) {
+ ($s, $n, $atime) = $self->{store}->tok_get ($token);
+ }
return if ($s == 0 && $n == 0);

if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
@@ -1094,7 +1109,7 @@
print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
}

- $self->{store}->add_touches_to_journal();
+ $self->{store}->cleanup();

$self->opportunistic_calls();
$self->{store}->untie_db();
@@ -1109,17 +1124,17 @@
sub opportunistic_calls {
my($self) = @_;

- # Is an expire or journal sync running?
+ # Is an expire or sync running?
my $running_expire = $self->{store}->get_running_expire_tok();
if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) { return; }

- # handle expiry and journal syncing
+ # handle expiry and syncing
if ($self->{store}->expiry_due()) {
$self->{store}->set_running_expire_tok();
$self->sync(1,1);
# don't need to unlock since the expire will have done that. ;)
}
- elsif ( $self->{store}->journal_sync_due() ) {
+ elsif ( $self->{store}->sync_due() ) {
$self->{store}->set_running_expire_tok();
$self->sync(1,0);
$self->{store}->remove_running_expire_tok();
@@ -1219,8 +1234,11 @@

return 0 unless $self->{conf}->{use_bayes};
return 0 unless $self->{store}->tie_db_readonly();
+
+ my @vars = $self->{store}->get_storage_variables();
+
+ my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;

- my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = $self->{store}->get_magic_tokens();
$sb = $self->{store}->scan_count_get() if ( $bv < 1 ); # we want current scan count, not scan base count

my $template = '%3.3f %10d %10d %10d %s'."\n";
@@ -1242,18 +1260,8 @@
}

if ( $toks ) {
- my $magic_re = $self->{store}->get_magic_re($bv);
-
- foreach my $tok (keys %{$self->{store}->{db_toks}}) {
- next if ($tok =~ /$magic_re/); # skip magic tokens
- next if (defined $regex && ($tok !~ /$regex/o));
-
- my $prob = $self->compute_prob_for_token($tok, $ns, $nh);
- $prob ||= 0.5;
-
- my ($ts, $th, $atime) = $self->{store}->tok_get ($tok);
- printf $template,$prob,$ts,$th,$atime,$tok;
- }
+ # let the store sort out the db_toks
+ $self->{store}->dump_db_toks($template, $regex, @vars);
}

if (!$self->{main}->{learn_caller_will_untie}) {

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm Thu Jan 29 18:54:33 2004
@@ -14,103 +14,77 @@
# limitations under the License.
# </@LICENSE>

+=head1 NAME
+
+Mail::SpamAssassin::BayesStore - Bayesian Storage Module
+
+=head1 SYNOPSIS
+
+=head1 DESCRIPTION
+
+This is the public API for the Bayesian store methods. Any implementation of
+the storage module must implement these methods.
+
+=cut
+
package Mail::SpamAssassin::BayesStore;

use strict;
use bytes;
-use Fcntl;

-use Mail::SpamAssassin;
-use Mail::SpamAssassin::Util;
-use File::Basename;
-use File::Spec;
-use File::Path;
-
-use constant HAS_DB_FILE => eval { require DB_File; };
-
-use vars qw{
- @ISA
- @DBNAMES @DB_EXTENSIONS
- $NSPAM_MAGIC_TOKEN $NHAM_MAGIC_TOKEN $LAST_EXPIRE_MAGIC_TOKEN $LAST_JOURNAL_SYNC_MAGIC_TOKEN
- $NTOKENS_MAGIC_TOKEN $OLDEST_TOKEN_AGE_MAGIC_TOKEN $LAST_EXPIRE_REDUCE_MAGIC_TOKEN
- $RUNNING_EXPIRE_MAGIC_TOKEN $DB_VERSION_MAGIC_TOKEN $LAST_ATIME_DELTA_MAGIC_TOKEN
- $NEWEST_TOKEN_AGE_MAGIC_TOKEN
-};
-
-@ISA = qw();
-
-# db layout (quoting Matt):
-#
-# > need five db files though to make it real fast:
-# [probs] 1. ngood and nbad (two entries, so could be a flat file rather
-# than a db file). (now 2 entries in db_toks)
-# [toks] 2. good token -> number seen
-# [toks] 3. bad token -> number seen (both are packed into 1 entry in 1 db)
-# [probs] 4. Consolidated good token -> probability
-# [probs] 5. Consolidated bad token -> probability
-# > As you add new mails, you update the entry in 2 or 3, then regenerate
-# > the entry for that token in 4 or 5.
-# > Then as you test a new mail, you just need to pull the probability
-# > direct from 4 and 5, and generate the overall probability. A simple and
-# > very fast operation.
-#
-# jm: we use probs as overall probability. <0.5 = ham, >0.5 = spam
-#
-# update: probs is no longer maintained as a db, to keep on-disk and in-core
-# usage down.
-#
-# also, added a new one to support forgetting, auto-learning, and
-# auto-forgetting for refiled mails:
-# [seen] 6. a list of Message-IDs of messages already learnt from. values
-# are 's' for learnt-as-spam, 'h' for learnt-as-ham.
-#
-# and another, called [scancount] to model the scan-count for expiry.
-# This is not a database. Instead it increases by one byte for each
-# message scanned (note: scanned, not learned).
-
-@DBNAMES = qw(toks seen);
-
-# Possible file extensions used by the kinds of database files DB_File
-# might create. We need these so we can create a new file and rename
-# it into place.
-@DB_EXTENSIONS = ('', '.db');
-
-# These are the magic tokens we use to track stuff in the DB.
-# The format is '^M^A^G^I^C' followed by any string you want.
-# None of the control chars will be in a real token.
-$DB_VERSION_MAGIC_TOKEN = "\015\001\007\011\003DBVERSION";
-$LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
-$LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
-$LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
-$LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
-$NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
-$NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
-$NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
-$NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
-$OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
-$RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
+=head1 METHODS
+
+=head2 new
+
+public class (Mail::SpamAssassin::BayesStore) new (Mail::SpamAssassin::Bayes $bayes)

-use constant DB_VERSION => 2; # what version of DB do we use?
+Description:
+This method creates a new instance of the Mail::SpamAssassin::BayesStore
+object. You must pass in an instance of the Mail::SpamAssassin:Bayes object,
+which is stashed for use throughout the module.

-###########################################################################
+=cut

sub new {
- my $class = shift;
+ my ($class, $bayes) = @_;
+
$class = ref($class) || $class;
- my ($bayes) = @_;
+
my $self = {
- 'bayes' => $bayes,
- 'already_tied' => 0,
- 'is_locked' => 0,
- 'string_to_journal' => '',
- 'db_version' => undef,
- };
+ 'bayes' => $bayes,
+ 'supported_db_version' => 0,
+ 'db_version' => undef,
+ };
+
bless ($self, $class);

$self;
}

-###########################################################################
+=head2 DB_VERSION
+
+public instance (Integer) DB_VERSION ()
+
+Description:
+This method returns the currently supported database version for the
+implementation.
+
+=cut
+
+sub DB_VERSION {
+ my ($self) = @_;
+ return $self->{supported_db_version};
+}
+
+=head2 read_db_configs
+
+public instance () read_db_configs ()
+
+Description:
+This method reads any needed config variables from the configuration
+object and then calls the Mail::SpamAssassin::Bayes read_db_configs method.
+
+=cut

sub read_db_configs {
my ($self) = @_;
@@ -133,342 +107,95 @@
$self->{bayes}->read_db_configs();
}

-###########################################################################
-
-sub tie_db_readonly {
- my ($self) = @_;
+=head2 tie_db_readonly

- if (!HAS_DB_FILE) {
- dbg ("bayes: DB_File module not installed, cannot use Bayes");
- return 0;
- }
+public instance (Boolean) tie_db_readonly ()

- # return if we've already tied to the db's, using the same mode
- # (locked/unlocked) as before.
- return 1 if ($self->{already_tied} && $self->{is_locked} == 0);
-
- my $main = $self->{bayes}->{main};
- if (!defined($main->{conf}->{bayes_path})) {
- dbg ("bayes_path not defined");
- return 0;
- }
+Description:
+This method opens up the database in readonly mode.

- $self->read_db_configs();
+=cut

- my $path = $main->sed_path ($main->{conf}->{bayes_path});
-
- my $found=0;
- for my $ext (@DB_EXTENSIONS) { if (-f $path.'_toks'.$ext) { $found=1; last; } }
-
- if (!$found) {
- dbg ("bayes: no dbs present, cannot scan: ${path}_toks");
- return 0;
- }
-
- foreach my $dbname (@DBNAMES) {
- my $name = $path.'_'.$dbname;
- my $db_var = 'db_'.$dbname;
- dbg("bayes: $$ tie-ing to DB file R/O $name");
- # untie %{$self->{$db_var}} if (tied %{$self->{$db_var}});
- tie %{$self->{$db_var}},"DB_File",$name, O_RDONLY,
- (oct ($main->{conf}->{bayes_file_mode}) & 0666)
- or goto failed_to_tie;
- }
+sub tie_db_readonly {
+ my ($self) = @_;
+ die "tie_db_readonly: not implemented\n";
+}

- $self->{db_version} = ($self->get_magic_tokens())[6];
- dbg("bayes: found bayes db version ".$self->{db_version});
+=head2 tie_db_writable

- # If the DB version is one we don't understand, abort!
- if ( $self->check_db_version() ) {
- dbg("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
- $self->untie_db();
- return 0;
- }
+public instance (Boolean) tie_db_writable ()

- if ( $self->{db_version} < 2 ) { # older versions use scancount
- $self->{scan_count_little_file} = $path.'_msgcount';
- }
+Description:
+This method opens up the database in writable mode.

- $self->{already_tied} = 1;
- return 1;
+Any callers of this methods should ensure that they call untie_db()
+afterwards.

-failed_to_tie:
- warn "Cannot open bayes databases ${path}_* R/O: tie failed: $!\n";
- return 0;
-}
+=cut

-# tie() to the databases, read-write and locked. Any callers of
-# this should ensure they call untie_db() afterwards!
-#
sub tie_db_writable {
my ($self) = @_;
-
- if (!HAS_DB_FILE) {
- dbg ("bayes: DB_File module not installed, cannot use Bayes");
- return 0;
- }
-
- # return if we've already tied to the db's, using the same mode
- # (locked/unlocked) as before.
- return 1 if ($self->{already_tied} && $self->{is_locked} == 1);
-
- my $main = $self->{bayes}->{main};
- if (!defined($main->{conf}->{bayes_path})) {
- dbg ("bayes_path not defined");
- return 0;
- }
-
- $self->read_db_configs();
-
- my $path = $main->sed_path ($main->{conf}->{bayes_path});
-
- my $found=0;
- for my $ext (@DB_EXTENSIONS) { if (-f $path.'_toks'.$ext) { $found=1; last; } }
-
- my $parentdir = dirname ($path);
- if (!-d $parentdir) {
- # run in an eval(); if mkpath has no perms, it calls die()
- eval {
- mkpath ($parentdir, 0, (oct ($main->{conf}->{bayes_file_mode}) & 0777));
- };
- }
-
- my $tout;
- if ($main->{learn_wait_for_lock}) {
- $tout = 300; # TODO: Dan to write better lock code
- } else {
- $tout = 10;
- }
- if ($main->{locker}->safe_lock ($path, $tout)) {
- $self->{locked_file} = $path;
- $self->{is_locked} = 1;
- } else {
- warn "Cannot open bayes databases ${path}_* R/W: lock failed: $!\n";
- return 0;
- }
-
- my $umask = umask 0;
- foreach my $dbname (@DBNAMES) {
- my $name = $path.'_'.$dbname;
- my $db_var = 'db_'.$dbname;
- dbg("bayes: $$ tie-ing to DB file R/W $name");
- tie %{$self->{$db_var}},"DB_File",$name, O_RDWR|O_CREAT,
- (oct ($main->{conf}->{bayes_file_mode}) & 0666)
- or goto failed_to_tie;
- }
- umask $umask;
-
- # set our cache to what version DB we're using
- $self->{db_version} = ($self->get_magic_tokens())[6];
- dbg("bayes: found bayes db version ".$self->{db_version});
-
- # figure out if we can read the current DB and if we need to do a
- # DB version update and do it if necessary if either has a problem,
- # fail immediately
- #
- if ( $found && $self->upgrade_db() ) {
- $self->untie_db();
- return 0;
- }
- elsif ( !$found ) { # new DB, make sure we know that ...
- $self->{db_version} = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN} = DB_VERSION;
- $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN} = 0; # no tokens in the db ...
- dbg("bayes: new db, set db version ".$self->{db_version}." and 0 tokens");
- }
-
- $self->{already_tied} = 1;
- return 1;
-
-failed_to_tie:
- my $err = $!;
- umask $umask;
- if ($self->{is_locked}) {
- $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file});
- $self->{is_locked} = 0;
- }
- warn "Cannot open bayes databases ${path}_* R/W: tie failed: $err\n";
- return 0;
+ die "tie_db_writable: not implemented\n";
}

-# Do we understand how to deal with this DB version?
-sub check_db_version {
- my ($self) = @_;
- my $db_ver = ($self->get_magic_tokens())[6];
+=head2 untie_db

- if ( $db_ver > DB_VERSION ) { # current DB is newer, ignore the DB!
- warn "bayes: Found DB Version $db_ver, but can only handle up to version ".DB_VERSION."\n";
- return 1;
- }
+public instance () untie_db ()

- return 0;
-}
+Description:
+This method unties the database.

-# Check to see if we need to upgrade the DB, and do so if necessary
-sub upgrade_db {
- my ($self) = @_;
-
- return 0 if ( $self->{db_version} == DB_VERSION );
- if ( $self->check_db_version() ) {
- dbg("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
- return 1;
- }
-
- # If the current DB version is lower than the new version, upgrade!
- # Do conversions in order so we can go 1 -> 3, make sure to update $self->{db_version}
-
- dbg("bayes: detected bayes db format ".$self->{db_version}.", upgrading");
+=cut

- # since DB_File will not shrink a database (!!), we need to *create*
- # a new one instead.
- my $main = $self->{bayes}->{main};
- my $path = $main->sed_path ($main->{conf}->{bayes_path});
- my $name = $path.'_toks';
-
- # older version's journal files are likely not in the same format as the new ones, so remove it.
- my $jpath = $self->get_journal_filename();
- if ( -f $jpath ) {
- dbg("bayes: old journal file found, removing.");
- warn "Couldn't remove $jpath: $!" if ( !unlink $jpath );
- }
-
- if ( $self->{db_version} < 2 ) {
- dbg ("bayes: upgrading database format from v".$self->{db_version}." to v2");
-
- my($DB_NSPAM_MAGIC_TOKEN, $DB_NHAM_MAGIC_TOKEN, $DB_NTOKENS_MAGIC_TOKEN);
- my($DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN, $DB_LAST_EXPIRE_MAGIC_TOKEN);
+sub untie_db {
+ my $self = shift;
+ die "untie_db: not implemented\n";
+}

- # Magic tokens for version 0, defined as '**[A-Z]+'
- if ( $self->{db_version} == 0 ) {
- $DB_NSPAM_MAGIC_TOKEN = '**NSPAM';
- $DB_NHAM_MAGIC_TOKEN = '**NHAM';
- $DB_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
- #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
- #$DB_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
- #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
- #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = '**RUNNINGEXPIRE';
- }
- else {
- $DB_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
- $DB_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
- $DB_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
- #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
- #$DB_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
- #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE";
- #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
- }
+=head2 calculate_expire_delta

- # remember when we started ...
- my $started = time;
- my $newatime = $started;
-
- # use O_EXCL to avoid races (bonus paranoia, since we should be locked
- # anyway)
- my %new_toks;
- my $umask = umask 0;
- tie %new_toks, "DB_File", "${name}.new", O_RDWR|O_CREAT|O_EXCL,
- (oct ($main->{conf}->{bayes_file_mode}) & 0666) or return 1;
- umask $umask;
-
- # add the magic tokens to the new db.
- $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
- $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
- $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
- $new_toks{$DB_VERSION_MAGIC_TOKEN} = 2; # we're now a DB version 2 file
- $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
- $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $newatime;
- $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
- $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $newatime;
- $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
- $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
-
- my $magic_re = $self->get_magic_re($self->{db_version});
-
- # deal with the data tokens
- my ($tok, $packed);
- while (($tok, $packed) = each %{$self->{db_toks}}) {
- next if ($tok =~ /$magic_re/); # skip magic tokens
+public instance (\%) calculate_expire_delta (Integer $newest_atime,
+ Integer $start,
+ Integer $max_expire_mult)

- my ($ts, $th, $atime) = $self->tok_unpack ($packed);
- $new_toks{$tok} = $self->tok_pack ($ts, $th, $newatime);
- }
+Description:
+This method performs a calculation on the data to determine the optimum
+atime for token expiration.

+=cut

- # now untie so we can do renames
- untie %{$self->{db_toks}};
- untie %new_toks;
-
- # This is the critical phase (moving files around), so don't allow
- # it to be interrupted.
- local $SIG{'INT'} = 'IGNORE';
- local $SIG{'HUP'} = 'IGNORE';
- local $SIG{'TERM'} = 'IGNORE';
-
- # older versions used scancount, so kill the stupid little file ...
- my $msgc = $path.'_msgcount';
- if ( -f $msgc ) {
- dbg("bayes: old msgcount file found, removing.");
- if ( !unlink $msgc ) {
- warn "Couldn't remove $msgc: $!";
- }
- }
+sub calculate_expire_delta {
+ my ($self, $newest_atime, $start, $max_expire_mult) = @_;
+ die "calculate_expire_delta: not implemented\n";
+}

- # now rename in the new one. Try several extensions
- for my $ext (@DB_EXTENSIONS) {
- my $newf = $name.'.new'.$ext;
- my $oldf = $name.$ext;
- next unless (-f $newf);
- if (!rename ($newf, $oldf)) {
- warn "rename $newf to $oldf failed: $!\n";
- return 1;
- }
- }
+=head2 token_expiration

- # re-tie to the new db in read-write mode ...
- tie %{$self->{db_toks}},"DB_File", $name, O_RDWR|O_CREAT,
- (oct ($main->{conf}->{bayes_file_mode}) & 0666) or return 1;
+public instance (Integer, Integer,
+ Integer, Integer) token_expiration(\% $opts,
+ Integer $newest_atime,
+ Integer $newdelta)

- dbg ("bayes: upgraded database format from v".$self->{db_version}." to v2 in ".(time - $started)." seconds");
- $self->{db_version} = 2; # need this for other functions which check
- }
+Description:
+This method performs the database specific expiration of tokens based on
+the passed in C<$newest_atime> and C<$newdelta>.

- # if ( $self->{db_version} == 2 ) {
- # ...
- # $self->{db_version} = 3; # need this for other functions which check
- # }
- # ... and so on.
+=cut

- return 0;
+sub token_expiration {
+ my ($self, $opts, $newest_atime, $newdelta) = @_;
+ die "token_expiration: not implemented\n";
}

-###########################################################################
-
-sub untie_db {
- my $self = shift;
- dbg("bayes: $$ untie-ing");
-
- foreach my $dbname (@DBNAMES) {
- my $db_var = 'db_'.$dbname;
+=head2 expire_old_tokens

- if (exists $self->{$db_var}) {
- dbg ("bayes: $$ untie-ing $db_var");
- untie %{$self->{$db_var}};
- delete $self->{$db_var};
- }
- }
+public instance (Boolean) expire_old_tokens (\% hashref)

- if ($self->{is_locked}) {
- dbg ("bayes: files locked, now unlocking lock");
- $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file});
- $self->{is_locked} = 0;
- }
-
- $self->{already_tied} = 0;
- $self->{db_version} = undef;
-}
+Description:
+This method expires old tokens from the database.

-###########################################################################
+=cut

-# Do an expiry run.
sub expire_old_tokens {
my ($self, $opts) = @_;
my $ret;
@@ -492,6 +219,16 @@
$ret;
}

+=head2 expire_old_tokens_trapped
+
+public instance (Boolean) expire_old_tokens_trapped (\% $opts)
+
+Description:
+This methods does the actual token expiration.
+
+XXX More docs here about the methodology and what not
+=cut
+
sub expire_old_tokens_trapped {
my ($self, $opts) = @_;

@@ -504,27 +241,8 @@
return 0;
}

- my $deleted = 0;
- my $kept = 0;
- my $num_lowfreq = 0;
- my $num_hapaxes = 0;
my $started = time();
- my @magic = $self->get_magic_tokens();
-
- # since DB_File will not shrink a database (!!), we need to *create*
- # a new one instead.
- my $main = $self->{bayes}->{main};
- my $path = $main->sed_path ($main->{conf}->{bayes_path});
-
- # use a temporary PID-based suffix just in case another one was
- # created previously by an interrupted expire
- my $tmpsuffix = "expire$$";
- my $tmpdbname = $path.'_toks.'.$tmpsuffix;
-
- my $magic_re = $self->get_magic_re(DB_VERSION);
-
- # Figure out atime delta as necessary
- my $too_old = 0;
+ my @vars = $self->get_storage_variables();

# How many tokens do we want to keep?
my $goal_reduction = int($self->{expiry_max_db_size} * 0.75); # expire to 75% of max_db
@@ -535,32 +253,32 @@
dbg("bayes: expiry keep size too small, resetting to 100,000 tokens");
}
# Now turn goal_reduction into how many to expire.
- $goal_reduction = $magic[3] - $goal_reduction;
- dbg("bayes: token count: ".$magic[3].", final goal reduction size: $goal_reduction");
+ $goal_reduction = $vars[3] - $goal_reduction;
+ dbg("bayes: token count: ".$vars[3].", final goal reduction size: $goal_reduction");

if ( $goal_reduction < 1000 ) { # too few tokens to expire, abort.
dbg("bayes: reduction goal of $goal_reduction is under 1,000 tokens. skipping expire.");
- $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+ $self->set_last_expire(time());
$self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
return 1; # we want to indicate things ran as expected
}

# Estimate new atime delta based on the last atime delta
my $newdelta = 0;
- if ( $magic[9] > 0 ) {
+ if ( $vars[9] > 0 ) {
# newdelta = olddelta * old / goal;
# this may seem backwards, but since we're talking delta here,
# not actual atime, we want smaller atimes to expire more tokens,
# and visa versa.
#
- $newdelta = int($magic[8] * $magic[9] / $goal_reduction);
+ $newdelta = int($vars[8] * $vars[9] / $goal_reduction);
}

# Calculate size difference between last expiration token removal
# count and the current goal removal count.
- my $ratio = ($magic[9] == 0 || $magic[9] > $goal_reduction) ? $magic[9]/$goal_reduction : $goal_reduction/$magic[9];
+ my $ratio = ($vars[9] == 0 || $vars[9] > $goal_reduction) ? $vars[9]/$goal_reduction : $goal_reduction/$vars[9];

- dbg("bayes: First pass? Current: ".time().", Last: ".$magic[4].", atime: ".$magic[8].", count: ".$magic[9].", newdelta: $newdelta, ratio: $ratio");
+ dbg("bayes: First pass? Current: ".time().", Last: ".$vars[4].", atime: ".$vars[8].", count: ".$vars[9].", newdelta: $newdelta, ratio: $ratio");

## ESTIMATION PHASE
#
@@ -577,32 +295,16 @@
# - difference of last reduction to current goal reduction is > 50%
# if the two values are out of balance, estimating atime is going to be funky, recompute
#
- if ( (time() - $magic[4] > 86400*30) || ($magic[8] < 43200) || ($magic[9] < 1000) || ($newdelta < 43200) || ($ratio > 1.5) ) {
+ if ( (time() - $vars[4] > 86400*30) || ($vars[8] < 43200) || ($vars[9] < 1000)
+ || ($newdelta < 43200) || ($ratio > 1.5) ) {
dbg("bayes: Can't use estimation method for expiry, something fishy, calculating optimal atime delta (first pass)");
+
my $start = 43200; # exponential search starting at ...? 1/2 day, 1, 2, 4, 8, 16, ...
- my %delta = (); # use a hash since an array is going to be very sparse
my $max_expire_mult = 512; # $max_expire_mult * $start = max expire time (256 days), power of 2.

- # do the first pass, figure out atime delta
- my ($tok, $packed);
- while (($tok, $packed) = each %{$self->{db_toks}}) {
- next if ($tok =~ /$magic_re/); # skip magic tokens
-
- my ($ts, $th, $atime) = $self->tok_unpack ($packed);
-
- # Go through from $start * 1 to $start * 512, mark how many tokens we would expire
- my $token_age = $magic[10] - $atime;
- for( my $i = 1; $i <= $max_expire_mult; $i<<=1 ) {
- if ( $token_age >= $start * $i ) {
- $delta{$i}++;
- }
- else {
- # If the token age is less than the expire delta, it'll be
- # less for all upcoming checks too, so abort early.
- last;
- }
- }
- }
+ my %delta = $self->calculate_expire_delta($vars[10], $start, $max_expire_mult);
+
+ return 0 unless (%delta);

# This will skip the for loop if debugging isn't enabled ...
if ( $Mail::SpamAssassin::DEBUG->{'enabled'} ) {
@@ -612,7 +314,7 @@
dbg("bayes: ".$start*$i."\t".(exists $delta{$i} ? $delta{$i} : 0));
}
}
-
+
# Now figure out which max_expire_mult value gives the closest results to goal_reduction, without
# going over ... Go from the largest delta backwards so the reduction size increases
# (tokens that expire at 4 also expire at 3, 2, and 1, so 1 will always be the largest expiry...)
@@ -621,7 +323,7 @@
next unless exists $delta{$max_expire_mult};
if ($delta{$max_expire_mult} > $goal_reduction) {
$max_expire_mult<<=1; # the max expire is actually the next power of 2 out
- last;
+ last;
}
}

@@ -637,7 +339,7 @@
#
if ( !exists $delta{$max_expire_mult} || $delta{$max_expire_mult} < 1000 ) {
dbg("bayes: couldn't find a good delta atime, need more token difference, skipping expire.");
- $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+ $self->set_last_expire(time());
$self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
return 1; # we want to indicate things ran as expected
}
@@ -649,92 +351,7 @@
dbg("bayes: Can do estimation method for expiry, skipping first pass.");
}

- # clean out any leftover db copies from previous runs
- for my $ext (@DB_EXTENSIONS) { unlink ($tmpdbname.$ext); }
-
- # use O_EXCL to avoid races (bonus paranoia, since we should be locked
- # anyway)
- my %new_toks;
- my $umask = umask 0;
- tie %new_toks, "DB_File", $tmpdbname, O_RDWR|O_CREAT|O_EXCL,
- (oct ($main->{conf}->{bayes_file_mode}) & 0666);
- umask $umask;
- my $oldest;
-
- my $showdots = $opts->{showdots};
- if ($showdots) { print STDERR "\n"; }
-
- # We've chosen a new atime delta if we've gotten here, so record it for posterity.
- $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $newdelta;
-
- # Figure out how old is too old...
- $too_old = $magic[10] - $newdelta; # tooold = newest - delta
-
- # Go ahead and do the move to new db/expire run now ...
- my ($tok, $packed);
- while (($tok, $packed) = each %{$self->{db_toks}}) {
- next if ($tok =~ /$magic_re/); # skip magic tokens
-
- my ($ts, $th, $atime) = $self->tok_unpack ($packed);
-
- if ($atime < $too_old) {
- $deleted++;
- } else {
- $new_toks{$tok} = $self->tok_pack ($ts, $th, $atime); $kept++;
- if (!defined($oldest) || $atime < $oldest) { $oldest = $atime; }
- if ($ts + $th == 1) {
- $num_hapaxes++;
- } elsif ($ts < 8 && $th < 8) {
- $num_lowfreq++;
- }
- }
-
- if ((($kept + $deleted) % 1000) == 0) {
- if ($showdots) { print STDERR "."; }
- $self->set_running_expire_tok();
- }
- }
-
- # and add the magic tokens. don't add the expire_running token.
- $new_toks{$DB_VERSION_MAGIC_TOKEN} = DB_VERSION;
-
- # We haven't changed messages of each type seen, so just copy over.
- $new_toks{$NSPAM_MAGIC_TOKEN} = $magic[1];
- $new_toks{$NHAM_MAGIC_TOKEN} = $magic[2];
-
- # We magically haven't removed the newest token, so just copy that value over.
- $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $magic[10];
-
- # The rest of these have been modified, so replace as necessary.
- $new_toks{$NTOKENS_MAGIC_TOKEN} = $kept;
- $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = time();
- $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest;
- $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = $deleted;
-
- # now untie so we can do renames
- untie %{$self->{db_toks}};
- untie %new_toks;
-
- # This is the critical phase (moving files around), so don't allow
- # it to be interrupted. Scope the signal changes.
- {
- local $SIG{'INT'} = 'IGNORE';
- local $SIG{'HUP'} = 'IGNORE';
- local $SIG{'TERM'} = 'IGNORE';
-
- # now rename in the new one. Try several extensions
- for my $ext (@DB_EXTENSIONS) {
- my $newf = $tmpdbname.$ext;
- my $oldf = $path.'_toks'.$ext;
- next unless (-f $newf);
- if (!rename ($newf, $oldf)) {
- warn "rename $newf to $oldf failed: $!\n";
- }
- }
- }
-
- # Call untie_db() so we unlock correctly.
- $self->untie_db();
+ my ($kept, $deleted, $num_hapaxes, $num_lowfreq) = $self->token_expiration($opts, $newdelta, @vars);

my $done = time();

@@ -742,6 +359,7 @@
my $msg2 = "$kept entries kept, $deleted deleted";

if ($opts->{verbose}) {
+
my $hapax_pc = ($num_hapaxes * 100) / $kept;
my $lowfreq_pc = ($num_lowfreq * 100) / $kept;
print "$msg\n$msg2\n";
@@ -751,53 +369,46 @@
dbg ("$msg: $msg2");
}

- 1;
+ return 1;
}

-###########################################################################
+=head2 sync_due

-# Is a journal sync due?
-sub journal_sync_due {
- my ($self) = @_;
+public instance (Boolean) sync_due ()

- return 0 if ( $self->{db_version} < DB_VERSION ); # don't bother doing old db versions
+Description:
+This methods determines if a sync is due.

- my $conf = $self->{bayes}->{main}->{conf};
- return 0 if ( $conf->{bayes_journal_max_size} == 0 );
+=cut

- my @magic = $self->get_magic_tokens();
- dbg("Bayes DB journal sync: last sync: ".$magic[7],'bayes','-1');
+sub sync_due {
+ my ($self) = @_;
+ die "sync_due: not implemented\n";
+}

- ## Ok, should we do a sync?
+=head2 expiry_due

- # Not if the journal file doesn't exist, it's not a file, or it's 0 bytes long.
- return 0 unless (stat($self->get_journal_filename()) && -f _);
+public instance (Boolean) expiry_due ()

- # Yes if the file size is larger than the specified maximum size.
- return 1 if (-s _ > $conf->{bayes_journal_max_size});
+Description:
+This methods determines if an expire is due.

- # Yes if it's been at least a day since the last sync.
- return 1 if (time - $magic[7] > 86400);
+=cut

- # No, I guess not.
- return 0;
-}
-
-# Is an expiry run due to occur?
sub expiry_due {
my ($self) = @_;

$self->read_db_configs(); # make sure this has happened here

- # is the database too small for expiry? (Do *not* use "scalar keys",
- # as this will iterate through the entire db counting them!)
- my @magic = $self->get_magic_tokens();
- my $ntoks = $magic[3];
-
# If force expire was called, do the expire no matter what.
return 1 if ($self->{bayes}->{main}->{learn_force_expire});

- my $last_expire = time() - $magic[4];
+ # is the database too small for expiry? (Do *not* use "scalar keys",
+ # as this will iterate through the entire db counting them!)
+ my @vars = $self->get_storage_variables();
+ my $ntoks = $vars[3];
+
+ my $last_expire = time() - $vars[4];
if (!$self->{bayes}->{main}->{ignore_safety_expire_timeout}) {
# if we're not ignoring the safety timeout, don't run an expire more
# than once every 12 hours.
@@ -809,14 +420,14 @@
return 0 if ($last_expire < 300);
}

- dbg("Bayes DB expiry: Tokens in DB: $ntoks, Expiry max size: ".$self->{expiry_max_db_size}.", Oldest atime: ".$magic[5].", Newest atime: ".$magic[10].", Last expire: ".$magic[4].", Current time: ".time(),'bayes','-1');
+ dbg("Bayes DB expiry: Tokens in DB: $ntoks, Expiry max size: ".$self->{expiry_max_db_size}.", Oldest atime: ".$vars[5].", Newest atime: ".$vars[10].", Last expire: ".$vars[4].", Current time: ".time(),'bayes','-1');

my $conf = $self->{bayes}->{main}->{conf};
if ($ntoks <= 100000 || # keep at least 100k tokens
$conf->{bayes_auto_expire} == 0 || # config says don't expire
$self->{expiry_max_db_size} > $ntoks || # not enough tokens to cause an expire
- $magic[10]-$magic[5] < 43200 || # delta between oldest and newest < 12h
- $self->{db_version} < DB_VERSION # ignore old db formats
+ $vars[10]-$vars[5] < 43200 || # delta between oldest and newest < 12h
+ $self->{db_version} < $self->DB_VERSION # ignore old db formats
) {
return 0;
}
@@ -824,686 +435,315 @@
return 1;
}

-###########################################################################
-# db_seen reading APIs
+=head2 seen_get

-sub seen_get {
- my ($self, $msgid) = @_;
- $self->{db_seen}->{$msgid};
-}
+public instance (Char) seen_get (String $msgid)

-sub seen_put {
- my ($self, $msgid, $seen) = @_;
+Description:
+This method retrieves the stored value, if any, for C<$msgid>. The return
+value is the stored string ('s' for spam and 'h' for ham) or undef if
+C<$msgid> is not found.

- if ($self->{bayes}->{main}->{learn_to_journal}) {
- $self->defer_update ("m $seen $msgid");
- }
- else {
- $self->{db_seen}->{$msgid} = $seen;
- }
-}
+=cut

-sub seen_delete {
+sub seen_get {
my ($self, $msgid) = @_;
-
- if ($self->{bayes}->{main}->{learn_to_journal}) {
- $self->defer_update ("m f $msgid");
- }
- else {
- delete $self->{db_seen}->{$msgid};
- }
+ die "seen_get: not implemented\n";
}

-###########################################################################
-# db reading APIs
+=head2 seen_put

-sub tok_get {
- my ($self, $tok) = @_;
- $self->tok_unpack ($self->{db_toks}->{$tok});
-}
-
-sub nspam_nham_get {
- my ($self) = @_;
- my @magic = $self->get_magic_tokens();
- ($magic[1], $magic[2]);
-}
+public instance (Boolean) seen_put (String $msgid, Char $flag)

-# return the magic tokens in a specific order:
-# 0: scan count base
-# 1: number of spam
-# 2: number of ham
-# 3: number of tokens in db
-# 4: last expire atime
-# 5: oldest token in db atime
-# 6: db version value
-# 7: last journal sync
-# 8: last atime delta
-# 9: last expire reduction count
-# 10: newest token in db atime
-#
-sub get_magic_tokens {
- my ($self) = @_;
- my @values;
+Description:
+This method records C<$msgid> as the type given by C<$flag>. C<$flag> is
+one of two values 's' for spam and 'h' for ham.

- my $db_ver = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN};
- if ( !$db_ver || $db_ver =~ /\D/ ) { $db_ver = 0; }
+=cut

- if ( $db_ver == 0 ) {
- my $DB0_NSPAM_MAGIC_TOKEN = '**NSPAM';
- my $DB0_NHAM_MAGIC_TOKEN = '**NHAM';
- my $DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
- my $DB0_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
- my $DB0_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
- my $DB0_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
-
- @values = (
- $self->{db_toks}->{$DB0_SCANCOUNT_BASE_MAGIC_TOKEN},
- $self->{db_toks}->{$DB0_NSPAM_MAGIC_TOKEN},
- $self->{db_toks}->{$DB0_NHAM_MAGIC_TOKEN},
- $self->{db_toks}->{$DB0_NTOKENS_MAGIC_TOKEN},
- $self->{db_toks}->{$DB0_LAST_EXPIRE_MAGIC_TOKEN},
- $self->{db_toks}->{$DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
- 0,
- 0,
- 0,
- 0,
- 0,
- );
- }
- elsif ( $db_ver == 1 ) {
- my $DB1_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
- my $DB1_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
- my $DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
- my $DB1_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
- my $DB1_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
- my $DB1_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE";
-
- @values = (
- $self->{db_toks}->{$DB1_SCANCOUNT_BASE_MAGIC_TOKEN},
- $self->{db_toks}->{$DB1_NSPAM_MAGIC_TOKEN},
- $self->{db_toks}->{$DB1_NHAM_MAGIC_TOKEN},
- $self->{db_toks}->{$DB1_NTOKENS_MAGIC_TOKEN},
- $self->{db_toks}->{$DB1_LAST_EXPIRE_MAGIC_TOKEN},
- $self->{db_toks}->{$DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
- 1,
- 0,
- 0,
- 0,
- 0,
- );
- }
- elsif ( $db_ver == 2 ) {
- my $DB2_LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
- my $DB2_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
- my $DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
- my $DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
- my $DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
- my $DB2_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
- my $DB2_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
- my $DB2_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
- my $DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
- my $DB2_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
-
- @values = (
- 0,
- $self->{db_toks}->{$DB2_NSPAM_MAGIC_TOKEN},
- $self->{db_toks}->{$DB2_NHAM_MAGIC_TOKEN},
- $self->{db_toks}->{$DB2_NTOKENS_MAGIC_TOKEN},
- $self->{db_toks}->{$DB2_LAST_EXPIRE_MAGIC_TOKEN},
- $self->{db_toks}->{$DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
- 2,
- $self->{db_toks}->{$DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN},
- $self->{db_toks}->{$DB2_LAST_ATIME_DELTA_MAGIC_TOKEN},
- $self->{db_toks}->{$DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN},
- $self->{db_toks}->{$DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN},
- );
- }
+sub seen_put {
+ my ($self, $msgid, $flag) = @_;
+ die "seen_put: not implemented\n";
+}

+=head2 seen_delete

- foreach ( @values ) {
- if ( !$_ || $_ =~ /\D/ ) { $_ = 0; }
- }
+public instance (Boolean) seen_delete (String $msgid)

- return @values;
-}
+Description:
+This method removes C<$msgid> from storage.

+=cut

-## Don't bother using get_magic_tokens here. This token should only
-## ever exist when we're running expire, so we don't want to convert it if
-## it's there and we're not expiring ...
-sub get_running_expire_tok {
- my ($self) = @_;
- my $running = $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
- if (!$running || $running =~ /\D/) { return undef; }
- return $running;
+sub seen_delete {
+ my ($self, $msgid) = @_;
+ die "seen_delete: not implemented\n";
}

-sub set_running_expire_tok {
- my ($self) = @_;
- $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN} = time();
-}
+=head2 get_storage_variables

-sub remove_running_expire_tok {
- my ($self) = @_;
- delete $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
-}
+public instance (@) get_storage_variables ()

-###########################################################################
+Description:
+This method retrieves the various administrative variables used by
+the Bayes storage implementation.

-# db abstraction: allow deferred writes, since we will be frequently
-# writing while checking.
+The values returned in the array are in the following order:

-sub tok_count_change {
- my ($self, $ds, $dh, $tok, $atime) = @_;
+0: scan count base

- $atime = 0 unless defined $atime;
+1: number of spam

- if ($self->{bayes}->{main}->{learn_to_journal}) {
- $self->defer_update ("c $ds $dh $atime $tok");
- } else {
- $self->tok_sync_counters ($ds, $dh, $atime, $tok);
- }
-}
-
-sub nspam_nham_change {
- my ($self, $ds, $dh) = @_;
+2: number of ham

- if ($self->{bayes}->{main}->{learn_to_journal}) {
- $self->defer_update ("n $ds $dh");
- } else {
- $self->tok_sync_nspam_nham ($ds, $dh);
- }
-}
+3: number of tokens in db

-sub tok_touch {
- my ($self, $tok, $atime) = @_;
- $self->defer_update ("t $atime $tok");
-}
+4: last expire atime

-sub defer_update {
- my ($self, $str) = @_;
- $self->{string_to_journal} .= "$str\n";
-}
+5: oldest token in db atime

-###########################################################################
+6: db version value

-sub add_touches_to_journal {
- my ($self) = @_;
+7: last journal sync

- my $nbytes = length ($self->{string_to_journal});
- return if ($nbytes == 0);
+8: last atime delta

- my $path = $self->get_journal_filename();
+9: last expire reduction count

- # use append mode, write atomically, then close, so simultaneous updates are
- # not lost
- my $conf = $self->{bayes}->{main}->{conf};
- my $umask = umask(0777 - (oct ($conf->{bayes_file_mode}) & 0666));
- if (!open (OUT, ">>".$path)) {
- warn "cannot write to $path, Bayes db update ignored\n";
- umask $umask; # reset umask
- return;
- }
+10: newest token in db atime

- # do not use print() here, it will break up the buffer if it's >8192 bytes,
- # which could result in two sets of tokens getting mixed up and their
- # touches missed.
- my $writ = 0;
- while ($writ < $nbytes) {
- my $len = syswrite (OUT, $self->{string_to_journal}, $nbytes-$writ);
-
- if (!defined $len || $len < 0) {
- # argh, write failure, give up
- $len = 0 unless ( defined $len );
- warn "write failed to Bayes journal $path ($len of $nbytes)!\n";
- last;
- }
+=cut

- $writ += $len;
- if ($len < $nbytes) {
- # this should not happen on filesystem writes! Still, try to recover
- # anyway, but be noisy about it so the admin knows
- warn "partial write to Bayes journal $path ($len of $nbytes), recovering.\n";
- $self->{string_to_journal} = substr ($self->{string_to_journal}, $len);
- }
- }
+sub get_storage_variables {
+ my ($self) = @_;
+ die "get_storage_variables: not implemented\n";
+}

- if (!close OUT) {
- warn "cannot write to $path, Bayes db update ignored\n";
- }
- umask $umask; # reset umask
+=head2 dump_db_toks

- $self->{string_to_journal} = '';
-}
+public instance () dump_db_toks (String $template, String $regex, @ @vars)

-# Return a qr'd RE to match a token with the correct format's magic token
-sub get_magic_re {
- my ($self, $db_ver) = @_;
+Description:
+This method loops over all tokens, computing the probability for the token
+and then printing it out according to the passed in template.

- if ( $db_ver >= 1 ) {
- return qr/^\015\001\007\011\003/;
- }
+=cut

- # When in doubt, assume v0
- return qr/^\*\*[A-Z]+$/;
+sub dump_db_toks {
+ my ($self, $template, $regex, @vars) = @_;
+ die "dump_db_toks: not implemented\n";
}

-###########################################################################
-# And this method reads the journal and applies the changes in one
-# (locked) transaction.
+=head2 set_last_expire

-sub sync_journal {
- my ($self, $opts) = @_;
- my $ret = 0;
+public instance (Boolean) _set_last_expire (Integer $time)

- my $path = $self->get_journal_filename();
+Description:
+This method sets the last expire time.

- # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
- if ( !stat($path) || !-f _ || -z _ ) { return 0; }
+=cut

- eval {
- local $SIG{'__DIE__'}; # do not run user die() traps in here
- if ($self->tie_db_writable()) {
- $ret = $self->sync_journal_trapped($opts, $path);
- }
- };
- my $err = $@;
-
- # ok, untie from write-mode if we can
- if (!$self->{bayes}->{main}->{learn_caller_will_untie}) {
- $self->untie_db();
- }
-
- # handle any errors that may have occurred
- if ($err) {
- warn "bayes: $err\n";
- return 0;
- }
-
- $ret;
+sub set_last_expire {
+ my ($self, $time) = @_;
+ die "set_last_expire: not implemented\n";
}

-sub sync_journal_trapped {
- my ($self, $opts, $path) = @_;
+=head2 get_running_expire_tok

- # Flag that we're doing work
- $self->set_running_expire_tok();
+public instance (Time) get_running_expire_tok ()

- my $started = time();
- my $count = 0;
- my $total_count = 0;
- my %tokens = ();
- my $showdots = $opts->{showdots};
- my $retirepath = $path.".old";
-
- # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
- # we have to check again since the file may have been removed by a recent bayes db upgrade ...
- if ( !stat($path) || !-f _ || -z _ ) { return 0; }
+Description:
+This method determines if an expire is currently running and returns the time
+the expire started.

- if (!-r $path) { # will we be able to read the file?
- warn "bayes: bad permissions on journal, can't read: $path\n";
- return 0;
- }
+=cut

- # This is the critical phase (moving files around), so don't allow
- # it to be interrupted.
- {
- local $SIG{'INT'} = 'IGNORE';
- local $SIG{'HUP'} = 'IGNORE';
- local $SIG{'TERM'} = 'IGNORE';
-
- # retire the journal, so we can update the db files from it in peace.
- # TODO: use locking here
- if (!rename ($path, $retirepath)) {
- warn "bayes: failed rename $path to $retirepath\n";
- return 0;
- }
+sub get_running_expire_tok {
+ my ($self) = @_;
+ die "get_running_expire_tok: not implemented\n";
+}

- # now read the retired journal
- if (!open (JOURNAL, "<$retirepath")) {
- warn "bayes: cannot open read $retirepath\n";
- return 0;
- }
+=head2 set_running_expire_tok

+public instance (Time) set_running_expire_tok ()

- # Read the journal
- while (<JOURNAL>) {
- $total_count++;
-
- if (/^t (\d+) (.*)$/) { # Token timestamp update, cache resultant entries
- $tokens{$2} = $1+0 if ( !exists $tokens{$2} || $1+0 > $tokens{$2} );
- } elsif (/^c (-?\d+) (-?\d+) (\d+) (.*)$/) { # Add/full token update
- $self->tok_sync_counters ($1+0, $2+0, $3+0, $4);
- $count++;
- } elsif (/^n (-?\d+) (-?\d+)$/) { # update ham/spam count
- $self->tok_sync_nspam_nham ($1+0, $2+0);
- $count++;
- } elsif (/^m ([hsf]) (.+)$/) { # update msgid seen database
- if ( $1 eq "f" ) {
- $self->seen_delete($2);
- }
- else {
- $self->seen_put($2,$1);
- }
- $count++;
- } else {
- warn "Bayes journal: gibberish entry found: $_";
- }
- }
- close JOURNAL;
+Description:
+This method sets the running expire time to the current time.

- # Now that we've determined what tokens we need to update and their
- # final values, update the DB. Should be much smaller than the full
- # journal entries.
- while( my($k,$v) = each %tokens ) {
- $self->tok_touch_token ($v, $k);
-
- if ((++$count % 1000) == 0) {
- if ($showdots) { print STDERR "."; }
- $self->set_running_expire_tok();
- }
- }
+=cut

- if ($showdots) { print STDERR "\n"; }
+sub set_running_expire_tok {
+ my ($self) = @_;
+ die "set_running_expire_tok: not implemented\n";
+}

- # we're all done, so unlink the old journal file
- unlink ($retirepath) || warn "bayes: can't unlink $retirepath: $!\n";
+=head2 remove_running_expire_tok

- $self->{db_toks}->{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $started;
+public instance (Boolean) remove_running_expire_tok ()

- my $done = time();
- my $msg = ("synced Bayes databases from journal in ".($done - $started).
- " seconds: $count unique entries ($total_count total entries)");
+Description:
+This method removes a currently set running expire time.

- if ($opts->{verbose}) {
- print $msg,"\n";
- } else {
- dbg ($msg);
- }
- }
+=cut

- # else, that's the lot, we're synced. return
- 1;
+sub remove_running_expire_tok {
+ my ($self) = @_;
+ die "remove_running_expire_tok: not implemented\n";
}

-sub tok_touch_token {
- my ($self, $atime, $tok) = @_;
- my ($ts, $th, $oldatime) = $self->tok_get ($tok);
+=head2 tok_get
+
+public instance (Integer, Integer, Time) tok_get (String $token)
+
+Description:
+This method retrieves the specified token (C<$token>) from storage and returns
+it's spam count, ham acount and last access time.

- # If the new atime is < the old atime, ignore the update
- # We figure that we'll never want to lower a token atime, so abort if
- # we try. (journal out of sync, etc.)
- return if ( $oldatime >= $atime );
+=cut

- $self->tok_put ($tok, $ts, $th, $atime);
+sub tok_get {
+ my ($self, $token) = @_;
+ die "tok_get: not implemented\n";
}

-sub tok_sync_counters {
- my ($self, $ds, $dh, $atime, $tok) = @_;
- my ($ts, $th, $oldatime) = $self->tok_get ($tok);
- $ts += $ds; if ($ts < 0) { $ts = 0; }
- $th += $dh; if ($th < 0) { $th = 0; }
+=head2 tok_count_change

- # Don't roll the atime of tokens backwards ...
- $atime = $oldatime if ( $oldatime > $atime );
+public instance (Boolean) tok_count_change (Integer $spam_count,
+ Integer $ham_count,
+ String $token,
+ Time $atime)

- $self->tok_put ($tok, $ts, $th, $atime);
-}
+Description:
+This method takes a C<$spam_count> and C<$ham_count> and adds it to
+C<$token> along with updating C<$token>s atime with C<$atime>.

-sub tok_put {
- my ($self, $tok, $ts, $th, $atime) = @_;
- $ts ||= 0;
- $th ||= 0;
+=cut

- if ( $tok =~ /^\015\001\007\011\003/ ) { # magic token? Ignore it!
- return;
- }
+sub tok_count_change {
+ my ($self, $spam_count, $ham_count, $token, $atime) = @_;
+ die "tok_count_change: not implemented\n";
+}

- # use defined() rather than exists(); the latter is not supported
- # by NDBM_File, believe it or not. Using defined() did not
- # indicate any noticeable speed hit in my testing. (Mar 31 2003 jm)
- my $exists_already = defined $self->{db_toks}->{$tok};
-
- if ($ts == 0 && $th == 0) {
- return if (!$exists_already); # If the token doesn't exist, just return
- $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}--;
- delete $self->{db_toks}->{$tok};
- } else {
- if (!$exists_already) { # If the token doesn't exist, raise the token count
- $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}++;
- }
+=head2 nspam_nham_get

- $self->{db_toks}->{$tok} = $self->tok_pack ($ts, $th, $atime);
+public instance (Integer, Integer) nspam_nham_get ()

- my $newmagic = $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN};
- if (!defined ($newmagic) || $atime > $newmagic) {
- $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
- }
+Description:
+This method retrieves the total number of spam and the total number of spam
+currently under storage.

- # Make sure to check for either !defined or "" ... Apparently
- # sometimes the DB module doesn't return the value correctly. :(
- my $oldmagic = $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN};
- if (!defined ($oldmagic) || $oldmagic eq "" || $atime < $oldmagic) {
- $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
- }
- }
-}
+=cut

-sub tok_sync_nspam_nham {
- my ($self, $ds, $dh) = @_;
- my ($ns, $nh) = ($self->get_magic_tokens())[1,2];
- if ($ds) { $ns += $ds; } if ($ns < 0) { $ns = 0; }
- if ($dh) { $nh += $dh; } if ($nh < 0) { $nh = 0; }
- $self->{db_toks}->{$NSPAM_MAGIC_TOKEN} = $ns;
- $self->{db_toks}->{$NHAM_MAGIC_TOKEN} = $nh;
+sub nspam_nham_get {
+ my ($self) = @_;
+ die "nspam_nham_get: not implemented\n";
}

-###########################################################################
+=head2 nspam_nham_change

-sub get_journal_filename {
- my ($self) = @_;
+public instance (Boolean) nspam_nham_change (Integer $num_spam,
+ Integer $num_ham)

- if (defined $self->{journal_live_path}) {
- return $self->{journal_live_path};
- }
+Description:
+This method updates the number of spam and the number of ham in the database.

- my $main = $self->{bayes}->{main};
- my $fname = $main->sed_path ($main->{conf}->{bayes_path}."_journal");
+=cut

- $self->{journal_live_path} = $fname;
- return $self->{journal_live_path};
+sub nspam_nham_change {
+ my ($self, $num_spam, $num_ham) = @_;
+ die "nspam_nham_change: not implemented\n";
}

-###########################################################################
+=head2 tok_touch

-sub scan_count_get {
- my ($self) = @_;
+public instance (Boolean) tok_touch (String $token,
+ Time $atime)

- if ( $self->{db_version} < 2 ) {
- my ($count) = $self->get_magic_tokens();
- my $path = $self->{scan_count_little_file};
- $count += (defined $path && -e $path ? -s _ : 0);
- return $count;
- }
+Description:
+This method updates the given tokens (C<$token>) access time.

- 0;
+=cut
+
+sub tok_touch {
+ my ($self, $token, $atime) = @_;
+ die "tok_touch: not implemanted\n";
}

-###########################################################################
+=head2 cleanup

-# this is called directly from sa-learn(1).
-sub upgrade_old_dbm_files {
- my ($self, $opts) = @_;
- my $ret = 0;
+public instance (Boolean) cleanup ()

- eval {
- local $SIG{'__DIE__'}; # do not run user die() traps in here
+Description:
+This method performs any cleanup necessary before moving onto the next
+operation.

- use File::Basename;
- use File::Copy;
+=cut

- # bayes directory
- my $main = $self->{bayes}->{main};
- my $path = $main->sed_path($main->{conf}->{bayes_path});
- my $dir = dirname($path);
-
- # make temporary copy since old dbm and new dbm may have same name
- opendir(DIR, $dir) || die "can't opendir $dir: $!";
- my @files = grep { /^bayes_(?:seen|toks)(?:\.\w+)?$/ } readdir(DIR);
- closedir(DIR);
- if (@files < 2 || !grep(/bayes_seen/,@files) || !grep(/bayes_toks/,@files))
- {
- die "unable to find bayes_toks and bayes_seen, stopping\n";
- }
- # untaint @files (already safe after grep)
- @files = map { /(.*)/, $1 } @files;
+sub cleanup {
+ my ($self) = @_;
+ die "touches_cleanup: not implemented\n";
+}

- for (@files) {
- my $src = "$dir/$_";
- my $dst = "$dir/old_$_";
- copy($src, $dst) || die "can't copy $src to $dst: $!\n";
- }
+=head2 is_magic_token

- # delete previous to make way for import
- for (@files) { unlink("$dir/$_"); }
+public instance (Boolean) is_magic_token (string $token)

- # import
- if ($self->tie_db_writable()) {
- $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_seen",
- $self->{db_seen});
- $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_toks",
- $self->{db_toks});
- }
+Description:
+This method determines if a given token is "magic" or special to the
+implementation.

- if ($ret == 2) {
- print "import successful, original files saved with \"old\" prefix\n";
- }
- else {
- print "import failed, original files saved with \"old\" prefix\n";
- }
- };
- my $err = $@;
-
- $self->untie_db();
+=cut

- # if we died, untie the dbm files
- if ($err) {
- warn "bayes upgrade_old_dbm_files: $err\n";
- return 0;
- }
- $ret;
+sub is_magic_token {
+ my ($self, $token) = @_;
+ die "is_magic_token: not implemented\n";
}

-sub upgrade_old_dbm_files_trapped {
- my ($self, $filename, $output) = @_;
+=head2 sync

- my $count;
- my %in;
+public instance (Boolean) sync (\% $opts)

- print "upgrading to DB_File, please be patient: $filename\n";
+Description:
+This method performs a sync of the database.

- # try each type of file until we find one with > 0 entries
- for my $dbm ('DB_File', 'GDBM_File', 'NDBM_File', 'SDBM_File') {
- $count = 0;
- # wrap in eval so it doesn't run in general use. This accesses db
- # modules directly.
- # Note: (bug 2390), the 'use' needs to be on the same line as the eval
- # for RPM dependency checks to work properly. It's lame, but...
- eval 'use ' . $dbm . ';
- tie %in, "' . $dbm . '", $filename, O_RDONLY, 0600;
- %{ $output } = %in;
- $count = scalar keys %{ $output };
- untie %in;
- ';
- if ($@) {
- print "$dbm: $dbm module not installed, nothing copied.\n";
- dbg("error was: $@");
- }
- elsif ($count == 0) {
- print "$dbm: no database of that kind found, nothing copied.\n";
- }
- else {
- print "$dbm: copied $count entries.\n";
- return 1;
- }
- }
+=cut

- return 0;
+sub sync {
+ my ($self, $opts) = @_;
+ die "sync: not implemented\n";
}

-###########################################################################
+=head2 scan_count_get

-# token marshalling format for db_toks.
+public instance (Integer) scan_count_get ()

-# Since we may have many entries with few hits, especially thousands of hapaxes
-# (1-occurrence entries), use a flexible entry format, instead of simply "2
-# packed ints", to keep the memory and disk space usage down. In my
-# 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we
-# can use a 1-byte representation for the other 91% of low-hitting entries
-# and save masses of space.
+Description:
+This method gets the current scan count, if used by the implementation.

-# This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3
-# ham-count bits). If XX in the first byte is 11, it's packed as this 1-byte
-# representation; otherwise, if XX in the first byte is 00, it's packed as
-# "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format.
+=cut

-# Savings: roughly halves size of toks db, at the cost of a ~10% slowdown.
+sub scan_count_get {
+ my ($self) = @_;
+ die "scan_count_get: not implemented\n";
+}

-use constant FORMAT_FLAG => 0xc0; # 11000000
-use constant ONE_BYTE_FORMAT => 0xc0; # 11000000
-use constant TWO_LONGS_FORMAT => 0x00; # 00000000
+=head2 perform_upgrade

-use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000
-use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111
+public instance (Boolean) perform_upgrade (\% $opts)

-sub tok_unpack {
- my ($self, $value) = @_;
- $value ||= 0;
+Description:
+This method is a utility method that performs any necessary upgrades
+between versions. It should know how to handle previous versions and
+what needs to happen to upgrade them.

- my ($packed, $atime);
- if ( $self->{db_version} == 0 ) {
- ($packed, $atime) = unpack("CS", $value);
- }
- elsif ( $self->{db_version} == 1 || $self->{db_version} == 2 ) {
- ($packed, $atime) = unpack("CV", $value);
- }
+A true return value indicates success.

- if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
- return (($packed & ONE_BYTE_SSS_BITS) >> 3,
- $packed & ONE_BYTE_HHH_BITS,
- $atime || 0);
- }
- elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
- my ($packed, $ts, $th, $atime);
- if ( $self->{db_version} == 0 ) {
- ($packed, $ts, $th, $atime) = unpack("CLLS", $value);
- }
- elsif ( $self->{db_version} == 1 ) {
- ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
- }
- elsif ( $self->{db_version} == 2 ) {
- ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
- }
- return ($ts || 0, $th || 0, $atime || 0);
- }
- # other formats would go here...
- else {
- warn "unknown packing format for Bayes db, please re-learn: $packed";
- return (0, 0, 0);
- }
-}
+=cut

-sub tok_pack {
- my ($self, $ts, $th, $atime) = @_;
- $ts ||= 0; $th ||= 0; $atime ||= 0;
- if ($ts < 8 && $th < 8) {
- return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime);
- } else {
- return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime);
- }
+sub perform_upgrade {
+ my ($self, $opts) = @_;
+ die "perform_upgrade: not implemented\n";
}
-
-###########################################################################

sub dbg { Mail::SpamAssassin::dbg (@_); }
sub sa_die { Mail::SpamAssassin::sa_die (@_); }

Added: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreDBM.pm
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreDBM.pm Thu Jan 29 18:54:33 2004
@@ -0,0 +1,1385 @@
+# <@LICENSE>
+# ====================================================================
+# The Apache Software License, Version 1.1
+#
+# Copyright (c) 2000 The Apache Software Foundation. All rights
+# reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. The end-user documentation included with the redistribution,
+# if any, must include the following acknowledgment:
+# "This product includes software developed by the
+# Apache Software Foundation (http://www.apache.org/)."
+# Alternately, this acknowledgment may appear in the software itself,
+# if and wherever such third-party acknowledgments normally appear.
+#
+# 4. The names "Apache" and "Apache Software Foundation" must
+# not be used to endorse or promote products derived from this
+# software without prior written permission. For written
+# permission, please contact apache@apache.org.
+#
+# 5. Products derived from this software may not be called "Apache",
+# nor may "Apache" appear in their name, without prior written
+# permission of the Apache Software Foundation.
+#
+# THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+# ====================================================================
+#
+# This software consists of voluntary contributions made by many
+# individuals on behalf of the Apache Software Foundation. For more
+# information on the Apache Software Foundation, please see
+# <http://www.apache.org/>.
+#
+# Portions of this software are based upon public domain software
+# originally written at the National Center for Supercomputing Applications,
+# University of Illinois, Urbana-Champaign.
+# </@LICENSE>
+
+package Mail::SpamAssassin::BayesStoreDBM;
+
+use strict;
+use bytes;
+use Fcntl;
+
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::Util;
+use Mail::SpamAssassin::BayesStore;
+use File::Basename;
+use File::Spec;
+use File::Path;
+
+use constant HAS_DB_FILE => eval { require DB_File; };
+
+use vars qw{
+ @ISA
+ @DBNAMES @DB_EXTENSIONS
+ $NSPAM_MAGIC_TOKEN $NHAM_MAGIC_TOKEN $LAST_EXPIRE_MAGIC_TOKEN $LAST_JOURNAL_SYNC_MAGIC_TOKEN
+ $NTOKENS_MAGIC_TOKEN $OLDEST_TOKEN_AGE_MAGIC_TOKEN $LAST_EXPIRE_REDUCE_MAGIC_TOKEN
+ $RUNNING_EXPIRE_MAGIC_TOKEN $DB_VERSION_MAGIC_TOKEN $LAST_ATIME_DELTA_MAGIC_TOKEN
+ $NEWEST_TOKEN_AGE_MAGIC_TOKEN
+};
+
+@ISA = qw( Mail::SpamAssassin::BayesStore );
+
+# db layout (quoting Matt):
+#
+# > need five db files though to make it real fast:
+# [probs] 1. ngood and nbad (two entries, so could be a flat file rather
+# than a db file). (now 2 entries in db_toks)
+# [toks] 2. good token -> number seen
+# [toks] 3. bad token -> number seen (both are packed into 1 entry in 1 db)
+# [probs] 4. Consolidated good token -> probability
+# [probs] 5. Consolidated bad token -> probability
+# > As you add new mails, you update the entry in 2 or 3, then regenerate
+# > the entry for that token in 4 or 5.
+# > Then as you test a new mail, you just need to pull the probability
+# > direct from 4 and 5, and generate the overall probability. A simple and
+# > very fast operation.
+#
+# jm: we use probs as overall probability. <0.5 = ham, >0.5 = spam
+#
+# update: probs is no longer maintained as a db, to keep on-disk and in-core
+# usage down.
+#
+# also, added a new one to support forgetting, auto-learning, and
+# auto-forgetting for refiled mails:
+# [seen] 6. a list of Message-IDs of messages already learnt from. values
+# are 's' for learnt-as-spam, 'h' for learnt-as-ham.
+#
+# and another, called [scancount] to model the scan-count for expiry.
+# This is not a database. Instead it increases by one byte for each
+# message scanned (note: scanned, not learned).
+
+@DBNAMES = qw(toks seen);
+
+# Possible file extensions used by the kinds of database files DB_File
+# might create. We need these so we can create a new file and rename
+# it into place.
+@DB_EXTENSIONS = ('', '.db');
+
+# These are the magic tokens we use to track stuff in the DB.
+# The format is '^M^A^G^I^C' followed by any string you want.
+# None of the control chars will be in a real token.
+$DB_VERSION_MAGIC_TOKEN = "\015\001\007\011\003DBVERSION";
+$LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
+$LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
+$LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
+$LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
+$NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
+$NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
+$NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
+$NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
+$OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
+$RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
+
+###########################################################################
+
+sub new {
+ my $class = shift;
+ $class = ref($class) || $class;
+
+ my $self = $class->SUPER::new(@_);
+
+ $self->{supported_db_version} = 2;
+
+ $self->{already_ties} = 0;
+ $self->{is_locked} = 0;
+ $self->{string_to_journal} = '';
+
+ $self;
+}
+
+###########################################################################
+
+sub tie_db_readonly {
+ my ($self) = @_;
+
+ if (!HAS_DB_FILE) {
+ dbg ("bayes: DB_File module not installed, cannot use Bayes");
+ return 0;
+ }
+
+ # return if we've already tied to the db's, using the same mode
+ # (locked/unlocked) as before.
+ return 1 if ($self->{already_tied} && $self->{is_locked} == 0);
+
+ my $main = $self->{bayes}->{main};
+ if (!defined($main->{conf}->{bayes_path})) {
+ dbg ("bayes_path not defined");
+ return 0;
+ }
+
+ $self->read_db_configs();
+
+ my $path = $main->sed_path ($main->{conf}->{bayes_path});
+
+ my $found=0;
+ for my $ext (@DB_EXTENSIONS) { if (-f $path.'_toks'.$ext) { $found=1; last; } }
+
+ if (!$found) {
+ dbg ("bayes: no dbs present, cannot scan: ${path}_toks");
+ return 0;
+ }
+
+ foreach my $dbname (@DBNAMES) {
+ my $name = $path.'_'.$dbname;
+ my $db_var = 'db_'.$dbname;
+ dbg("bayes: $$ tie-ing to DB file R/O $name");
+ # untie %{$self->{$db_var}} if (tied %{$self->{$db_var}});
+ tie %{$self->{$db_var}},"DB_File",$name, O_RDONLY,
+ (oct ($main->{conf}->{bayes_file_mode}) & 0666)
+ or goto failed_to_tie;
+ }
+
+ $self->{db_version} = ($self->get_storage_variables())[6];
+ dbg("bayes: found bayes db version ".$self->{db_version});
+
+ # If the DB version is one we don't understand, abort!
+ if ( $self->_check_db_version() ) {
+ dbg("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
+ $self->untie_db();
+ return 0;
+ }
+
+ if ( $self->{db_version} < 2 ) { # older versions use scancount
+ $self->{scan_count_little_file} = $path.'_msgcount';
+ }
+
+ $self->{already_tied} = 1;
+ return 1;
+
+failed_to_tie:
+ warn "Cannot open bayes databases ${path}_* R/O: tie failed: $!\n";
+ return 0;
+}
+
+# tie() to the databases, read-write and locked. Any callers of
+# this should ensure they call untie_db() afterwards!
+#
+sub tie_db_writable {
+ my ($self) = @_;
+
+ if (!HAS_DB_FILE) {
+ dbg ("bayes: DB_File module not installed, cannot use Bayes");
+ return 0;
+ }
+
+ # return if we've already tied to the db's, using the same mode
+ # (locked/unlocked) as before.
+ return 1 if ($self->{already_tied} && $self->{is_locked} == 1);
+
+ my $main = $self->{bayes}->{main};
+ if (!defined($main->{conf}->{bayes_path})) {
+ dbg ("bayes_path not defined");
+ return 0;
+ }
+
+ $self->read_db_configs();
+
+ my $path = $main->sed_path ($main->{conf}->{bayes_path});
+
+ my $found=0;
+ for my $ext (@DB_EXTENSIONS) { if (-f $path.'_toks'.$ext) { $found=1; last; } }
+
+ my $parentdir = dirname ($path);
+ if (!-d $parentdir) {
+ # run in an eval(); if mkpath has no perms, it calls die()
+ eval {
+ mkpath ($parentdir, 0, (oct ($main->{conf}->{bayes_file_mode}) & 0777));
+ };
+ }
+
+ my $tout;
+ if ($main->{learn_wait_for_lock}) {
+ $tout = 300; # TODO: Dan to write better lock code
+ } else {
+ $tout = 10;
+ }
+ if ($main->{locker}->safe_lock ($path, $tout)) {
+ $self->{locked_file} = $path;
+ $self->{is_locked} = 1;
+ } else {
+ warn "Cannot open bayes databases ${path}_* R/W: lock failed: $!\n";
+ return 0;
+ }
+
+ my $umask = umask 0;
+ foreach my $dbname (@DBNAMES) {
+ my $name = $path.'_'.$dbname;
+ my $db_var = 'db_'.$dbname;
+ dbg("bayes: $$ tie-ing to DB file R/W $name");
+ tie %{$self->{$db_var}},"DB_File",$name, O_RDWR|O_CREAT,
+ (oct ($main->{conf}->{bayes_file_mode}) & 0666)
+ or goto failed_to_tie;
+ }
+ umask $umask;
+
+ # set our cache to what version DB we're using
+ $self->{db_version} = ($self->get_storage_variables())[6];
+ dbg("bayes: found bayes db version ".$self->{db_version});
+
+ # figure out if we can read the current DB and if we need to do a
+ # DB version update and do it if necessary if either has a problem,
+ # fail immediately
+ #
+ if ( $found && $self->_upgrade_db() ) {
+ $self->untie_db();
+ return 0;
+ }
+ elsif ( !$found ) { # new DB, make sure we know that ...
+ $self->{db_version} = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION;
+ $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN} = 0; # no tokens in the db ...
+ dbg("bayes: new db, set db version ".$self->{db_version}." and 0 tokens");
+ }
+
+ $self->{already_tied} = 1;
+ return 1;
+
+failed_to_tie:
+ my $err = $!;
+ umask $umask;
+ if ($self->{is_locked}) {
+ $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file});
+ $self->{is_locked} = 0;
+ }
+ warn "Cannot open bayes databases ${path}_* R/W: tie failed: $err\n";
+ return 0;
+}
+
+# Do we understand how to deal with this DB version?
+sub _check_db_version {
+ my ($self) = @_;
+ my $db_ver = ($self->get_storage_variables())[6];
+
+ if ( $db_ver > $self->DB_VERSION ) { # current DB is newer, ignore the DB!
+ warn "bayes: Found DB Version $db_ver, but can only handle up to version ".$self->DB_VERSION."\n";
+ return 1;
+ }
+
+ return 0;
+}
+
+# Check to see if we need to upgrade the DB, and do so if necessary
+sub _upgrade_db {
+ my ($self) = @_;
+
+ return 0 if ( $self->{db_version} == $self->DB_VERSION );
+ if ( $self->_check_db_version() ) {
+ dbg("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
+ return 1;
+ }
+
+ # If the current DB version is lower than the new version, upgrade!
+ # Do conversions in order so we can go 1 -> 3, make sure to update $self->{db_version}
+
+ dbg("bayes: detected bayes db format ".$self->{db_version}.", upgrading");
+
+ # since DB_File will not shrink a database (!!), we need to *create*
+ # a new one instead.
+ my $main = $self->{bayes}->{main};
+ my $path = $main->sed_path ($main->{conf}->{bayes_path});
+ my $name = $path.'_toks';
+
+ # older version's journal files are likely not in the same format as the new ones, so remove it.
+ my $jpath = $self->_get_journal_filename();
+ if ( -f $jpath ) {
+ dbg("bayes: old journal file found, removing.");
+ warn "Couldn't remove $jpath: $!" if ( !unlink $jpath );
+ }
+
+ if ( $self->{db_version} < 2 ) {
+ dbg ("bayes: upgrading database format from v".$self->{db_version}." to v2");
+
+ my($DB_NSPAM_MAGIC_TOKEN, $DB_NHAM_MAGIC_TOKEN, $DB_NTOKENS_MAGIC_TOKEN);
+ my($DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN, $DB_LAST_EXPIRE_MAGIC_TOKEN);
+
+ # Magic tokens for version 0, defined as '**[A-Z]+'
+ if ( $self->{db_version} == 0 ) {
+ $DB_NSPAM_MAGIC_TOKEN = '**NSPAM';
+ $DB_NHAM_MAGIC_TOKEN = '**NHAM';
+ $DB_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
+ #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
+ #$DB_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
+ #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
+ #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = '**RUNNINGEXPIRE';
+ }
+ else {
+ $DB_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
+ $DB_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
+ $DB_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
+ #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
+ #$DB_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
+ #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE";
+ #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
+ }
+
+ # remember when we started ...
+ my $started = time;
+ my $newatime = $started;
+
+ # use O_EXCL to avoid races (bonus paranoia, since we should be locked
+ # anyway)
+ my %new_toks;
+ my $umask = umask 0;
+ tie %new_toks, "DB_File", "${name}.new", O_RDWR|O_CREAT|O_EXCL,
+ (oct ($main->{conf}->{bayes_file_mode}) & 0666) or return 1;
+ umask $umask;
+
+ # add the magic tokens to the new db.
+ $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
+ $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
+ $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
+ $new_toks{$DB_VERSION_MAGIC_TOKEN} = 2; # we're now a DB version 2 file
+ $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
+ $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $newatime;
+ $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
+ $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $newatime;
+ $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
+ $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
+
+ my $magic_re = $self->get_magic_re($self->{db_version});
+
+ # deal with the data tokens
+ my ($tok, $packed);
+ while (($tok, $packed) = each %{$self->{db_toks}}) {
+ next if ($tok =~ /$magic_re/); # skip magic tokens
+
+ my ($ts, $th, $atime) = $self->tok_unpack ($packed);
+ $new_toks{$tok} = $self->tok_pack ($ts, $th, $newatime);
+ }
+
+
+ # now untie so we can do renames
+ untie %{$self->{db_toks}};
+ untie %new_toks;
+
+ # This is the critical phase (moving files around), so don't allow
+ # it to be interrupted.
+ local $SIG{'INT'} = 'IGNORE';
+ local $SIG{'HUP'} = 'IGNORE';
+ local $SIG{'TERM'} = 'IGNORE';
+
+ # older versions used scancount, so kill the stupid little file ...
+ my $msgc = $path.'_msgcount';
+ if ( -f $msgc ) {
+ dbg("bayes: old msgcount file found, removing.");
+ if ( !unlink $msgc ) {
+ warn "Couldn't remove $msgc: $!";
+ }
+ }
+
+ # now rename in the new one. Try several extensions
+ for my $ext (@DB_EXTENSIONS) {
+ my $newf = $name.'.new'.$ext;
+ my $oldf = $name.$ext;
+ next unless (-f $newf);
+ if (!rename ($newf, $oldf)) {
+ warn "rename $newf to $oldf failed: $!\n";
+ return 1;
+ }
+ }
+
+ # re-tie to the new db in read-write mode ...
+ tie %{$self->{db_toks}},"DB_File", $name, O_RDWR|O_CREAT,
+ (oct ($main->{conf}->{bayes_file_mode}) & 0666) or return 1;
+
+ dbg ("bayes: upgraded database format from v".$self->{db_version}." to v2 in ".(time - $started)." seconds");
+ $self->{db_version} = 2; # need this for other functions which check
+ }
+
+ # if ( $self->{db_version} == 2 ) {
+ # ...
+ # $self->{db_version} = 3; # need this for other functions which check
+ # }
+ # ... and so on.
+
+ return 0;
+}
+
+###########################################################################
+
+sub untie_db {
+ my $self = shift;
+ dbg("bayes: $$ untie-ing");
+
+ foreach my $dbname (@DBNAMES) {
+ my $db_var = 'db_'.$dbname;
+
+ if (exists $self->{$db_var}) {
+ dbg ("bayes: $$ untie-ing $db_var");
+ untie %{$self->{$db_var}};
+ delete $self->{$db_var};
+ }
+ }
+
+ if ($self->{is_locked}) {
+ dbg ("bayes: files locked, now unlocking lock");
+ $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file});
+ $self->{is_locked} = 0;
+ }
+
+ $self->{already_tied} = 0;
+ $self->{db_version} = undef;
+}
+
+###########################################################################
+
+sub calculate_expire_delta {
+ my ($self, $newest_atime, $start, $max_expire_mult) = @_;
+
+ my %delta = (); # use a hash since an array is going to be very sparse
+
+ my $magic_re = $self->get_magic_re($self->DB_VERSION);
+
+ # do the first pass, figure out atime delta
+ my ($tok, $packed);
+ while (($tok, $packed) = each %{$self->{db_toks}}) {
+ next if ($tok =~ /$magic_re/); # skip magic tokens
+
+ my ($ts, $th, $atime) = $self->tok_unpack ($packed);
+
+ # Go through from $start * 1 to $start * 512, mark how many tokens we would expire
+ my $token_age = $newest_atime - $atime;
+ for( my $i = 1; $i <= $max_expire_mult; $i<<=1 ) {
+ if ( $token_age >= $start * $i ) {
+ $delta{$i}++;
+ }
+ else {
+ # If the token age is less than the expire delta, it'll be
+ # less for all upcoming checks too, so abort early.
+ last;
+ }
+ }
+ }
+ return %delta;
+}
+
+###########################################################################
+
+sub token_expiration {
+ my ($self, $opts, $newdelta, @vars) = @_;
+
+ my $deleted = 0;
+ my $kept = 0;
+ my $num_hapaxes = 0;
+ my $num_lowfreq = 0;
+
+ # since DB_File will not shrink a database (!!), we need to *create*
+ # a new one instead.
+ my $main = $self->{bayes}->{main};
+ my $path = $main->sed_path ($main->{conf}->{bayes_path});
+
+ # use a temporary PID-based suffix just in case another one was
+ # created previously by an interrupted expire
+ my $tmpsuffix = "expire$$";
+ my $tmpdbname = $path.'_toks.'.$tmpsuffix;
+
+ # clean out any leftover db copies from previous runs
+ for my $ext (@DB_EXTENSIONS) { unlink ($tmpdbname.$ext); }
+
+ # use O_EXCL to avoid races (bonus paranoia, since we should be locked
+ # anyway)
+ my %new_toks;
+ my $umask = umask 0;
+ tie %new_toks, "DB_File", $tmpdbname, O_RDWR|O_CREAT|O_EXCL,
+ (oct ($main->{conf}->{bayes_file_mode}) & 0666);
+ umask $umask;
+ my $oldest;
+
+ my $showdots = $opts->{showdots};
+ if ($showdots) { print STDERR "\n"; }
+
+ # We've chosen a new atime delta if we've gotten here, so record it for posterity.
+ $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $newdelta;
+
+ # Figure out how old is too old...
+ my $too_old = $vars[10] - $newdelta; # tooold = newest - delta
+
+ my $magic_re = $self->get_magic_re($self->DB_VERSION);
+
+ # Go ahead and do the move to new db/expire run now ...
+ my ($tok, $packed);
+ while (($tok, $packed) = each %{$self->{db_toks}}) {
+ next if ($tok =~ /$magic_re/); # skip magic tokens
+
+ my ($ts, $th, $atime) = $self->tok_unpack ($packed);
+
+ if ($atime < $too_old) {
+ $deleted++;
+ } else {
+ $new_toks{$tok} = $self->tok_pack ($ts, $th, $atime); $kept++;
+ if (!defined($oldest) || $atime < $oldest) { $oldest = $atime; }
+ if ($ts + $th == 1) {
+ $num_hapaxes++;
+ } elsif ($ts < 8 && $th < 8) {
+ $num_lowfreq++;
+ }
+ }
+
+ if ((($kept + $deleted) % 1000) == 0) {
+ if ($showdots) { print STDERR "."; }
+ $self->set_running_expire_tok();
+ }
+ }
+
+ # and add the magic tokens. don't add the expire_running token.
+ $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION;
+
+ # We haven't changed messages of each type seen, so just copy over.
+ $new_toks{$NSPAM_MAGIC_TOKEN} = $vars[1];
+ $new_toks{$NHAM_MAGIC_TOKEN} = $vars[2];
+
+ # We magically haven't removed the newest token, so just copy that value over.
+ $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $vars[10];
+
+ # The rest of these have been modified, so replace as necessary.
+ $new_toks{$NTOKENS_MAGIC_TOKEN} = $kept;
+ $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+ $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest;
+ $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = $deleted;
+
+ # now untie so we can do renames
+ untie %{$self->{db_toks}};
+ untie %new_toks;
+
+ # This is the critical phase (moving files around), so don't allow
+ # it to be interrupted. Scope the signal changes.
+ {
+ local $SIG{'INT'} = 'IGNORE';
+ local $SIG{'HUP'} = 'IGNORE';
+ local $SIG{'TERM'} = 'IGNORE';
+
+ # now rename in the new one. Try several extensions
+ for my $ext (@DB_EXTENSIONS) {
+ my $newf = $tmpdbname.$ext;
+ my $oldf = $path.'_toks'.$ext;
+ next unless (-f $newf);
+ if (!rename ($newf, $oldf)) {
+ warn "rename $newf to $oldf failed: $!\n";
+ }
+ }
+ }
+
+ # Call untie_db() so we unlock correctly.
+ $self->untie_db();
+
+ return ($kept, $deleted, $num_hapaxes, $num_lowfreq);
+}
+
+###########################################################################
+
+# Is a sync due?
+sub sync_due {
+ my ($self) = @_;
+
+ return 0 if ( $self->{db_version} < $self->DB_VERSION ); # don't bother doing old db versions
+
+ my $conf = $self->{bayes}->{main}->{conf};
+ return 0 if ( $conf->{bayes_journal_max_size} == 0 );
+
+ my @vars = $self->get_storage_variables();
+ dbg("Bayes DB journal sync: last sync: ".$vars[7],'bayes','-1');
+
+ ## Ok, should we do a sync?
+
+ # Not if the journal file doesn't exist, it's not a file, or it's 0 bytes long.
+ return 0 unless (stat($self->_get_journal_filename()) && -f _);
+
+ # Yes if the file size is larger than the specified maximum size.
+ return 1 if (-s _ > $conf->{bayes_journal_max_size});
+
+ # Yes if it's been at least a day since the last sync.
+ return 1 if (time - $vars[7] > 86400);
+
+ # No, I guess not.
+ return 0;
+}
+
+###########################################################################
+# db_seen reading APIs
+
+sub seen_get {
+ my ($self, $msgid) = @_;
+ $self->{db_seen}->{$msgid};
+}
+
+sub seen_put {
+ my ($self, $msgid, $seen) = @_;
+
+ if ($self->{bayes}->{main}->{learn_to_journal}) {
+ $self->defer_update ("m $seen $msgid");
+ }
+ else {
+ $self->{db_seen}->{$msgid} = $seen;
+ }
+}
+
+sub seen_delete {
+ my ($self, $msgid) = @_;
+
+ if ($self->{bayes}->{main}->{learn_to_journal}) {
+ $self->defer_update ("m f $msgid");
+ }
+ else {
+ delete $self->{db_seen}->{$msgid};
+ }
+}
+
+###########################################################################
+# db reading APIs
+
+sub tok_get {
+ my ($self, $tok) = @_;
+ $self->tok_unpack ($self->{db_toks}->{$tok});
+}
+
+# return the magic tokens in a specific order:
+# 0: scan count base
+# 1: number of spam
+# 2: number of ham
+# 3: number of tokens in db
+# 4: last expire atime
+# 5: oldest token in db atime
+# 6: db version value
+# 7: last journal sync
+# 8: last atime delta
+# 9: last expire reduction count
+# 10: newest token in db atime
+#
+sub get_storage_variables {
+ my ($self) = @_;
+ my @values;
+
+ my $db_ver = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN};
+ if ( !$db_ver || $db_ver =~ /\D/ ) { $db_ver = 0; }
+
+ if ( $db_ver == 0 ) {
+ my $DB0_NSPAM_MAGIC_TOKEN = '**NSPAM';
+ my $DB0_NHAM_MAGIC_TOKEN = '**NHAM';
+ my $DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
+ my $DB0_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
+ my $DB0_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
+ my $DB0_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
+
+ @values = (
+ $self->{db_toks}->{$DB0_SCANCOUNT_BASE_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB0_NSPAM_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB0_NHAM_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB0_NTOKENS_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB0_LAST_EXPIRE_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ );
+ }
+ elsif ( $db_ver == 1 ) {
+ my $DB1_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
+ my $DB1_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
+ my $DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
+ my $DB1_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
+ my $DB1_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
+ my $DB1_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE";
+
+ @values = (
+ $self->{db_toks}->{$DB1_SCANCOUNT_BASE_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB1_NSPAM_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB1_NHAM_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB1_NTOKENS_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB1_LAST_EXPIRE_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ );
+ }
+ elsif ( $db_ver == 2 ) {
+ my $DB2_LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
+ my $DB2_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
+ my $DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
+ my $DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
+ my $DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
+ my $DB2_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
+ my $DB2_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
+ my $DB2_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
+ my $DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
+ my $DB2_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
+
+ @values = (
+ 0,
+ $self->{db_toks}->{$DB2_NSPAM_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB2_NHAM_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB2_NTOKENS_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB2_LAST_EXPIRE_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
+ 2,
+ $self->{db_toks}->{$DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB2_LAST_ATIME_DELTA_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN},
+ $self->{db_toks}->{$DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN},
+ );
+ }
+
+
+ foreach ( @values ) {
+ if ( !$_ || $_ =~ /\D/ ) { $_ = 0; }
+ }
+
+ return @values;
+}
+
+sub dump_db_toks {
+ my ($self, $template, $regex, @vars) = @_;
+
+ my $magic_re = $self->get_magic_re($self->{db_version});
+
+ foreach my $tok (keys %{$self->{db_toks}}) {
+ next if ($tok =~ /$magic_re/); # skip magic tokens
+ next if (defined $regex && ($tok !~ /$regex/o));
+
+ my ($ts, $th, $atime) = $self->tok_get ($tok);
+
+ my $prob = $self->{bayes}->compute_prob_for_token($tok, $vars[1], $vars[2],
+ $ts, $th, $atime);
+ $prob ||= 0.5;
+
+ printf $template,$prob,$ts,$th,$atime,$tok;
+ }
+}
+
+sub set_last_expire {
+ my ($self, $time) = @_;
+ $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+}
+
+## Don't bother using get_magic_tokens here. This token should only
+## ever exist when we're running expire, so we don't want to convert it if
+## it's there and we're not expiring ...
+sub get_running_expire_tok {
+ my ($self) = @_;
+ my $running = $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
+ if (!$running || $running =~ /\D/) { return undef; }
+ return $running;
+}
+
+sub set_running_expire_tok {
+ my ($self) = @_;
+ $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN} = time();
+}
+
+sub remove_running_expire_tok {
+ my ($self) = @_;
+ delete $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
+}
+
+###########################################################################
+
+# db abstraction: allow deferred writes, since we will be frequently
+# writing while checking.
+
+sub tok_count_change {
+ my ($self, $ds, $dh, $tok, $atime) = @_;
+
+ $atime = 0 unless defined $atime;
+
+ if ($self->{bayes}->{main}->{learn_to_journal}) {
+ $self->defer_update ("c $ds $dh $atime $tok");
+ } else {
+ $self->tok_sync_counters ($ds, $dh, $atime, $tok);
+ }
+}
+
+sub nspam_nham_get {
+ my ($self) = @_;
+ my @vars = $self->get_storage_variables();
+ ($vars[1], $vars[2]);
+}
+
+sub nspam_nham_change {
+ my ($self, $ds, $dh) = @_;
+
+ if ($self->{bayes}->{main}->{learn_to_journal}) {
+ $self->defer_update ("n $ds $dh");
+ } else {
+ $self->tok_sync_nspam_nham ($ds, $dh);
+ }
+}
+
+sub tok_touch {
+ my ($self, $tok, $atime) = @_;
+ $self->defer_update ("t $atime $tok");
+}
+
+sub defer_update {
+ my ($self, $str) = @_;
+ $self->{string_to_journal} .= "$str\n";
+}
+
+###########################################################################
+
+sub cleanup {
+ my ($self) = @_;
+
+ my $nbytes = length ($self->{string_to_journal});
+ return if ($nbytes == 0);
+
+ my $path = $self->_get_journal_filename();
+
+ # use append mode, write atomically, then close, so simultaneous updates are
+ # not lost
+ my $conf = $self->{bayes}->{main}->{conf};
+ my $umask = umask(0777 - (oct ($conf->{bayes_file_mode}) & 0666));
+ if (!open (OUT, ">>".$path)) {
+ warn "cannot write to $path, Bayes db update ignored\n";
+ umask $umask; # reset umask
+ return;
+ }
+
+ # do not use print() here, it will break up the buffer if it's >8192 bytes,
+ # which could result in two sets of tokens getting mixed up and their
+ # touches missed.
+ my $writ = 0;
+ while ($writ < $nbytes) {
+ my $len = syswrite (OUT, $self->{string_to_journal}, $nbytes-$writ);
+
+ if (!defined $len || $len < 0) {
+ # argh, write failure, give up
+ $len = 0 unless ( defined $len );
+ warn "write failed to Bayes journal $path ($len of $nbytes)!\n";
+ last;
+ }
+
+ $writ += $len;
+ if ($len < $nbytes) {
+ # this should not happen on filesystem writes! Still, try to recover
+ # anyway, but be noisy about it so the admin knows
+ warn "partial write to Bayes journal $path ($len of $nbytes), recovering.\n";
+ $self->{string_to_journal} = substr ($self->{string_to_journal}, $len);
+ }
+ }
+
+ if (!close OUT) {
+ warn "cannot write to $path, Bayes db update ignored\n";
+ }
+ umask $umask; # reset umask
+
+ $self->{string_to_journal} = '';
+}
+
+# Return a qr'd RE to match a token with the correct format's magic token
+sub get_magic_re {
+ my ($self, $db_ver) = @_;
+
+ $db_ver = $self->DB_VERSION if (!$db_ver); # XXX - not sure how good of a thing this is
+
+ if ( $db_ver >= 1 ) {
+ return qr/^\015\001\007\011\003/;
+ }
+
+ # When in doubt, assume v0
+ return qr/^\*\*[A-Z]+$/;
+}
+
+sub is_magic_token {
+ my ($self, $token) = @_;
+
+ my $magic_re = $self->get_magic_re($self->{db_version});
+
+ return ($token =~ /$magic_re/);
+}
+
+# provide a more generalized public insterface into the journal sync
+
+sub sync {
+ my ($self, $opts) = @_;
+
+ return $self->_sync_journal($opts);
+}
+
+###########################################################################
+# And this method reads the journal and applies the changes in one
+# (locked) transaction.
+
+sub _sync_journal {
+ my ($self, $opts) = @_;
+ my $ret = 0;
+
+ my $path = $self->_get_journal_filename();
+
+ # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
+ if ( !stat($path) || !-f _ || -z _ ) { return 0; }
+
+ eval {
+ local $SIG{'__DIE__'}; # do not run user die() traps in here
+ if ($self->tie_db_writable()) {
+ $ret = $self->_sync_journal_trapped($opts, $path);
+ }
+ };
+ my $err = $@;
+
+ # ok, untie from write-mode if we can
+ if (!$self->{bayes}->{main}->{learn_caller_will_untie}) {
+ $self->untie_db();
+ }
+
+ # handle any errors that may have occurred
+ if ($err) {
+ warn "bayes: $err\n";
+ return 0;
+ }
+
+ $ret;
+}
+
+sub _sync_journal_trapped {
+ my ($self, $opts, $path) = @_;
+
+ # Flag that we're doing work
+ $self->set_running_expire_tok();
+
+ my $started = time();
+ my $count = 0;
+ my $total_count = 0;
+ my %tokens = ();
+ my $showdots = $opts->{showdots};
+ my $retirepath = $path.".old";
+
+ # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
+ # we have to check again since the file may have been removed by a recent bayes db upgrade ...
+ if ( !stat($path) || !-f _ || -z _ ) { return 0; }
+
+ if (!-r $path) { # will we be able to read the file?
+ warn "bayes: bad permissions on journal, can't read: $path\n";
+ return 0;
+ }
+
+ # This is the critical phase (moving files around), so don't allow
+ # it to be interrupted.
+ {
+ local $SIG{'INT'} = 'IGNORE';
+ local $SIG{'HUP'} = 'IGNORE';
+ local $SIG{'TERM'} = 'IGNORE';
+
+ # retire the journal, so we can update the db files from it in peace.
+ # TODO: use locking here
+ if (!rename ($path, $retirepath)) {
+ warn "bayes: failed rename $path to $retirepath\n";
+ return 0;
+ }
+
+ # now read the retired journal
+ if (!open (JOURNAL, "<$retirepath")) {
+ warn "bayes: cannot open read $retirepath\n";
+ return 0;
+ }
+
+
+ # Read the journal
+ while (<JOURNAL>) {
+ $total_count++;
+
+ if (/^t (\d+) (.*)$/) { # Token timestamp update, cache resultant entries
+ $tokens{$2} = $1+0 if ( !exists $tokens{$2} || $1+0 > $tokens{$2} );
+ } elsif (/^c (-?\d+) (-?\d+) (\d+) (.*)$/) { # Add/full token update
+ $self->tok_sync_counters ($1+0, $2+0, $3+0, $4);
+ $count++;
+ } elsif (/^n (-?\d+) (-?\d+)$/) { # update ham/spam count
+ $self->tok_sync_nspam_nham ($1+0, $2+0);
+ $count++;
+ } elsif (/^m ([hsf]) (.+)$/) { # update msgid seen database
+ if ( $1 eq "f" ) {
+ $self->seen_delete($2);
+ }
+ else {
+ $self->seen_put($2,$1);
+ }
+ $count++;
+ } else {
+ warn "Bayes journal: gibberish entry found: $_";
+ }
+ }
+ close JOURNAL;
+
+ # Now that we've determined what tokens we need to update and their
+ # final values, update the DB. Should be much smaller than the full
+ # journal entries.
+ while( my($k,$v) = each %tokens ) {
+ $self->tok_touch_token ($v, $k);
+
+ if ((++$count % 1000) == 0) {
+ if ($showdots) { print STDERR "."; }
+ $self->set_running_expire_tok();
+ }
+ }
+
+ if ($showdots) { print STDERR "\n"; }
+
+ # we're all done, so unlink the old journal file
+ unlink ($retirepath) || warn "bayes: can't unlink $retirepath: $!\n";
+
+ $self->{db_toks}->{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $started;
+
+ my $done = time();
+ my $msg = ("synced Bayes databases from journal in ".($done - $started).
+ " seconds: $count unique entries ($total_count total entries)");
+
+ if ($opts->{verbose}) {
+ print $msg,"\n";
+ } else {
+ dbg ($msg);
+ }
+ }
+
+ # else, that's the lot, we're synced. return
+ 1;
+}
+
+sub tok_touch_token {
+ my ($self, $atime, $tok) = @_;
+ my ($ts, $th, $oldatime) = $self->tok_get ($tok);
+
+ # If the new atime is < the old atime, ignore the update
+ # We figure that we'll never want to lower a token atime, so abort if
+ # we try. (journal out of sync, etc.)
+ return if ( $oldatime >= $atime );
+
+ $self->tok_put ($tok, $ts, $th, $atime);
+}
+
+sub tok_sync_counters {
+ my ($self, $ds, $dh, $atime, $tok) = @_;
+ my ($ts, $th, $oldatime) = $self->tok_get ($tok);
+ $ts += $ds; if ($ts < 0) { $ts = 0; }
+ $th += $dh; if ($th < 0) { $th = 0; }
+
+ # Don't roll the atime of tokens backwards ...
+ $atime = $oldatime if ( $oldatime > $atime );
+
+ $self->tok_put ($tok, $ts, $th, $atime);
+}
+
+sub tok_put {
+ my ($self, $tok, $ts, $th, $atime) = @_;
+ $ts ||= 0;
+ $th ||= 0;
+
+ if ( $tok =~ /^\015\001\007\011\003/ ) { # magic token? Ignore it!
+ return;
+ }
+
+ # use defined() rather than exists(); the latter is not supported
+ # by NDBM_File, believe it or not. Using defined() did not
+ # indicate any noticeable speed hit in my testing. (Mar 31 2003 jm)
+ my $exists_already = defined $self->{db_toks}->{$tok};
+
+ if ($ts == 0 && $th == 0) {
+ return if (!$exists_already); # If the token doesn't exist, just return
+ $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}--;
+ delete $self->{db_toks}->{$tok};
+ } else {
+ if (!$exists_already) { # If the token doesn't exist, raise the token count
+ $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}++;
+ }
+
+ $self->{db_toks}->{$tok} = $self->tok_pack ($ts, $th, $atime);
+
+ my $newmagic = $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN};
+ if (!defined ($newmagic) || $atime > $newmagic) {
+ $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
+ }
+
+ # Make sure to check for either !defined or "" ... Apparently
+ # sometimes the DB module doesn't return the value correctly. :(
+ my $oldmagic = $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN};
+ if (!defined ($oldmagic) || $oldmagic eq "" || $atime < $oldmagic) {
+ $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
+ }
+ }
+}
+
+sub tok_sync_nspam_nham {
+ my ($self, $ds, $dh) = @_;
+ my ($ns, $nh) = ($self->get_storage_variables())[1,2];
+ if ($ds) { $ns += $ds; } if ($ns < 0) { $ns = 0; }
+ if ($dh) { $nh += $dh; } if ($nh < 0) { $nh = 0; }
+ $self->{db_toks}->{$NSPAM_MAGIC_TOKEN} = $ns;
+ $self->{db_toks}->{$NHAM_MAGIC_TOKEN} = $nh;
+}
+
+###########################################################################
+
+sub _get_journal_filename {
+ my ($self) = @_;
+
+ if (defined $self->{journal_live_path}) {
+ return $self->{journal_live_path};
+ }
+
+ my $main = $self->{bayes}->{main};
+ my $fname = $main->sed_path ($main->{conf}->{bayes_path}."_journal");
+
+ $self->{journal_live_path} = $fname;
+ return $self->{journal_live_path};
+}
+
+###########################################################################
+
+sub scan_count_get {
+ my ($self) = @_;
+
+ if ( $self->{db_version} < 2 ) {
+ my ($count) = $self->get_storage_variables();
+ my $path = $self->{scan_count_little_file};
+ $count += (defined $path && -e $path ? -s _ : 0);
+ return $count;
+ }
+
+ 0;
+}
+
+###########################################################################
+
+# this is called directly from sa-learn(1).
+sub perform_upgrade {
+ my ($self, $opts) = @_;
+ my $ret = 0;
+
+ eval {
+ local $SIG{'__DIE__'}; # do not run user die() traps in here
+
+ use File::Basename;
+ use File::Copy;
+
+ # bayes directory
+ my $main = $self->{bayes}->{main};
+ my $path = $main->sed_path($main->{conf}->{bayes_path});
+ my $dir = dirname($path);
+
+ # make temporary copy since old dbm and new dbm may have same name
+ opendir(DIR, $dir) || die "can't opendir $dir: $!";
+ my @files = grep { /^bayes_(?:seen|toks)(?:\.\w+)?$/ } readdir(DIR);
+ closedir(DIR);
+ if (@files < 2 || !grep(/bayes_seen/,@files) || !grep(/bayes_toks/,@files))
+ {
+ die "unable to find bayes_toks and bayes_seen, stopping\n";
+ }
+ # untaint @files (already safe after grep)
+ @files = map { /(.*)/, $1 } @files;
+
+ for (@files) {
+ my $src = "$dir/$_";
+ my $dst = "$dir/old_$_";
+ copy($src, $dst) || die "can't copy $src to $dst: $!\n";
+ }
+
+ # delete previous to make way for import
+ for (@files) { unlink("$dir/$_"); }
+
+ # import
+ if ($self->tie_db_writable()) {
+ $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_seen",
+ $self->{db_seen});
+ $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_toks",
+ $self->{db_toks});
+ }
+
+ if ($ret == 2) {
+ print "import successful, original files saved with \"old\" prefix\n";
+ }
+ else {
+ print "import failed, original files saved with \"old\" prefix\n";
+ }
+ };
+ my $err = $@;
+
+ $self->untie_db();
+
+ # if we died, untie the dbm files
+ if ($err) {
+ warn "bayes perform_upgrade: $err\n";
+ return 0;
+ }
+ $ret;
+}
+
+sub upgrade_old_dbm_files_trapped {
+ my ($self, $filename, $output) = @_;
+
+ my $count;
+ my %in;
+
+ print "upgrading to DB_File, please be patient: $filename\n";
+
+ # try each type of file until we find one with > 0 entries
+ for my $dbm ('DB_File', 'GDBM_File', 'NDBM_File', 'SDBM_File') {
+ $count = 0;
+ # wrap in eval so it doesn't run in general use. This accesses db
+ # modules directly.
+ # Note: (bug 2390), the 'use' needs to be on the same line as the eval
+ # for RPM dependency checks to work properly. It's lame, but...
+ eval 'use ' . $dbm . ';
+ tie %in, "' . $dbm . '", $filename, O_RDONLY, 0600;
+ %{ $output } = %in;
+ $count = scalar keys %{ $output };
+ untie %in;
+ ';
+ if ($@) {
+ print "$dbm: $dbm module not installed, nothing copied.\n";
+ dbg("error was: $@");
+ }
+ elsif ($count == 0) {
+ print "$dbm: no database of that kind found, nothing copied.\n";
+ }
+ else {
+ print "$dbm: copied $count entries.\n";
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+###########################################################################
+
+# token marshalling format for db_toks.
+
+# Since we may have many entries with few hits, especially thousands of hapaxes
+# (1-occurrence entries), use a flexible entry format, instead of simply "2
+# packed ints", to keep the memory and disk space usage down. In my
+# 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we
+# can use a 1-byte representation for the other 91% of low-hitting entries
+# and save masses of space.
+
+# This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3
+# ham-count bits). If XX in the first byte is 11, it's packed as this 1-byte
+# representation; otherwise, if XX in the first byte is 00, it's packed as
+# "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format.
+
+# Savings: roughly halves size of toks db, at the cost of a ~10% slowdown.
+
+use constant FORMAT_FLAG => 0xc0; # 11000000
+use constant ONE_BYTE_FORMAT => 0xc0; # 11000000
+use constant TWO_LONGS_FORMAT => 0x00; # 00000000
+
+use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000
+use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111
+
+sub tok_unpack {
+ my ($self, $value) = @_;
+ $value ||= 0;
+
+ my ($packed, $atime);
+ if ( $self->{db_version} == 0 ) {
+ ($packed, $atime) = unpack("CS", $value);
+ }
+ elsif ( $self->{db_version} == 1 || $self->{db_version} == 2 ) {
+ ($packed, $atime) = unpack("CV", $value);
+ }
+
+ if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
+ return (($packed & ONE_BYTE_SSS_BITS) >> 3,
+ $packed & ONE_BYTE_HHH_BITS,
+ $atime || 0);
+ }
+ elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
+ my ($packed, $ts, $th, $atime);
+ if ( $self->{db_version} == 0 ) {
+ ($packed, $ts, $th, $atime) = unpack("CLLS", $value);
+ }
+ elsif ( $self->{db_version} == 1 ) {
+ ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
+ }
+ elsif ( $self->{db_version} == 2 ) {
+ ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
+ }
+ return ($ts || 0, $th || 0, $atime || 0);
+ }
+ # other formats would go here...
+ else {
+ warn "unknown packing format for Bayes db, please re-learn: $packed";
+ return (0, 0, 0);
+ }
+}
+
+sub tok_pack {
+ my ($self, $ts, $th, $atime) = @_;
+ $ts ||= 0; $th ||= 0; $atime ||= 0;
+ if ($ts < 8 && $th < 8) {
+ return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime);
+ } else {
+ return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime);
+ }
+}
+
+###########################################################################
+
+sub dbg { Mail::SpamAssassin::dbg (@_); }
+sub sa_die { Mail::SpamAssassin::sa_die (@_); }
+
+1;

Added: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreSQL.pm
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreSQL.pm Thu Jan 29 18:54:33 2004
@@ -0,0 +1,1505 @@
+# <@LICENSE>
+# ====================================================================
+# The Apache Software License, Version 1.1
+#
+# Copyright (c) 2000 The Apache Software Foundation. All rights
+# reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. The end-user documentation included with the redistribution,
+# if any, must include the following acknowledgment:
+# "This product includes software developed by the
+# Apache Software Foundation (http://www.apache.org/)."
+# Alternately, this acknowledgment may appear in the software itself,
+# if and wherever such third-party acknowledgments normally appear.
+#
+# 4. The names "Apache" and "Apache Software Foundation" must
+# not be used to endorse or promote products derived from this
+# software without prior written permission. For written
+# permission, please contact apache@apache.org.
+#
+# 5. Products derived from this software may not be called "Apache",
+# nor may "Apache" appear in their name, without prior written
+# permission of the Apache Software Foundation.
+#
+# THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+# ====================================================================
+#
+# This software consists of voluntary contributions made by many
+# individuals on behalf of the Apache Software Foundation. For more
+# information on the Apache Software Foundation, please see
+# <http://www.apache.org/>.
+#
+# Portions of this software are based upon public domain software
+# originally written at the National Center for Supercomputing Applications,
+# University of Illinois, Urbana-Champaign.
+# </@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::BayesStoreSQL - SQL Bayesian Storage Module Implementation
+
+=head1 SYNOPSIS
+
+=head1 DESCRIPTION
+
+This module implementes a SQL based bayesian storage module.
+
+=cut
+
+package Mail::SpamAssassin::BayesStoreSQL;
+
+use strict;
+use bytes;
+
+use DBI;
+
+use Mail::SpamAssassin::BayesStore;
+
+use vars qw( @ISA );
+
+@ISA = qw( Mail::SpamAssassin::BayesStore );
+
+=head1 METHODS
+
+=head2 new
+
+public class (Mail::SpamAssassin::BayesStoreSQL) new (Mail::Spamassassin::Bayes $bayes)
+
+Description:
+This methods creates a new instance of the Mail::SpamAssassin::BayesStoreSQL
+object. It expects to be passed an instance of the Mail::SpamAssassin:Bayes
+object which is passed into the Mail::SpamAssassin::BayesStore parent object.
+
+This method sets up the database connection and determines the username to
+use in queries.
+
+=cut
+
+sub new {
+ my $class = shift;
+ $class = ref($class) || $class;
+
+ my $self = $class->SUPER::new(@_);
+
+ $self->{supported_db_version} = 2;
+
+ if (!$self->{bayes}->{conf}->{bayes_sql_dsn}) {
+ dbg("bayes: invalid config, must set bayes_sql_dsn config variable.\n");
+ return undef;
+ }
+
+ my $dsn = $self->{bayes}->{conf}->{bayes_sql_dsn};
+ my $dbuser = $self->{bayes}->{conf}->{bayes_sql_username};
+ my $dbpass = $self->{bayes}->{conf}->{bayes_sql_password};
+
+ my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {'PrintError' => 1});
+
+ if (!$dbh) {
+ dbg("bayes: Unable to connect to database: ".DBI->errstr());
+
+ ## TODO! This is not appropriate -- $bayes->{store} must alwasy
+ ## be a valid object. returning undef from a constructor is bad
+ ## news.
+ return undef;
+ }
+
+ $self->{_dbh} = $dbh;
+
+ dbg("bayes: Database connection established");
+
+ if ($self->{bayes}->{conf}->{bayes_sql_override_username}) {
+ $self->{_username} = $self->{bayes}->{conf}->{bayes_sql_override_username};
+ }
+ else {
+ $self->{_username} = $self->{bayes}->{main}->{username};
+
+ # Need to make sure that a username is set, so just in case there is
+ # no username set in main, set one here.
+ unless ($self->{_username}) {
+ $self->{_username} = "GLOBALBAYES";
+ }
+ }
+ dbg("bayes: Using username: ".$self->{_username});
+ return $self;
+}
+
+=head2 tie_db_readonly
+
+public instance (Boolean) tie_db_readonly ();
+
+Description:
+This method ensures that the database connection is properly setup
+and working. If necessary it will initialize a user's bayes variables
+so that they can begin using the database immediately.
+
+=cut
+
+sub tie_db_readonly {
+ my ($self) = @_;
+
+ my $ret = $self->tie_db_writable();
+
+ return $ret;
+}
+
+=head2 tie_db_writable
+
+public instance (Boolean) tie_db_writable ()
+
+Description:
+This method ensures that the database connetion is properly setup
+and working. If necessary it will initialize a users bayes variables
+so that they can begin using the database immediately.
+
+=cut
+
+sub tie_db_writable {
+ my ($self) = @_;
+ my $main = $self->{bayes}->{main};
+
+ $self->read_db_configs();
+
+ # If the DB version is one we don't understand, abort!
+ my $db_ver = $self->_get_db_version();
+ $self->{db_version} = $db_ver;
+ dbg("bayes: found bayes db version ".$self->{db_version});
+
+ if ( $db_ver != $self->DB_VERSION ) {
+ dbg("bayes: Database version $db_ver is different than we understand (".$self->DB_VERSION."), aborting!");
+ $self->untie_db();
+ return 0;
+ }
+
+ unless ($self->_initialize_db()) {
+ dbg("bayes: unable to initialize database for ".$self->{_username}." user, aborting!");
+ $self->untie_db();
+ return 0;
+ }
+
+ return 1;
+}
+
+
+=head2 untie_db
+
+public instance () untie_db ()
+
+Description:
+This method is unused for the SQL based implementation.
+
+=cut
+
+sub untie_db {
+ my ($self) = @_;
+ # not used for SQL based implementation
+}
+
+=head2 calculate_expire_delta
+
+public instance (\%) calculate_expire_delta (Integer $newest_atime,
+ Integer $start,
+ Integer $max_expire_mult)
+
+Description:
+This method performs a calculation on the data to determine the optimum
+atime for token expiration.
+
+=cut
+
+sub calculate_expire_delta {
+ my ($self, $newest_atime, $start, $max_expire_mult) = @_;
+
+ my %delta = (); # use a hash since an array is going to be very sparse
+
+ my $sql = "SELECT count(*)
+ FROM bayes_token
+ WHERE username = ?
+ AND (? - atime) > ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ for (my $i = 1; $i <= $max_expire_mult; $i<<=1) {
+ my $rc = $sth->execute($self->{_username}, $newest_atime, $start * $i);
+
+ unless ($rc) {
+ dbg("bayes: calculate_expire_delta: SQL Error: ".$self->{_dbh}->errstr());
+ return undef;
+ }
+
+ my ($count) = $sth->fetchrow_array();
+
+ $delta{$i} = $count;
+ }
+ $sth->finish();
+
+ return %delta;
+}
+
+=head2 token_expiration
+
+public instance (Integer, Integer,
+ Integer, Integer) token_expiration(\% $opts,
+ Integer $newdelta,
+ @ @vars)
+
+Description:
+This method performs the database specific expiration of tokens based on
+the passed in C<$newdelta> and C<@vars>.
+
+=cut
+
+sub token_expiration {
+ my ($self, $opts, $newdelta, @vars) = @_;
+
+ my $num_hapaxes;
+ my $num_lowfreq;
+
+ # Figure out how old is too old...
+ my $too_old = $vars[10] - $newdelta; # tooold = newest - delta
+
+ my $sql = "DELETE from bayes_token WHERE username = ? and atime < ?";
+
+ my $rows = $self->{_dbh}->do($sql, undef, $self->{_username}, $too_old);
+
+ if (!defined($rows)) {
+ dbg("bayes: actual_expire: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $deleted = $rows;
+
+ # We've chosen a new atime delta if we've gotten here, so record it for posterity.
+ $self->_set_last_atime_delta($newdelta);
+
+ # The rest of these have been modified, so replace as necessary.
+ $self->set_last_expire(time());
+ $self->_set_last_expire_reduce($deleted);
+
+ # Call untie_db() first so we unlock correctly etc. first
+ $self->untie_db();
+
+ my $kept = $self->_get_token_count();
+
+ $num_hapaxes = $self->_get_num_hapaxes() if ($opts->{verbose});
+ $num_lowfreq = $self->_get_num_lowfreq() if ($opts->{verbose});
+
+ return ($kept, $deleted, $num_hapaxes, $num_lowfreq);
+}
+
+=head2 sync_due
+
+public instance (Boolean) sync_due ()
+
+Description:
+This method determines if a database sync is currently required.
+
+Unused for SQL based implementation.
+
+=cut
+
+sub sync_due {
+ my ($self) = @_;
+
+ return 0;
+}
+
+=head2 seen_get
+
+public instance (String) seen_get (string $msgid)
+
+Description:
+This method retrieves the stored value, if any, for C<$msgid>. The return value
+is the stored string ('s' for spam and 'h' for ham) or undef if C<$msgid> is not
+found.
+
+=cut
+
+sub seen_get {
+ my ($self, $msgid) = @_;
+
+ my $sql = "SELECT flag FROM bayes_seen WHERE username = ? AND msgid = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: seen_get: SQL Error: ".$self->{_dbh}->errstr());
+ return undef;
+ }
+
+ my $rc = $sth->execute($self->{_username}, $msgid);
+
+ unless ($rc) {
+ dbg("bayes: seen_get: SQL Error: ".$self->{_dbh}->errstr());
+ return undef;
+ }
+
+ my ($flag) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ return $flag;
+}
+
+=head2 seen_put
+
+public (Boolean) seen_put (string $msgid, char $flag)
+
+Description:
+This method records C<$msgid> as the type given by C<$flag>. C<$flag> is one of
+two values 's' for spam and 'h' for ham.
+
+=cut
+
+sub seen_put {
+ my ($self, $msgid, $flag) = @_;
+
+ return 0 if (!$msgid);
+ return 0 if (!$flag);
+
+ my $sql = "INSERT INTO bayes_seen (username, msgid, flag) VALUES (?,?,?)";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: seen_put: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username}, $msgid, $flag);
+
+ unless ($rc) {
+ dbg("bayes: seen_put: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+
+ dbg("bayes: seen ($msgid) put");
+ return 1;
+}
+
+=head2 seen_delete
+
+public instance (Boolean) seen_delete (string $msgid)
+
+Description:
+This method removes C<$msgid> from the database.
+
+=cut
+
+sub seen_delete {
+ my ($self, $msgid) = @_;
+
+ return 0 if (!$msgid);
+
+ my $sql = "DELETE FROM bayes_seen WHERE username = ? AND msgid = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: seen_delete: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username}, $msgid);
+
+ unless ($rc) {
+ dbg("bayes: seen_delete: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+
+ return 1;
+}
+
+=head2 get_storage_variables
+
+public instance (@) get_storage_variables ()
+
+Description:
+This method retrieves the various administrative variables used by
+the Bayes process and database.
+
+The values returned in the array are in the following order:
+
+0: scan count base
+
+1: number of spam
+
+2: number of ham
+
+3: number of tokens in db
+
+4: last expire atime
+
+5: oldest token in db atime
+
+6: db version value
+
+7: last journal sync
+
+8: last atime delta
+
+9: last expire reduction count
+
+10: newest token in db atime
+
+=cut
+
+sub get_storage_variables {
+ my ($self) = @_;
+ my @values;
+
+ my $sql = "SELECT spam_count, ham_count, last_expire,
+ last_atime_delta, last_expire_reduce
+ FROM bayes_vars
+ WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: get_storage_variables: SQL Error: ".$self->{_dbh}->errstr());
+ return (0,0,0,0,0,0,0,0,0,0,0);
+ }
+
+ my $rc = $sth->execute($self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: get_storage_variables: SQL Error: ".$self->{_dbh}->errstr());
+ return (0,0,0,0,0,0,0,0,0,0,0);
+ }
+
+ my ($spam_count, $ham_count, $last_expire,
+ $last_atime_delta, $last_expire_reduce) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ my $token_count = $self->_get_token_count();
+ my $oldest_token_age = $self->_get_oldest_token_age();
+ my $newest_token_age = $self->_get_newest_token_age();
+ my $db_ver = $self->DB_VERSION;
+
+ @values = (
+ 0,
+ $spam_count,
+ $ham_count,
+ $token_count,
+ $last_expire,
+ $oldest_token_age,
+ $db_ver,
+ 0, # we do not do journal syncs
+ $last_atime_delta,
+ $last_expire_reduce,
+ $newest_token_age
+ );
+
+ foreach ( @values ) {
+ if ( !$_ || $_ =~ /\D/ ) { $_ = 0; }
+ }
+
+ return @values;
+}
+
+=head2 dump_db_toks
+
+public instance () dump_db_toks (String $template, String $regex, Array @vars)
+
+Description:
+This method loops over all tokens, computing the probability for the token and then
+printing it out according to the passed in token.
+
+=cut
+
+sub dump_db_toks {
+ my ($self, $template, $regex, @vars) = @_;
+
+ # 0/0 tokens don't count
+ # since ordering is check here, order the tokens
+ my $sql = "SELECT token, spam_count, ham_count, atime
+ FROM bayes_token
+ WHERE username = ?
+ AND (spam_count > 0 OR ham_count > 0)
+ ORDER BY token";
+
+ my $sth = $self->{_dbh}->prepare($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: dump_db_toks: SQL Error: ".$self->{_dbh}->errstr());
+ return;
+ }
+
+ my $rc = $sth->execute($self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: dump_db_toks: SQL Error: ".$self->{_dbh}->errstr());
+ return;
+ }
+
+ while (my ($token, $spam_count, $ham_count, $atime) = $sth->fetchrow_array()) {
+ my $prob = $self->{bayes}->compute_prob_for_token($token, $vars[1], $vars[2],
+ $spam_count, $ham_count,
+ $atime);
+ $prob ||= 0.5;
+
+ printf $template,$prob,$spam_count,$ham_count,$atime,$token;
+ }
+
+ $sth->finish();
+
+ return;
+}
+
+=head2 set_last_expire
+
+public instance (Boolean) set_last_expire (Integer $time)
+
+Description:
+This method sets the last expire time.
+
+=cut
+
+sub set_last_expire {
+ my ($self, $time) = @_;
+
+ return 0 unless (defined($time));
+
+ my $sql = "UPDATE bayes_vars SET last_expire = ? WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: set_last_expire: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($time, $self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: set_last_expire: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+
+ return 1;
+}
+
+=head2 get_running_expire_tok
+
+public instance (String $time) get_running_expire_tok ()
+
+Description:
+This method determines if an expire is currently running and returns
+the last time set.
+
+There can be multiple times, so we just pull the greatest (most recent)
+value.
+
+=cut
+
+sub get_running_expire_tok {
+ my ($self) = @_;
+
+ my $sql = "SELECT max(runtime) from bayes_expire WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: get_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: get_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my ($runtime) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ return $runtime;
+}
+
+=head2 set_running_expire_tok
+
+public instance (String $time) set_running_expire_tok ()
+
+Description:
+This method sets the time that an expire starts running.
+
+=cut
+
+sub set_running_expire_tok {
+ my ($self) = @_;
+
+ my $sql = "INSERT INTO bayes_expire (username,runtime) VALUES (?,?)";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ my $time = time();
+
+ my $rc = $sth->execute($self->{_username}, $time);
+
+ unless ($rc) {
+ dbg("bayes: set_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+ return undef;
+ }
+ $sth->finish();
+ return $time;
+}
+
+=head2 remove_running_expire_tok
+
+public instance (Boolean) remove_running_expire_tok ()
+
+Description:
+This method removes the row in the database that indicates that
+and expire is currently running.
+
+=cut
+
+sub remove_running_expire_tok {
+ my ($self) = @_;
+
+ my $sql = "DELETE from bayes_expire WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: remove_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rows = $self->{_dbh}->do($sql, undef, $self->{_username});
+
+ if (!defined($rows)) {
+ dbg("bayes: remove_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ return 1;
+}
+=head2 tok_get
+
+public instance (Integer, Integer, Integer) tok_get (String $token)
+
+Description:
+This method retrieves a specificed token (C<$token>) from the database
+and returns it's spam_count, ham_count and last access time.
+
+=cut
+
+sub tok_get {
+ my ($self, $token) = @_;
+
+ my $sql = "SELECT spam_count, ham_count, atime
+ FROM bayes_token
+ WHERE username = ?
+ AND token = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: tok_get: SQL Error: ".$self->{_dbh}->errstr());
+ return (0,0,0);
+ }
+
+ my $rc = $sth->execute($self->{_username}, $token);
+
+ unless ($rc) {
+ dbg("bayes: tok_get: SQL Error: ".$self->{_dbh}->errstr());
+ return (0,0,0);
+ }
+
+ my ($spam_count, $ham_count, $atime) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ $spam_count = 0 if (!$spam_count || $spam_count < 0);
+ $ham_count = 0 if (!$ham_count || $ham_count < 0);
+ $atime = 0 if (!$atime);
+
+ return ($spam_count, $ham_count, $atime)
+}
+
+=head2 tok_count_change
+
+public instance (Boolean) tok_count_change (Integer $spam_count,
+ Integer $ham_count,
+ String $token,
+ String $atime)
+
+Description:
+This method takes a C<$spam_count> and C<$ham_count> and adds it to
+C<$tok> along with updating C<$tok>s atime with C<$atime>.
+
+=cut
+
+sub tok_count_change {
+ my ($self, $spam_count, $ham_count, $token, $atime) = @_;
+
+ $atime = 0 unless defined $atime;
+
+ $self->_put_token ($token, $spam_count, $ham_count, $atime);
+}
+
+=head2 nspam_nham_get
+
+public instance ($spam_count, $ham_count) nspam_nham_get ()
+
+Description:
+This method retrieves the total number of spam and the total number of
+ham learned.
+
+=cut
+
+sub nspam_nham_get {
+ my ($self) = @_;
+
+ my $sql = "SELECT ham_count, spam_count FROM bayes_vars WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: nspam_nham_get: SQL Error: ".$self->{_dbh}->errstr());
+ return (0,0);
+ }
+
+ my $rc = $sth->execute($self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: nspam_nham_get: SQL Error: ".$self->{_dbh}->errstr());
+ return (0,0);
+ }
+
+ my ($ham_count, $spam_count) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ return ($spam_count || 0, $ham_count || 0);
+}
+
+=head2 nspam_nham_change
+
+public instance (Boolean) nspam_nham_change (Integer $num_spam,
+ Integer $num_ham)
+
+Description:
+This method updates the number of spam and the number of ham in the database.
+
+=cut
+
+sub nspam_nham_change {
+ my ($self, $num_spam, $num_ham) = @_;
+
+ my $sql = "UPDATE bayes_vars
+ SET spam_count = spam_count + ?,
+ ham_count = ham_count + ?
+ WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: nspam_nham_change: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($num_spam, $num_ham, $self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: nspam_nham_change: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+
+ return 1;
+}
+
+=head2 tok_touch
+
+public instance (Boolean) tok_touch (String $token,
+ String $atime)
+
+Description:
+This method updates the given tokens (C<$token>) atime.
+
+The assumption is that the token already exists in the database.
+
+=cut
+
+sub tok_touch {
+ my ($self, $token, $atime) = @_;
+
+ # shortcut, will only update atime for the token if the atime is less than
+ # what we are updating to
+ my $sql = "UPDATE bayes_token
+ SET atime = ?
+ WHERE username = ?
+ AND token = ?
+ AND atime < ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: tok_touch: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($atime, $self->{_username}, $token, $atime);
+
+ unless ($rc) {
+ dbg("bayes: tok_touch: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+
+ return 1;
+}
+
+=head2 cleanup
+
+public instance (Boolean) cleanup ()
+
+Description:
+This method peroms any cleanup necessary before moving onto the next
+operation.
+
+=cut
+
+sub cleanup {
+ my ($self) = @_;
+
+ # Not used for this implementation
+
+ return 1;
+}
+
+=head2 is_magic_token
+
+public instance (Boolean) is_magic_token (string $token)
+
+Description:
+This method determines if a given token is "magic" or special to the
+implementation.
+
+=cut
+
+sub is_magic_token {
+ my ($self, $token) = @_;
+
+ return 0; # nothing is magic
+}
+
+=head2 sync
+
+public instance (Boolean) sync (\% $opts)
+
+Description:
+This method performs a sync of the database
+
+=cut
+
+sub sync {
+ my ($self, $opts) = @_;
+
+ # Not used for this implementation
+
+ return 1;
+}
+
+=head2 scan_count_get
+
+public instance (Integer) scan_count_get ()
+
+Description:
+Return the current scan count.
+
+Unused for SQL implementation.
+
+=cut
+
+sub scan_count_get {
+ my ($self) = @_;
+
+ return 0;
+}
+
+=head2 perform_upgrade
+
+public instance (Boolean) perform_upgrade (\% $opts);
+
+Description:
+Performs an upgrade of the database from one version to another, not
+currently used in this implementation.
+
+=cut
+
+sub perform_upgrade {
+ my ($self) = @_;
+
+ return 1;
+}
+
+=head1 Private Methods
+
+=head2 _get_db_version
+
+private instance (Integer) _get_db_version ()
+
+Description:
+Gets the current version of the database from the special global vars
+tables.
+
+=cut
+
+sub _get_db_version {
+ my ($self) = @_;
+
+ my $sql = "SELECT value FROM bayes_global_vars WHERE variable = 'VERSION'";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _get_db_version: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute();
+
+ unless ($rc) {
+ dbg("bayes: _get_db_version: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my ($version) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ return $version;
+}
+
+=head2 _initialize_db
+
+private instance (Boolean) _initialize_db ()
+
+Description:
+This method will check to see if a user has had their bayes variables
+initialized. If not then it will perform this initialization.
+
+=cut
+
+sub _initialize_db {
+ my ($self) = @_;
+
+ return 0 if (!$self->{_username});
+
+ my $sql = "SELECT count(*) FROM bayes_vars WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my ($count) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ if ($count) {
+ return 1;
+ }
+
+ # For now let the database setup the other variables as defaults
+ $sql = "INSERT INTO bayes_vars (username) VALUES (?)";
+
+ $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $rc = $sth->execute($self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+
+ return 1;
+}
+
+=head2 _token_atime
+
+private instance (Integer) _token_atime (String $token)
+
+Description:
+This method returns a given tokens atime, it also serves to tell us
+if the token exists or not since the atime will be undefined if it
+does not exist.
+
+=cut
+
+sub _token_atime {
+ my ($self, $token) = @_;
+
+ return undef unless (defined($token));
+
+ my $sql = "SELECT atime
+ FROM bayes_token
+ WHERE username = ?
+ AND token = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _token_atime: SQL Error: ".$self->{_dbh}->errstr());
+ return undef;
+ }
+
+ my $rc = $sth->execute($self->{_username}, $token);
+
+ unless ($rc) {
+ dbg("bayes: _token_atime: SQL Error: ".$self->{_dbh}->errstr());
+ return undef;
+ }
+
+ my ($token_atime) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ return $token_atime;
+}
+
+=head2 _delete_token
+
+private instance (Boolean) _delete_token (String $token)
+
+Description:
+This method deletes the given token from the database.
+
+=cut
+
+sub _delete_token {
+ my ($self, $token) = @_;
+
+ return 0 unless (defined($token));
+
+ my $sql = "DELETE FROM bayes_token WHERE username = ? AND token = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _delete_token: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username}, $token);
+
+ unless ($rc) {
+ dbg("bayes: _delete_token: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+
+ return 1;
+}
+
+=head2 _put_token
+
+private instance (Boolean) _put_token (string $token,
+ integer $spam_count,
+ integer $ham_count,
+ string $atime)
+
+Description:
+This method performs the work of either inserting or updating a token in
+the database.
+
+=cut
+
+sub _put_token {
+ my ($self, $token, $spam_count, $ham_count, $atime) = @_;
+
+ $spam_count ||= 0;
+ $ham_count ||= 0;
+
+ my $existing_atime = $self->_token_atime($token);
+
+ my $sql;
+
+ if ($spam_count == 0 && $ham_count == 0) {
+ return 1;
+ }
+
+ if (!defined($existing_atime)) {
+
+ # You can't create a new entry for a token with a negative count, so just return
+ # if we are unable to find an entry.
+ return 1 if ($spam_count < 0 || $ham_count < 0);
+
+ $sql = "INSERT INTO bayes_token
+ (username, token, spam_count, ham_count, atime)
+ VALUES (?,?,?,?,?)";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username},
+ $token,
+ $spam_count,
+ $ham_count,
+ $atime);
+
+ unless ($rc) {
+ dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+ dbg("bayes: new token ($token) inserted");
+ }
+ else {
+ my $sql = "UPDATE bayes_token
+ SET spam_count = spam_count + ?,
+ ham_count = ham_count + ?,
+ atime = ?
+ WHERE username = ?
+ AND token = ?";
+
+ # If the existing atime is already greater then keep it.
+ # XXX - A future enhancement might be to just omit the update
+ # of atime in this case, but that would give us one extra
+ # SQL statement to cache, so I'm not sure if the trade off
+ # is worth it.
+ $atime = $existing_atime if ($existing_atime > $atime);
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($spam_count, $ham_count, $atime,
+ $self->{_username}, $token);
+
+ unless ($rc) {
+ dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+ dbg("bayes: token ($token) updated");
+ }
+ return 1;
+}
+
+=head2 _get_token_count
+
+private instance (Integer) _get_token_count ()
+
+Description:
+This method returns the total number of tokens present in the token database
+for a user.
+
+=cut
+
+sub _get_token_count {
+ my ($self) = @_;
+
+ my $sql = "SELECT count(*)
+ FROM bayes_token
+ WHERE username = ?
+ AND (spam_count > 0 OR ham_count > 0)";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _get_token_count: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username});
+
+ unless (defined($sth)) {
+ dbg("bayes: _get_token_count: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my ($token_count) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ return $token_count
+}
+
+=head2 _get_oldest_token_age
+
+private instance (Integer) _get_oldest_token_age ()
+
+Description:
+This method finds the atime of the oldest token in the database.
+
+=cut
+
+sub _get_oldest_token_age {
+ my ($self) = @_;
+
+ my $sql = "SELECT min(atime) FROM bayes_token WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _get_oldest_token_age: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: _get_oldest_token_age: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my ($atime) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ return $atime;
+}
+
+=head2 _get_newest_token_age
+
+private instance (Integer) _get_newest_token_age ()
+
+Description:
+This method finds the atime of the newest token in the database.
+
+=cut
+
+sub _get_newest_token_age {
+ my ($self) = @_;
+
+ my $sql = "SELECT max(atime) FROM bayes_token WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _get_newest_token_age: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: _get_newest_token_age: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my ($atime) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ return $atime;
+}
+
+=head2 _set_last_atime_delta
+
+private instance (Boolean) _set_last_atime_delta (Integer $newdelta)
+
+Description:
+This method sets the last_atime_delta variable in the variable table.
+
+=cut
+
+sub _set_last_atime_delta {
+ my ($self, $newdelta) = @_;
+
+ return 0 unless (defined($newdelta));
+
+ my $sql = "UPDATE bayes_vars SET last_atime_delta = ? WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _set_last_atime_delta: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($newdelta, $self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: _set_last_atime_delta: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+
+ return 1;
+}
+
+=head2 _set_last_expire_reduce
+
+private instance (Boolean) _set_last_expire_reduce (Integer $deleted)
+
+Description:
+This method sets the last_expire_reduce values in the variable table.
+
+=cut
+
+sub _set_last_expire_reduce {
+ my ($self, $deleted) = @_;
+
+ return 0 unless (defined($deleted));
+
+ my $sql = "UPDATE bayes_vars SET last_expire_reduce = ? WHERE username = ?";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _set_last_expire_reduce: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($deleted, $self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: _set_last_expire_reduce: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ $sth->finish();
+
+ return 1;
+}
+
+=head2 _get_num_hapaxes
+
+private instance (Integer) _get_num_hapaxes ()
+
+Description:
+This method gets the total number of hapaxes (spam_count + ham_count == 1) in
+the token database for a user.
+
+=cut
+
+sub _get_num_hapaxes {
+ my ($self) = @_;
+
+ my $sql = "SELECT count(*)
+ FROM bayes_token
+ WHERE username = ?
+ AND spam_count + ham_count = 1";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _get_num_hapaxes: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: _get_num_hapaxes: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+
+ my ($num_hapaxes) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ return $num_hapaxes;
+}
+
+=head2 _get_num_lowfreq
+
+private instance (Integer) _get_num_lowfreq ()
+
+Description:
+This method gets the total number of lowfreq tokens (spam_count < 8 and
+ham_count < 8) in the token database for a user
+
+=cut
+
+sub _get_num_lowfreq {
+ my ($self) = @_;
+
+ my $sql = "SELECT count(*)
+ FROM bayes_token
+ WHERE username = ?
+ AND (spam_count >= 0 AND spam_count < 8)
+ AND (ham_count >= 0 AND ham_count < 8)
+ AND spam_count + ham_count != 1";
+
+ my $sth = $self->{_dbh}->prepare_cached($sql);
+
+ unless (defined($sth)) {
+ dbg("bayes: _get_num_lowfreq: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my $rc = $sth->execute($self->{_username});
+
+ unless ($rc) {
+ dbg("bayes: _get_num_lowfreq: SQL Error: ".$self->{_dbh}->errstr());
+ return 0;
+ }
+
+ my ($num_lowfreq) = $sth->fetchrow_array();
+
+ $sth->finish();
+
+ return $num_lowfreq;
+}
+
+sub dbg { Mail::SpamAssassin::dbg (@_); }
+sub sa_die { Mail::SpamAssassin::sa_die (@_); }
+
+1;

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm Thu Jan 29 18:54:33 2004
@@ -170,9 +170,9 @@
$spamtest->{conf}->{bayes_path} = $bayes_override_path;
}

- my $ret = $spamtest->{bayes_scanner}->{store}->upgrade_old_dbm_files();
+ my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
$spamtest->finish_learner();
- return (!(defined $ret && $ret == 2));
+ return (!$ret);
}

$spamtest->init_learner({

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Thu Jan 29 18:54:33 2004
@@ -236,6 +236,15 @@
$self->{bayes_min_spam_num} = 200;
$self->{bayes_learn_during_report} = 1;

+ # Allow alternate bayes storage implementation
+ $self->{bayes_store_module} = '';
+
+ # Used for SQL based Bayes implementation
+ $self->{bayes_sql_dsn} = '';
+ $self->{bayes_sql_username} = '';
+ $self->{bayes_sql_password} = '';
+ $self->{bayes_sql_override_username} = '';
+
$self->{use_hashcash} = 1;
$self->{hashcash_accept} = { };
$self->{hashcash_doublespend_path} = '__userstate__/hashcash_seen';
@@ -263,6 +272,9 @@
# "...scope", and finally, 'user_scores_sql_table'. Defaults are "username",
# "preference", "value", "spamassassin" and "userpref".

+ # defaults for SQL based auto-whitelist
+ $self->{user_awl_sql_table} = 'awl';
+
# for backwards compatibility, we need to set the default headers
# remove this except for X-Spam-Checker-Version in 2.70
$self->add_default_spam_headers(); # always run this first
@@ -1759,7 +1771,18 @@
$self->{bayes_learn_during_report} = $value+0; next;
}

-=back
+=item bayes_sql_override_username
+
+Used by BayesStoreSQL storage implementation.
+
+If this options is set the BayesStoreSQL module will override the set username with
+the value given. This could be useful for implementing global or group bayes databases.
+
+=cut
+
+ if (/^bayes_sql_override_username\s+(.*)$/) {
+ $self->{bayes_sql_override_username} = $1; next;
+ }

##############

@@ -2413,6 +2436,57 @@
$self->{bayes_learn_to_journal} = $value+0; next;
}

+=item bayes_store_module
+
+If this option is set, the module given will be used as an alternate to the default
+bayes storage mechanism. It must conform to the published storage specification
+(see Mail::SpamAssassin::BayesStore).
+
+=cut
+
+ if (/^bayes_store_module\s+(.*)$/) {
+ my $module = $1;
+ $module =~ /^([_A-Za-z0-9:]+)$/;
+ $self->{bayes_store_module} = $1;
+ next;
+ }
+
+=item bayes_sql_dsn DBI::databasetype:databasename:hostname:port
+
+Used for BayesStoreSQL storage implementation.
+
+This option give the connect string used to connect to the SQL based Bayes storage.
+
+=cut
+
+ if (/^bayes_sql_dsn\s+(\S+)$/) {
+ $self->{bayes_sql_dsn} = $1; next;
+ }
+
+=item bayes_sql_username
+
+Used by BayesStoreSQL storage implementation.
+
+This option gives the username used by the above DSN.
+
+=cut
+
+ if (/^bayes_sql_username\s+(\S+)$/) {
+ $self->{bayes_sql_username} = $1; next;
+ }
+
+=item bayes_sql_password
+
+Used by BayesStoreSQL storage implementation.
+
+This option gives the password used by the above DSN.
+
+=cut
+
+ if (/^bayes_sql_password\s+(\S+)$/) {
+ $self->{bayes_sql_password} = $1; next;
+ }
+
=item user_scores_dsn DBI:databasetype:databasename:hostname:port

If you load user scores from an SQL database, this will set the DSN
@@ -2462,6 +2536,43 @@
# leave as RE right now
if (/^loadplugin\s+(\S+)\s+(\S+)$/) {
$self->load_plugin ($1, $2); next;
+ }
+
+=item user_awl_dsn DBI:databasetype:databasename:hostname:port
+
+If you load user auto-whitelists from an SQL database, this will set the DSN
+used to connect. Example: C<DBI:mysql:spamassassin:localhost>
+
+=cut
+ if (/^user_awl_dsn\s+(\S+)$/) {
+ $self->{user_awl_dsn} = $1; next;
+ }
+
+=item user_awl_sql_username username
+
+The authorized username to connect to the above DSN.
+
+=cut
+ if(/^user_awl_sql_username\s+(\S+)$/) {
+ $self->{user_awl_sql_username} = $1; next;
+ }
+
+=item user_awl_sql_password password
+
+The password for the database username, for the above DSN.
+
+=cut
+ if(/^user_awl_sql_password\s+(\S+)$/) {
+ $self->{user_awl_sql_password} = $1; next;
+ }
+
+=item user_awl_sql_table tablename
+
+The table user auto-whitelists are stored in, for the above DSN.
+
+=cut
+ if(/^user_awl_sql_table\s+(\S+)$/) {
+ $self->{user_awl_sql_table} = $1; next;
}

###########################################################################

Added: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm Thu Jan 29 18:54:33 2004
@@ -0,0 +1,402 @@
+# <@LICENSE>
+# ====================================================================
+# The Apache Software License, Version 1.1
+#
+# Copyright (c) 2000 The Apache Software Foundation. All rights
+# reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. The end-user documentation included with the redistribution,
+# if any, must include the following acknowledgment:
+# "This product includes software developed by the
+# Apache Software Foundation (http://www.apache.org/)."
+# Alternately, this acknowledgment may appear in the software itself,
+# if and wherever such third-party acknowledgments normally appear.
+#
+# 4. The names "Apache" and "Apache Software Foundation" must
+# not be used to endorse or promote products derived from this
+# software without prior written permission. For written
+# permission, please contact apache@apache.org.
+#
+# 5. Products derived from this software may not be called "Apache",
+# nor may "Apache" appear in their name, without prior written
+# permission of the Apache Software Foundation.
+#
+# THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+# ====================================================================
+#
+# This software consists of voluntary contributions made by many
+# individuals on behalf of the Apache Software Foundation. For more
+# information on the Apache Software Foundation, please see
+# <http://www.apache.org/>.
+#
+# Portions of this software are based upon public domain software
+# originally written at the National Center for Supercomputing Applications,
+# University of Illinois, Urbana-Champaign.
+# </@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::SQLBasedAddrList - SpamAssassin SQL Based Auto Whitelist
+
+=head1 SYNOPSIS
+
+ my $factory = Mail::SpamAssassin::SQLBasedAddrList->new()
+ $spamtest->set_persistent_addr_list_factory ($factory);
+ ... call into SpamAssassin classes...
+
+SpamAssassin will call:
+
+ my $addrlist = $factory->new_checker($spamtest);
+ $entry = $addrlist->get_addr_entry ($addr, $origip);
+ ...
+
+=head1 DESCRIPTION
+
+A SQL based persistent address list implementation.
+
+See C<Mail::SpamAssassin::PersistentAddrList> for more information.
+
+Uses DBI::DBD module access to your favorite database (tested with
+MySQL, SQLite and PostgreSQL) to store user auto-whitelists.
+
+The default table structure looks like this:
+CREATE TABLE awl (
+ username VARCHAR NOT NULL,
+ email VARCHAR NOT NULL,
+ ip VARCHAR NOT NULL,
+ count INT NOT NULL,
+ totscore FLOAT NOT NULL,
+ PRIMARY KEY (username, email, ip)
+)
+
+You're table definition may change depending on which database driver
+you choose. There is a config option to override the table name.
+
+This module introduces several new config variables:
+
+user_awl_dsn
+
+user_awl_sql_username
+
+user_awl_sql_password
+
+user_awl_sql_table
+
+see C<Mail::SpamAssassin::Conf> for more information.
+
+
+=cut
+
+package Mail::SpamAssassin::SQLBasedAddrList;
+
+use strict;
+use bytes;
+
+use DBI;
+
+use Mail::SpamAssassin::PersistentAddrList;
+
+use vars qw(@ISA);
+
+@ISA = qw(Mail::SpamAssassin::PersistentAddrList);
+
+=head2 new
+
+public class (Mail::SpamAssassin::SQLBasedAddrList) new ()
+
+Description:
+This method creates a new instance of the SQLBasedAddrList factory and calls
+the parent's (PersistentAddrList) new method.
+
+=cut
+
+sub new {
+ my ($proto) = @_;
+ my $class = ref($proto) || $proto;
+ my $self = $class->SUPER::new(@_);
+ $self->{class} = $class;
+ bless ($self, $class);
+ $self;
+}
+
+=head2 new_checker
+
+public instance (Mail::SpamAssassin::SQLBasedAddrList) new_checker (\% $main)
+
+Description:
+This method is called to setup a new checker interface and return a blessed
+copy of itself. Here is where we setup the SQL database connection based
+on the config values.
+
+=cut
+
+sub new_checker {
+ my ($self, $main) = @_;
+
+ my $class = $self->{class};
+
+ if (!$main->{conf}->{user_awl_dsn} ||
+ !$main->{conf}->{user_awl_sql_table}) {
+ dbg("auto-whitelist (sql-based): invalid config");
+ return undef;
+ }
+
+ my $dsn = $main->{conf}->{user_awl_dsn};
+ my $dbuser = $main->{conf}->{user_awl_sql_username};
+ my $dbpass = $main->{conf}->{user_awl_sql_password};
+
+ my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {'PrintError' => 0});
+
+ if(!$dbh) {
+ dbg("auto-whitelist (sql-based): Unable to Connect to DB");
+ return undef;
+ }
+
+ $self = { 'main' => $main,
+ 'dsn' => $dsn,
+ 'dbh' => $dbh,
+ 'tablename' => $main->{conf}->{user_awl_sql_table},
+ };
+
+ dbg("SQL Based AWL: Connected to $dsn");
+
+ return bless ($self, $class);
+}
+
+=head2 get_addr_entry
+
+public instance (\%) get_addr_entry (String $addr)
+
+Description:
+This method takes a given C<$addr> and splits it between the email address
+component and the ip component and performs a lookup in the database. If
+nothing is found in the database then a blank entry hash is created and
+returned, otherwise an entry containing the found information is returned.
+
+A key, C<exists_p>, is set to 1 if an entry already exists in the database,
+otherwise it is set to 0.
+
+=cut
+
+sub get_addr_entry {
+ my ($self, $addr) = @_;
+
+ my $entry = { addr => $addr,
+ exists_p => 0,
+ count => 0,
+ totscore => 0,
+ };
+
+ my ($email, $ip) = $self->_unpack_addr($addr);
+
+ return $entry unless ($email && $ip);
+
+ my $username = $self->{main}->{username};
+
+ my $sql = "SELECT count, totscore FROM $self->{tablename}
+ WHERE username = ? AND email = ? AND ip = ?";
+ my $sth = $self->{dbh}->prepare($sql);
+ my $rc = $sth->execute($username, $email, $ip);
+
+ if (!$rc) { # there was an error, but try to go on
+ my $err = $self->{dbh}->errstr;
+ dbg("auto-whitelist (sql-based) get_addr_entry: SQL Error: $err");
+ $entry->{count} = 0;
+ $entry->{totscore} = 0;
+ }
+ else {
+ my $aryref = $sth->fetchrow_arrayref();
+
+ if (defined($aryref)) { # we got some data back
+ $entry->{count} = $aryref->[0] || 0;
+ $entry->{totscore} = $aryref->[1] || 0;
+ $entry->{exists_p} = 1;
+ dbg("auto-whitelist (sql-based) get_addr_entry: Found existing entry for $addr");
+ }
+ else {
+ dbg("auto-whitelist (sql-based) get_addr_entry: No entry found for $addr");
+ }
+ }
+ $sth->finish();
+
+ dbg ("auto-whitelist (sql-based): $addr scores ".$entry->{count}.'/'.$entry->{totscore});
+
+ return $entry;
+}
+
+=head2 add_score
+
+public instance (\%) add_score (\% $entry, Integer $score)
+
+Description:
+This method adds a given C<$score> to a given C<$entry>. If the entry was
+marked as not existing in the database then an entry will be inserted,
+otherwise a simple update will be performed.
+
+NOTE: This code uses a self referential SQL call (ie set foo = foo + 1) which
+is supported by most modern database backends, but not everything calling
+itself a SQL database.
+
+=cut
+
+sub add_score {
+ my($self, $entry, $score) = @_;
+
+ return if (!$entry->{addr});
+
+ my ($email, $ip) = $self->_unpack_addr($entry->{addr});
+
+ $entry->{count} += 1;
+ $entry->{totscore} += $score;
+
+ return $entry unless ($email && $ip);
+
+ my $username = $self->{main}->{username};
+
+ if ($entry->{exists_p}) { # entry already exists, so just update
+ my $sql = "UPDATE $self->{tablename} SET count = count + 1,
+ totscore = totscore + ?
+ WHERE username = ? AND email = ? AND ip = ?";
+
+ my $sth = $self->{dbh}->prepare($sql);
+ my $rc = $sth->execute($score, $username, $email, $ip);
+
+ if (!$rc) {
+ my $err = $self->{dbh}->errstr;
+ dbg("auto-whitelist (sql-based) add_score: SQL Error: $err");
+ }
+ else {
+ dbg("auto-whitelist (sql-based) add_score: New count: ". $entry->{count} .", new totscore: ".$entry->{totscore}." for ".$entry->{addr});
+ }
+ $sth->finish();
+ }
+ else { # no entry yet, so insert a new entry
+ my $sql = "INSERT INTO $self->{tablename} (username,email,ip,count,totscore) VALUES (?,?,?,?,?)";
+ my $sth = $self->{dbh}->prepare($sql);
+ my $rc = $sth->execute($username,$email,$ip,1,$score);
+ if (!$rc) {
+ my $err = $self->{dbh}->errstr;
+ dbg("auto-whitelist (sql-based) add_score: SQL Error: $err");
+ }
+ $entry->{exists_p} = 1;
+ dbg("auto-whitelist (sql-based) add_score: Created new entry for ".$entry->{addr}." with totscore: $score");
+ $sth->finish();
+ }
+
+ return $entry;
+}
+
+=head2 remove_entry
+
+public instance () remove_entry (\% $entry)
+
+Description:
+This method removes a given C<$entry> from the database. If the
+ip portion of the entry address is equal to "none" then remove any
+perl-IP entries for this address as well.
+
+=cut
+
+sub remove_entry {
+ my ($self, $entry) = @_;
+
+ my ($email, $ip) = $self->_unpack_addr($entry->{addr});
+
+ return unless ($email && $ip);
+
+ my $username = $self->{main}->{username};
+
+ my $sql;
+ my @args;
+
+ # when $ip is equal to none then attempt to delete all entries
+ # associated with address
+ if ($ip eq 'none') {
+ $sql = "DELETE FROM $self->{tablename} WHERE username = ? AND email = ?";
+ @args = ($username, $email);
+ dbg("auto-whitelist (sql-based) remove_entry: Removing all entries matching $email");
+ }
+ else {
+ $sql = "DELETE FROM $self->{tablename}
+ WHERE username = ? AND email = ? AND ip = ?";
+ @args = ($username, $email, $ip);
+ dbg("auto-whitelist (sql-based) remove_entry: Removing single entry matching ".$entry->{addr});
+ }
+
+ my $sth = $self->{dbh}->prepare($sql);
+ my $rc = $sth->execute(@args);
+
+ if (!$rc) {
+ my $err = $self->{dbh}->errstr;
+ dbg("auto-whitelist (sql-based) remove_entry: SQL Error: $err");
+ }
+ else {
+ # We might normally have a dbg saying we removed the address
+ # but the common codepath already provides this in SpamAssassin.pm
+ }
+ $entry = undef; # slight cleanup since it is now gone
+}
+
+=head2 finish
+
+public instance () finish ()
+
+Description:
+This method provides the necessary cleanup for the address list.
+
+=cut
+
+sub finish {
+ my ($self) = @_;
+ dbg("auto-whitelist (sql-based) finish: Disconnected from " . $self->{dsn});
+ $self->{dbh}->disconnect();
+}
+
+=head2 _unpack_addr
+
+private instance (String, String) _unpack_addr(string $addr)
+
+Description:
+This method splits an autowhitelist address into it's two components,
+email and ip address.
+
+=cut
+
+sub _unpack_addr {
+ my ($self, $addr) = @_;
+
+ my ($email, $ip) = split(/\|ip=/, $addr);
+
+ unless ($email && $ip) {
+ dbg("auto-whitelist (sql-based): _unpack_addr: Unable to decode $addr");
+ }
+
+ return ($email, $ip);
+}
+
+sub dbg { Mail::SpamAssassin::dbg (@_); }
+
+1;

Modified: incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf (original)
+++ incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf Thu Jan 29 18:54:33 2004
@@ -442,3 +442,7 @@
body T_RM_BPT_LONGWORDS_99 /\b(?:\w{9,}\s+){9}/
describe T_RM_BPT_LONGWORDS_99 Long string of long words

+# "www" hidden as "%77%77%77", "ww%77", etc.
+rawbody T_HTTP_77 /http:\/\/.{0,2}[\%77]/
+describe T_HTTP_77 Contains a URL-encoded hostname (HTTP77)
+

Modified: incubator/spamassassin/trunk/sql/README
==============================================================================
--- incubator/spamassassin/trunk/sql/README (original)
+++ incubator/spamassassin/trunk/sql/README Thu Jan 29 18:54:33 2004
@@ -1,6 +1,6 @@

-Using SpamAssassin With An SQL Database
----------------------------------------
+Loading SpamAssassin User Preferences From An SQL Database
+----------------------------------------------------------

SpamAssassin can now load users' score files from an SQL database. The concept
here is to have a web application (PHP/perl/ASP/etc.) that will allow users to
@@ -77,12 +77,18 @@
you can use the entire recipient's email address, e.g. "user@example.com", and
use the full varchar(100).

-Included is a default table that can be safely used in your own setup. To
-use the default table, you must first create a database, and a username/password
-that can access that database. To install the table, use the following
-command:
+Included is a default table that can be safely used in your own setup. To use
+the default table, you must first create a database, and a username/password
+that can access that database.

-mysql -h <hostname> -u <username> -p <password> databasename < spamassasin.sql
+To create a database, if one does not already exist, see "Creating A Database
+In MySQL" below.
+
+
+To install the table, use the following command:
+
+mysql -h <hostname> -u <adminusername> -p <databasename> < spamassasin.sql
+Enter password: <adminpassword>

This will create the following table:

@@ -115,6 +121,21 @@

Also note that spamd may need the "-q" switch so it knows to look up users in
the SQL table instead of /etc/passwd. See "man spamd".
+
+
+Creating A Database In MySQL
+----------------------------
+
+Here's the command to create a database, and user/password pair to access
+it, for MySQL:
+
+mysql -h <hostname> -u <adminusername> -p
+Enter password: <adminpassword>
+mysql> use mysql;
+mysql> insert into user (Host, User, Password) values('localhost','<username>', password('<password>'));
+mysql> insert into db (Host, Db, User, Select_priv) values('localhost','<databasename>','<username>','Y');
+mysql> create database <databasename>;
+mysql> quit


Testing SpamAssassin/SQL

Added: incubator/spamassassin/trunk/sql/README.awl
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/README.awl Thu Jan 29 18:54:33 2004
@@ -0,0 +1,139 @@
+
+Using SpamAssassin Auto-Whitelists With An SQL Database
+-------------------------------------------------------
+
+SpamAssassin can now load users' auto-whitelists from a SQL database.
+The most common use for a system like this would be for users to be
+able to have per user auto-whitelists on systems where users may not
+have a home directory to store the whitelist DB files.
+
+In order to activate the SQL based auto-whitelist you have to
+configure spamassassin and spamd to use a different whitelist factory.
+This is done with the auto_whitelist_factory config variable, like
+so:
+
+auto_whitelist_factory Mail::SpamAssassin::SQLBasedAddrList
+
+SpamAssassin will check the global configuration file (ie. any file
+matching /etc/mail/spamassassin/*.cf) for the following settings:
+
+user_awl_dsn DBI:driver:database:hostname[:port]
+user_awl_sql_username dbusername
+user_awl_sql_password dbpassword
+
+The first option, user_awl_dsn, describes the data source name that
+will be used to create the connection to your SQL server. It MUST be
+in the format as listed above. <driver> should be the DBD driver that
+you have installed to access your database (initially tested with
+MySQL, PostgreSQL and SQLite). <database> must be the name of the
+database that you created to store the auto-whitelist table.
+<hostname> is the name of the host that contains the SQL database
+server. <port> is the optional port number where your database server
+is listening.
+
+user_awl_dsn DBI:mysql:spamassassin:localhost
+
+Would tell SpamAssassin to connect to the database named spamassassin using
+MySQL on the local server, and since <port> is omitted, the driver will use the
+default port number. The other two required options tells SpamAssassin to use
+the defined username and password to establish the connection.
+
+If the user_awl_dsn option does not exist, SpamAssassin will not attempt
+to use SQL for the auto-whitelist.
+
+One additional configuration option exists that allows you to set the
+table name for the auto-whitelist table.
+
+user_awl_sql_table awl
+
+Requirements
+------------
+
+In order for SpamAssassin to work with your SQL database, you must have
+the perl DBI module installed, AS WELL AS the DBD driver/module for your
+specific database. For example, if using MySQL as your RDBMS, you must have
+the Msql-Mysql module installed. Check CPAN for the latest versions of DBI
+and your database driver/module.
+
+We are currently using:
+
+DBI-1.20
+Msql-Mysql-modules-1.2219
+perl v5.6.1
+
+But older versions should work fine.
+
+
+Database Schema
+---------------
+
+The database must contain a table named by 'user_awl_sql_table' (default
+setting: "awl") with at least three fields:
+
+ username varchar(100) # this is the username whose e-mail is being filtered
+ email varchar(200) # this is the address key
+ ip varchar(10) # this is the ip key
+ count int(11) # this is the message counter
+ totscore float # this is the total calculated score
+
+You can add as many other fields you wish as long as the above fields
+are contained in the table.
+
+Included is a default table that can be safely used in your own setup. To use
+the default table, you must first create a database, and a username/password
+that can access that database. (See "Creating A Database In MySQL", in
+"sql/README", if you don't have a suitable database ready.)
+
+To install the table, use the following command:
+
+mysql -h <hostname> -u <adminusername> -p <databasename> < awl_mysql.sql
+Enter password: <adminpassword>
+
+This will create the following table:
+
+CREATE TABLE awl (
+ username varchar(100) NOT NULL default '',
+ email varchar(200) NOT NULL default '',
+ ip varchar(10) NOT NULL default '',
+ count int(11) default '0',
+ totscore float default '0',
+ PRIMARY KEY (username,email,ip)
+) TYPE=MyISAM;
+
+
+Once you have created the database and added the table, just add the required
+lines to your global configuration file (local.cf). Note that you
+must specify the proper whitelist factory in the config file in order
+for this to work and the current username must be passed to spamd.
+
+Testing SpamAssassin/SQL
+------------------------
+
+To test your SQL setup, and debug any possible problems, you should start
+spamd with the -D option, which will keep spamd in the foreground, and will
+output debug message to the terminal. You should then test spamd with a
+message by calling spamc. You can use the sample-spam.txt file with the
+following command:
+
+cat sample-spam.txt | spamc
+
+Watch the debug output from spamd and look for the following debug line:
+
+SQL Based AWL: Connected to <your dsn>
+
+If you do not see the above text, then the SQL query was not successful, and
+you should consult any error messages reported.
+
+This code has been tested using MySQL as the RDBMS, with basic tests
+against PostgreSQL and SQLite. It has been written with the utmost
+simplicity using DBI, and any database driver that conforms to the DBI
+interface and allows you to refer to a column on the right hand side
+of an expression (ie update foo set bar = bar + 1) should work with
+little or no problems. If you find a driver that has issues, please
+report them to the SADev list.
+
+******
+NB: This should be considered BETA, and the interface, schema, or overall
+operation of SQL support may change at any time with future releases of SA.
+******
+

Added: incubator/spamassassin/trunk/sql/README.bayes
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/README.bayes Thu Jan 29 18:54:33 2004
@@ -0,0 +1,125 @@
+
+Using A SQL Database for Bayesian Storage Module
+-------------------------------------------------------
+
+SpamAssassin can now store users' bayesian filter data in a SQL
+database. The most common use for a system like this would be for
+users to be able to have per user bayesian filter data on systems
+where users may not have a home directory to store the data.
+
+In order to activate the SQL based bayesian storage you have to
+configure spamassassin and spamd to use a different bayes storage
+module. This can be done via a setting in the global configuration
+file.
+
+The directives required to turn on the SQL based bayesian storage are:
+
+bayes_store_module Mail::SpamAssassin::BayesStoreSQL
+
+This directive is used by the Bayes module to determine which storage
+module should be used. If not set it will default to:
+Mail::SpamAssassin::BayesStoreDBM
+
+bayes_sql_dsn DBI:driver:database:hostname[:port]
+bayes_sql_username dbusername
+bayes_sql_password dbpassword
+
+The bayes_sql_dsn directive describes the data source name that will
+be used to create the connection to your SQL server. It MUST be in
+the format as listed above. <driver> should be the DBD driver that
+you have installed to access your database (initially tested with
+MySQL, PostgreSQL, SQLite, and DBD::CSV). <database> must be the name
+of the database that you created to store the bayes data
+tables. <hostname> is the name of the host that contains the SQL
+database server. <port> is the optional port number where your
+database server is listening.
+
+In addition to the global configuration directives there is a user
+preference:
+
+bayes_sql_override_username someusername
+
+This directive, if used, will override the username used for storing
+data in the database. This could be used to group users together to
+share bayesian filter data.
+
+Requirements
+------------
+
+In order for SpamAssassin to work with your SQL database, you must
+have the perl DBI module installed, AS WELL AS the DBD driver/module
+for your specific database. For example, if using MySQL as your
+RDBMS, you must have the DBD::mysql module installed. Check CPAN for
+the latest versions of DBI and your database driver/module.
+
+The BayesStoreSQL module was tested with:
+
+DBI-1.38
+DBD-mysql-2.9002
+perl v5.8.0
+
+But older versions should work fine as the SQL code in SpamAssassin is as
+simple as could be.
+
+Database Schema
+---------------
+
+The database schema for storage of the bayesian filter data contains
+several different tables. Several sample SQL schemas have been
+included in to help in setting up your database. The schemas contain
+the minimum tables and columns necessary to work with the code as
+written. You are free to add other columns as needed for your local
+implementation. Presently there is no way to override the table and
+column names used by the BayesStoreSQL code, this feature may be added
+in the future.
+
+Example setup of bayes tables for MySQL:
+
+This assumes that you have already created a database for use with
+spamassassin and setup a username/password that can access that database.
+(See "Creating A Database In MySQL", in "sql/README", if you don't have a
+suitable database ready.)
+
+To install the tables using the included example, use the following command:
+
+mysql -h <hostname> -u <adminusername> -p databasename < bayes_mysql.sql
+Enter password: <adminpassword>
+
+Once you have created the database and added the tables, just add the
+required lines to your global configuration file (local.cf).
+
+Testing SpamAssassin/SQL
+------------------------
+
+To test your SQL setup, and debug any possible problems, you should
+start spamd with the -D option, which will keep spamd in the
+foreground, and will output debug message to the terminal. You should
+then test spamd with a message by calling spamc. You can use the
+sample-spam.txt file with the following command:
+
+cat sample-spam.txt | spamc
+
+Watch the debug output from spamd and look for the following debug
+line:
+
+debug: bayes: Database connection established
+debug: bayes: Using username: <username>
+
+If you do not see the above text, then the SQL query was not
+successful, and you should see any error messages reported.
+
+This code has been tested using MySQL as the RDMS, with basic tests
+against PostgreSQL and SQLite. It does require a database that allows
+you to refer to a column on the right hand side of an expression (ie
+update foo set bar = bar + 1). Any database driver that allows for
+that usage should work with the BayesStoreSQL code. NOTE: You may
+find that some implementations do not provide a significant advantage
+over using the default DBM implementation. If you find a driver that
+should work and has issues, please report them to the SADev list.
+
+******
+NB: This should be considered BETA, and the interface, schema, or
+overall operation of SQL support may change at any time with future
+releases of SA.
+******
+

Added: incubator/spamassassin/trunk/sql/awl_mysql.sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/awl_mysql.sql Thu Jan 29 18:54:33 2004
@@ -0,0 +1,8 @@
+CREATE TABLE awl (
+ username varchar(100) NOT NULL default '',
+ email varchar(200) NOT NULL default '',
+ ip varchar(10) NOT NULL default '',
+ count int(11) default '0',
+ totscore float default '0',
+ PRIMARY KEY (username,email,ip)
+) TYPE=MyISAM;

Added: incubator/spamassassin/trunk/sql/bayes_mysql.sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/bayes_mysql.sql Thu Jan 29 18:54:33 2004
@@ -0,0 +1,41 @@
+
+CREATE TABLE bayes_expire (
+ username varchar(200) NOT NULL default '',
+ runtime int(11) NOT NULL default '0',
+ KEY bayes_expire_idx1 (username)
+) TYPE=MyISAM;
+
+CREATE TABLE bayes_global_vars (
+ variable varchar(30) NOT NULL default '',
+ value varchar(200) NOT NULL default '',
+ PRIMARY KEY (variable)
+) TYPE=MyISAM;
+
+INSERT INTO bayes_global_vars VALUES ('VERSION','2');
+
+CREATE TABLE bayes_seen (
+ username varchar(200) NOT NULL default '',
+ msgid varchar(200) binary NOT NULL default '',
+ flag char(1) NOT NULL default '',
+ PRIMARY KEY (username,msgid),
+ KEY bayes_seen_idx1 (username,flag)
+) TYPE=MyISAM;
+
+CREATE TABLE bayes_token (
+ username varchar(200) NOT NULL default '',
+ token varchar(200) binary NOT NULL default '',
+ spam_count int(11) NOT NULL default '0',
+ ham_count int(11) NOT NULL default '0',
+ atime int(11) NOT NULL default '0',
+ PRIMARY KEY (username,token)
+) TYPE=MyISAM;
+
+CREATE TABLE bayes_vars (
+ username varchar(200) NOT NULL default '',
+ spam_count int(11) NOT NULL default '0',
+ ham_count int(11) NOT NULL default '0',
+ last_expire int(11) NOT NULL default '0',
+ last_atime_delta int(11) NOT NULL default '0',
+ last_expire_reduce int(11) NOT NULL default '0',
+ PRIMARY KEY (username)
+) TYPE=MyISAM;

Added: incubator/spamassassin/trunk/sql/bayes_pg.sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/bayes_pg.sql Thu Jan 29 18:54:33 2004
@@ -0,0 +1,43 @@
+
+CREATE TABLE bayes_expire (
+ username varchar(200) NOT NULL default '',
+ runtime integer NOT NULL default '0'
+);
+
+CREATE INDEX bayes_expire_idx1 ON bayes_expire (username);
+
+CREATE TABLE bayes_global_vars (
+ variable varchar(30) NOT NULL default '',
+ value varchar(200) NOT NULL default '',
+ PRIMARY KEY (variable)
+);
+
+INSERT INTO bayes_global_vars VALUES ('VERSION','2');
+
+CREATE TABLE bayes_seen (
+ username varchar(200) NOT NULL default '',
+ msgid varchar(200) NOT NULL default '',
+ flag character(1) NOT NULL default '',
+ PRIMARY KEY (username,msgid)
+);
+
+CREATE INDEX bayes_seen_idx1 ON bayes_seen (username, flag);
+
+CREATE TABLE bayes_token (
+ username varchar(200) NOT NULL default '',
+ token varchar(200) NOT NULL default '',
+ spam_count integer NOT NULL default '0',
+ ham_count integer NOT NULL default '0',
+ atime integer NOT NULL default '0',
+ PRIMARY KEY (username,token)
+);
+
+CREATE TABLE bayes_vars (
+ username varchar(200) NOT NULL default '',
+ spam_count integer NOT NULL default '0',
+ ham_count integer NOT NULL default '0',
+ last_expire integer NOT NULL default '0',
+ last_atime_delta integer NOT NULL default '0',
+ last_expire_reduce integer NOT NULL default '0',
+ PRIMARY KEY (username)
+);

Added: incubator/spamassassin/trunk/sql/bayes_sqlite.sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/bayes_sqlite.sql Thu Jan 29 18:54:33 2004
@@ -0,0 +1,43 @@
+
+CREATE TABLE bayes_expire (
+ username varchar(200) NOT NULL default '',
+ runtime int(11) NOT NULL default '0',
+);
+
+CREATE INDEX bayes_expire_idx1 ON bayes_expire (username);
+
+CREATE TABLE bayes_global_vars (
+ variable varchar(30) NOT NULL default '',
+ value varchar(200) NOT NULL default '',
+ PRIMARY KEY (variable)
+);
+
+INSERT INTO bayes_global_vars VALUES ('VERSION','2');
+
+CREATE TABLE bayes_seen (
+ username varchar(200) NOT NULL default '',
+ msgid varchar(200) NOT NULL default '',
+ flag varchar(10) NOT NULL default '',
+ PRIMARY KEY (username,msgid)
+);
+
+CREATE INDEX bayes_seen_idx1 ON bayes_seen (username, flag);
+
+CREATE TABLE bayes_token (
+ username varchar(200) NOT NULL default '',
+ token varchar(200) NOT NULL default '',
+ spam_count int(11) NOT NULL default '0',
+ ham_count int(11) NOT NULL default '0',
+ atime int(11) NOT NULL default '0',
+ PRIMARY KEY (username,token)
+);
+
+CREATE TABLE bayes_vars (
+ username varchar(200) NOT NULL default '',
+ spam_count int(11) NOT NULL default '0',
+ ham_count int(11) NOT NULL default '0',
+ last_expire int(11) NOT NULL default '0',
+ last_atime_delta int(11) NOT NULL default '0',
+ last_expire_reduce int(11) NOT NULL default '0',
+ PRIMARY KEY (username)
+);

Modified: incubator/spamassassin/trunk/t/SATest.pm
==============================================================================
--- incubator/spamassassin/trunk/t/SATest.pm (original)
+++ incubator/spamassassin/trunk/t/SATest.pm Thu Jan 29 18:54:33 2004
@@ -50,15 +50,22 @@
$spamc = $ENV{'SPAMC_SCRIPT'};
$spamc ||= "../spamc/spamc";

+ $salearn = $ENV{'SALEARN_SCRIPT'};
+ $salearn ||= "$perl_cmd ../sa-learn";
+
$spamdport = $ENV{'SPAMD_PORT'};
$spamdport ||= 48373; # whatever
$spamd_cf_args = "-C log/test_rules_copy";
$spamd_localrules_args = " --siteconfigpath log/localrules.tmp";
$scr_localrules_args = " --siteconfigpath log/localrules.tmp";
+ $salearn_localrules_args = " --siteconfigpath log/localrules.tmp";

$scr_cf_args = "-C log/test_rules_copy";
$scr_pref_args = "-p log/test_default.cf";
+ $salearn_cf_args = "-C log/test_rules_copy";
+ $salearn_pref_args = "-p log/test_default.cf";
$scr_test_args = "";
+ $salearn_test_args = "";
$set_test_prefs = 0;
$default_cf_lines = "
bayes_path ./log/user_state/bayes
@@ -164,6 +171,40 @@
1;
}

+# Run salearn. Calls back with the output.
+# in $args: arguments to run with
+# in $read_sub: callback for the output (should read from <IN>).
+# This is called with no args.
+#
+# out: $salearn_exitcode global: exitcode from sitescooper
+# ret: undef if sitescooper fails, 1 for exit 0
+#
+sub salearnrun {
+ my $args = shift;
+ my $read_sub = shift;
+
+ rmtree ("log/outputdir.tmp"); # some tests use this
+ mkdir ("log/outputdir.tmp", 0755);
+
+ %found = ();
+ %found_anti = ();
+
+ if (defined $ENV{'SA_ARGS'}) {
+ $args = $ENV{'SA_ARGS'} . " ". $args;
+ }
+ $args = "$salearn_cf_args $salearn_localrules_args $salearn_pref_args $salearn_test_args $args";
+
+ # added fix for Windows tests from Rudif
+ my $salearnargs = "$salearn $args";
+ $salearnargs =~ s!/!\\!g if ($^O =~ /^MS(DOS|Win)/i);
+ print ("\t$salearnargs\n");
+ system ("$salearnargs > log/$testname.${Test::ntest}");
+ $salearn_exitcode = ($?>>8);
+ if ($salearn_exitcode != 0) { return undef; }
+ &checkfile ("$testname.${Test::ntest}", $read_sub);
+ 1;
+}
+
sub scrun {
$spamd_never_started = 1;
spamcrun (@_);
@@ -351,6 +392,31 @@
return $killed;
}
}
+
+sub create_saobj {
+ my ($args) = shift; # lets you override/add arguments
+
+ # YUCK, these file/dir names should be some sort of variable, at
+ # least we keep their definition in the same file for the moment.
+ my %setup_args = ( rules_filename => 'log/test_rules_copy',
+ site_rules_filename => 'log/localrules.tmp',
+ userprefs_filename => 'log/test_default.cf',
+ userstate_dir => 'log/user_state',
+ );
+
+ # override default args
+ foreach my $arg (keys %$args) {
+ $setup_args{$arg} = $args->{$arg};
+ }
+
+ # We'll assume that the test has setup INC correctly
+ require Mail::SpamAssassin;
+
+ my $sa = Mail::SpamAssassin->new(\%setup_args);
+
+ return $sa;
+}
+

# ---------------------------------------------------------------------------


Added: incubator/spamassassin/trunk/t/bayesdbm.t
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/t/bayesdbm.t Thu Jan 29 18:54:33 2004
@@ -0,0 +1,275 @@
+#!/usr/bin/perl
+
+use Data::Dumper;
+use lib '.'; use lib 't';
+use SATest; sa_t_init("bayes");
+use Test;
+
+BEGIN {
+ if (-e 't/test_dir') {
+ chdir 't';
+ }
+
+ if (-e 'test_dir') {
+ unshift(@INC, '../blib/lib');
+ }
+
+ plan tests => 43
+};
+
+tstlocalrules ("
+ bayes_learn_to_journal 0
+");
+
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::MsgParser;
+
+my $sa = create_saobj();
+
+$sa->init();
+
+ok($sa);
+
+ok($sa->{bayes_scanner});
+
+ok(!$sa->{bayes_scanner}->is_scan_available());
+
+open(MAIL,"< data/spam/001");
+
+my $raw_message = do {
+ local $/;
+ <MAIL>;
+};
+
+close(MAIL);
+ok($raw_message);
+
+my @msg;
+foreach my $line (split(/^/m,$raw_message)) {
+ $line =~ s/\r$//;
+ push(@msg, $line);
+}
+
+my $mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+ok($mail);
+
+my $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+ok($body);
+
+my ($wc, @toks) = $sa->{bayes_scanner}->tokenize($mail, $body);
+
+ok($wc > 0);
+
+ok(scalar(@toks) > 0);
+
+my $msgid = $sa->{bayes_scanner}->get_msgid($mail);
+
+ok($msgid eq '9PS291LhupY');
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->learn(1, $mail));
+
+ok(!$sa->{bayes_scanner}->learn(1, $mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's');
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+my $tokerror = 0;
+foreach my $tok (@toks) {
+ my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+ if ($spam == 0 || $ham > 0) {
+ $tokerror = 1;
+ }
+}
+ok(!$tokerror);
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->learn(0, $mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h');
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+$tokerror = 0;
+foreach my $tok (@toks) {
+ my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+ if ($spam > 0 || $ham == 0) {
+ $tokerror = 1;
+ }
+}
+ok(!$tokerror);
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->forget($mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+undef $sa;
+
+sa_t_init('bayes'); # this wipes out what is there and begins anew
+
+# make sure we learn to a journal
+tstlocalrules ("
+ bayes_learn_to_journal 1
+");
+
+$sa = create_saobj();
+
+$sa->init();
+
+# Slight cheat here, because when you learn only to journal it fails
+# to actually create the bayes_toks and bayes_seen files because we
+# are tieing read only, this will create the files for us and allow
+# things to continue
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!-e 'log/user_state/bayes_journal');
+
+ok($sa->{bayes_scanner}->learn(1, $mail));
+
+ok(-e 'log/user_state/bayes_journal');
+
+$sa->{bayes_scanner}->sync(1); # always returns 0, so no need to check return
+
+ok(!-e 'log/user_state/bayes_journal');
+
+ok(-e 'log/user_state/bayes_seen');
+
+ok(-e 'log/user_state/bayes_toks');
+
+undef $sa;
+
+sa_t_init('bayes'); # this wipes out what is there and begins anew
+
+# make sure we learn to a journal
+tstlocalrules ("
+bayes_learn_to_journal 0
+bayes_min_spam_num 10
+bayes_min_ham_num 10
+");
+
+# we get to bastardize the existing pattern matching code here. It lets us provide
+# our own checking callback and keep using the existing ok_all_patterns call
+%patterns = ( 1 => 'Learned from message' );
+
+ok(salearnrun("--spam data/spam", \&check_examined));
+ok_all_patterns();
+
+ok(salearnrun("--ham data/nice", \&check_examined));
+ok_all_patterns();
+
+ok(salearnrun("--ham data/whitelists", \&check_examined));
+ok_all_patterns();
+
+%patterns = ( 'non-token data: bayes db version' => 'db version' );
+ok(salearnrun("--dump magic", \&patterns_run_cb));
+ok_all_patterns();
+
+use constant SCAN_USING_PERL_CODE_TEST => 1;
+# jm: off! not working for some reason. Mind you, this is
+# not a supported way to call these APIs! so no biggie
+
+if (SCAN_USING_PERL_CODE_TEST) {
+$sa = create_saobj();
+
+$sa->init();
+
+open(MAIL,"< ../sample-nonspam.txt");
+
+$raw_message = do {
+ local $/;
+ <MAIL>;
+};
+
+close(MAIL);
+
+@msg = ();
+foreach my $line (split(/^/m,$raw_message)) {
+ $line =~ s/\r$//;
+ push(@msg, $line);
+}
+
+$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+ok($msgstatus);
+
+my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+
+# Pretty much we can't count on the data returned with such little training
+# so just make sure that the score wasn't equal to .5 which is the default
+# return value.
+print "\treturned score: $score\n";
+ok($score != .5);
+
+open(MAIL,"< ../sample-spam.txt");
+
+$raw_message = do {
+ local $/;
+ <MAIL>;
+};
+
+close(MAIL);
+
+@msg = ();
+foreach my $line (split(/^/m,$raw_message)) {
+ $line =~ s/\r$//;
+ push(@msg, $line);
+}
+
+$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+$msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+
+# Pretty much we can't count on the data returned with such little training
+# so just make sure that the score wasn't equal to .5 which is the default
+# return value.
+ok($score != .5);
+
+}
+
+sub check_examined {
+ local ($_);
+ my $string = shift;
+
+ if (defined $string) {
+ $_ = $string;
+ } else {
+ $_ = join ('', <IN>);
+ }
+
+ if ($_ =~ /Learned from \d+ message\(s\) \(\d+ message\(s\) examined\)/) {
+ $found{'Learned from message'}++;
+ }
+}
+
+

Added: incubator/spamassassin/trunk/t/bayessql.t
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/t/bayessql.t Thu Jan 29 18:54:33 2004
@@ -0,0 +1,310 @@
+#!/usr/bin/perl
+
+use lib '.'; use lib 't';
+use SATest;
+use Test;
+use DBI; # for our cleanup stuff
+
+use constant TEST_ENABLED => (-e 'bayessql.cf' || -e 't/bayessql.cf');
+
+BEGIN {
+ if (-e 't/test_dir') {
+ chdir 't';
+ }
+
+ if (-e 'test_dir') {
+ unshift(@INC, '../blib/lib');
+ }
+
+ plan tests => (TEST_ENABLED ? 38 : 0);
+
+ onfail => sub {
+ warn "\n\nNote: Failure may be due to an incorrect config.";
+ }
+};
+
+exit unless TEST_ENABLED;
+
+my $dbconfig;
+my $dbdsn;
+my $dbusername;
+my $dbpassword;
+
+open(CONFIG,"<bayessql.cf");
+while (my $line = <CONFIG>) {
+ $dbconfig .= $line;
+ if ($line =~ /^bayes_sql_dsn (.*)/) {
+ $dbdsn = $1;
+ chomp($dbdsn);
+ }
+ elsif ($line =~ /^bayes_sql_username (.*)/) {
+ $dbusername = $1;
+ chomp($dbusername);
+ }
+ elsif ($line =~ /^bayes_sql_password (.*)/) {
+ $dbpassword = $1;
+ chomp($dbpassword);
+ }
+}
+close(CONFIG);
+
+my $testuser = 'tstusr.'.$$.'.'.time();
+
+sa_t_init("bayes");
+
+tstlocalrules ("
+bayes_store_module Mail::SpamAssassin::BayesStoreSQL
+$dbconfig
+bayes_sql_override_username $testuser
+");
+
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::MsgParser;
+
+my $sa = create_saobj();
+
+$sa->init();
+
+ok($sa);
+
+ok($sa->{bayes_scanner});
+
+ok(!$sa->{bayes_scanner}->is_scan_available());
+
+open(MAIL,"< data/spam/001");
+
+my $raw_message = do {
+ local $/;
+ <MAIL>;
+};
+
+close(MAIL);
+ok($raw_message);
+
+my @msg;
+foreach my $line (split(/^/m,$raw_message)) {
+ $line =~ s/\r$//;
+ push(@msg, $line);
+}
+
+my $mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+ok($mail);
+
+my $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+ok($body);
+
+my ($wc, @toks) = $sa->{bayes_scanner}->tokenize($mail, $body);
+
+ok($wc > 0);
+
+ok(scalar(@toks) > 0);
+
+my $msgid = $sa->{bayes_scanner}->get_msgid($mail);
+
+ok($msgid eq '9PS291LhupY');
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->learn(1, $mail));
+
+ok(!$sa->{bayes_scanner}->learn(1, $mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's');
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+my $tokerror = 0;
+foreach my $tok (@toks) {
+ my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+ if ($spam == 0 || $ham > 0) {
+ $tokerror = 1;
+ }
+}
+ok(!$tokerror);
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->learn(0, $mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h');
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+$tokerror = 0;
+foreach my $tok (@toks) {
+ my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+ if ($spam > 0 || $ham == 0) {
+ $tokerror = 1;
+ }
+}
+ok(!$tokerror);
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->forget($mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+undef $sa;
+
+ok(cleanupdb());
+
+sa_t_init('bayes'); # this wipes out what is there and begins anew
+
+# make sure we learn to a journal
+tstlocalrules ("
+bayes_store_module Mail::SpamAssassin::BayesStoreSQL
+$dbconfig
+bayes_min_spam_num 10
+bayes_min_ham_num 10
+bayes_sql_override_username $testuser
+");
+
+# we get to bastardize the existing pattern matching code here. It lets us provide
+# our own checking callback and keep using the existing ok_all_patterns call
+%patterns = ( 1 => 'Learned from message' );
+
+ok(salearnrun("--spam data/spam", \&check_examined));
+ok_all_patterns();
+
+ok(salearnrun("--ham data/nice", \&check_examined));
+ok_all_patterns();
+
+ok(salearnrun("--ham data/whitelists", \&check_examined));
+ok_all_patterns();
+
+%patterns = ( 'non-token data: bayes db version' => 'db version' );
+ok(salearnrun("--dump magic", \&patterns_run_cb));
+ok_all_patterns();
+
+
+use constant SCAN_USING_PERL_CODE_TEST => 1;
+# jm: off! not working for some reason. Mind you, this is
+# not a supported way to call these APIs! so no biggie
+
+if (SCAN_USING_PERL_CODE_TEST) {
+$sa = create_saobj();
+
+$sa->init();
+
+open(MAIL,"< ../sample-nonspam.txt");
+
+$raw_message = do {
+ local $/;
+ <MAIL>;
+};
+
+close(MAIL);
+
+@msg = ();
+foreach my $line (split(/^/m,$raw_message)) {
+ $line =~ s/\r$//;
+ push(@msg, $line);
+}
+
+$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+ok($msgstatus);
+
+my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+
+# Pretty much we can't count on the data returned with such little training
+# so just make sure that the score wasn't equal to .5 which is the default
+# return value.
+ok($score != .5);
+
+open(MAIL,"< ../sample-spam.txt");
+
+$raw_message = do {
+ local $/;
+ <MAIL>;
+};
+
+close(MAIL);
+
+@msg = ();
+foreach my $line (split(/^/m,$raw_message)) {
+ $line =~ s/\r$//;
+ push(@msg, $line);
+}
+
+$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+$msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+
+# Pretty much we can't count on the data returned with such little training
+# so just make sure that the score wasn't equal to .5 which is the default
+# return value.
+ok($score != .5);
+}
+
+
+ok(cleanupdb());
+
+sub check_examined {
+ local ($_);
+ my $string = shift;
+
+ if (defined $string) {
+ $_ = $string;
+ } else {
+ $_ = join ('', <IN>);
+ }
+
+ if ($_ =~ /Learned from \d+ message\(s\) \(\d+ message\(s\) examined\)/) {
+ $found{'Learned from message'}++;
+ }
+}
+
+
+sub cleanupdb {
+ my $rv;
+ my $error = 0;
+
+ my $dbh = DBI->connect($dbdsn,$dbusername,$dbpassword);
+
+ if (!defined($dbh)) {
+ return 0;
+ }
+
+ $rv = $dbh->do("DELETE FROM bayes_vars WHERE username = ?", undef, $testuser);
+ if (!defined($rv)) {
+ $error = 1;
+ }
+ $rv = $dbh->do("DELETE FROM bayes_seen WHERE username = ?", undef, $testuser);
+ if (!defined($rv)) {
+ $error = 1;
+ }
+ $rv = $dbh->do("DELETE FROM bayes_token WHERE username = ?", undef, $testuser);
+ if (!defined($rv)) {
+ $error = 1;
+ }
+ $rv = $dbh->do("DELETE FROM bayes_expire WHERE username = ?", undef, $testuser);
+ return !$error;
+}

Added: incubator/spamassassin/trunk/t/sql_based_whitelist.t
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/t/sql_based_whitelist.t Thu Jan 29 18:54:33 2004
@@ -0,0 +1,65 @@
+#!/usr/bin/perl
+
+use lib '.'; use lib 't';
+use SATest;
+
+use constant TEST_ENABLED => (-e 't/sql_based_whitelist.cf'
+ || -e 'sql_based_whitelist.cf');
+
+use Test;
+
+BEGIN { plan tests => (TEST_ENABLED ? 10 : 0),
+ onfail => sub {
+ warn "\n\nNote: Failure may be due to an incorrect config";
+ }
+ };
+
+exit unless TEST_ENABLED;
+
+sa_t_init("sql_based_whitelist");
+
+open(CONFIG,"<sql_based_whitelist.cf");
+while (my $line = <CONFIG>) {
+ $dbconfig .= $line;
+}
+close(CONFIG);
+
+tstlocalrules ("
+auto_whitelist_factory Mail::SpamAssassin::SQLBasedAddrList
+$dbconfig
+");
+
+# ---------------------------------------------------------------------------
+
+%is_nonspam_patterns = (
+q{ Subject: Re: [SAtalk] auto-whitelisting}, 'subj',
+);
+%is_spam_patterns = (
+q{Subject: 4000 Your Vacation Winning !}, 'subj',
+);
+
+%is_spam_patterns2 = (
+q{ X-Spam-Status: Yes}, 'status',
+);
+
+
+%patterns = %is_nonspam_patterns;
+
+ok (sarun ("--remove-addr-from-whitelist whitelist_test\@whitelist.spamassassin.taint.org", \&patterns_run_cb));
+
+# 3 times, to get into the whitelist:
+ok (sarun ("-L -t < data/nice/002", \&patterns_run_cb));
+ok (sarun ("-L -t < data/nice/002", \&patterns_run_cb));
+ok (sarun ("-L -t < data/nice/002", \&patterns_run_cb));
+
+# Now check
+ok (sarun ("-L -t < data/nice/002", \&patterns_run_cb));
+ok_all_patterns();
+
+%patterns = %is_spam_patterns;
+ok (sarun ("-L -t < data/spam/004", \&patterns_run_cb));
+ok_all_patterns();
+
+%patterns = %is_spam_patterns2;
+ok (sarun ("-L -t < data/spam/007", \&patterns_run_cb));
+ok_all_patterns();

Added: incubator/spamassassin/trunk/tools/convert_awl_dbm_to_sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/tools/convert_awl_dbm_to_sql Thu Jan 29 18:54:33 2004
@@ -0,0 +1,100 @@
+#!/usr/bin/perl
+
+# WARNING: This script is VERY rough and provided only as a template
+# for moving a DB based autowhitelist to a SQL based one. You should
+# do backups and that sort of thing before attempting to use this
+# script.
+
+use strict;
+use Fcntl;
+
+use Getopt::Long;
+
+use DBI;
+
+use DB_File ;
+use vars qw( %h $k $v ) ;
+
+sub usage {
+ print "This program takes the following required arguments:\n";
+ print "--username <username> - This is who's whitelist you are loading.\n";
+ print " It should match exactly what spassassin or\n";
+ print " spamd will be using.\n";
+ print "--dsn <dsn> - This is the database DSN. It should be in the form:\n";
+ print " DBI:driver:database:hostname[:port]\n";
+ print " Consult your database drivers docs for more info.\n";
+ print "--ok - Basically a sanity check that you understand how dangerous this script is.\n";
+ print "\n";
+ print "This program take the following optional arguments:\n";
+ print "--dbautowhitelist <path> - path to the auto-whitelist you wish to\n";
+ print " convert. Default is to use \n";
+ print " \$ENV{HOME}/.spamassassin/auto-whitelist\n";
+ print "--sqlusername <username> - Needed if your DBI driver requires a username.\n";
+ print "--sqlpassword <password> - Needed if your DBI driver requires a password.\n";
+ print "\n\n";
+ print "WARNING: This script is VERY rough and not well tested. You should\n";
+ print "use extreme caution when working with it. Including backing up your\n";
+ print "data and all that other good stuff.\n";
+ print "Passing of the --ok flag means you read this warning.\n";
+ print "\n";
+ exit;
+}
+
+my %opt;
+
+GetOptions('dsn=s' => \$opt{'dsn'},
+ 'sqlusername=s' => \$opt{'sqlusername'},
+ 'sqlpassword=s' => \$opt{'sqlpassword'},
+ 'dbautowhitelist=s' => \$opt{'dbautowhitelist'},
+ 'username=s' => \$opt{'username'},
+ 'help' => \$opt{'help'},
+ 'ok' => \$opt{'ok'},
+ );
+
+if ($opt{'help'}) {
+ usage();
+}
+
+if (!$opt{'ok'}) {
+ usage();
+}
+
+
+if (!$opt{'username'} || !$opt{'dsn'}) {
+ usage();
+}
+
+my $db;
+if ($opt{'dbautowhitelist'}) {
+ $db = $opt{'dbautowhitelist'};
+}
+else {
+ $db = $ENV{HOME}."/.spamassassin/auto-whitelist";
+}
+
+tie %h, "DB_File",$db, O_RDONLY,0600
+ or die "Cannot open file $db: $!\n";
+
+my $dbh = DBI->connect($opt{'dsn'}, $opt{'sqlusername'}, $opt{'sqlpassword'})
+ or die $DBI::errstr;
+
+my $sth = $dbh->prepare("DELETE FROM awl WHERE username = ?");
+$sth->execute($opt{'username'});
+
+my $sth = $dbh->prepare("INSERT INTO awl (username,email,ip,count,totscore) VALUES (?,?,?,?,?)");
+
+my @k = grep(!/totscore$/,keys(%h));
+for my $key (@k) {
+ my $totscore = $h{"$key|totscore"};
+ my $count = $h{$key};
+ if(defined($totscore)) {
+ my ($email, $ip) = split(/\|ip=/, $key);
+
+ if ($email && $ip) {
+ my $rc = $sth->execute($opt{'username'}, $email, $ip, $count, $totscore);
+ printf "% 8.1f %15s -- %s\n", $totscore/$count, (sprintf "(%.1f/%d)",$totscore/$count, $count), $key;
+ }
+ }
+}
+untie %h;
+$dbh->disconnect();

Added: incubator/spamassassin/trunk/tools/convert_bayes_dbm_to_sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/tools/convert_bayes_dbm_to_sql Thu Jan 29 18:54:33 2004
@@ -0,0 +1,242 @@
+#!/usr/bin/perl -w
+
+# WARNING: This script is VERY rough and provided only as a template
+# for moving the DB based bayes files to a SQL database. You should
+# do backups and that sort of thing before attempting to use this
+# script.
+
+# Also, no sort of locking is provided so it's suggested that you make
+# sure nothing is accessing the bayes files you are attempting to
+# convert.
+
+use strict;
+
+use DB_File;
+use DBI;
+
+use Getopt::Long;
+
+use vars qw( %toks_db %seen_db $opt_dbpath $opt_username $opt_dsn $opt_ok
+ $opt_dbusername $opt_dbpassword $opt_help $last_atime_delta
+ $last_expire $last_expire_reduce $ham_count $spam_count);
+
+
+# These are the magic tokens we use to track stuff in the DB.
+# The format is '^M^A^G^I^C' followed by any string you want.
+# None of the control chars will be in a real token.
+my $DB_VERSION_MAGIC_TOKEN = "\015\001\007\011\003DBVERSION";
+my $LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
+my $LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
+my $LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
+my $LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
+my $NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
+my $NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
+my $NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
+my $NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
+my $OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
+my $RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
+
+
+GetOptions("dbpath=s",
+ "username=s",
+ "dsn=s",
+ "dbusername=s",
+ "dbpassword=s",
+ "help",
+ "ok");
+
+sub usage {
+ print "This program takes the following arguments:\n";
+ print "--username <username> - This is who's bayes data you are loading.\n";
+ print " It should match exactly what spassassin or\n";
+ print " spamd will be using.\n";
+ print "--dsn <dsn> - This is the database DSN. It should be in the form:\n";
+ print " DBI:driver:database:hostname[:port]\n";
+ print " Consult your database drivers docs for more info.\n";
+ print "--dbpath <path> - path to the bayes files you wish to\n";
+ print " convert. Default is to use \n";
+ print " \$ENV{HOME}/.spamassassin/bayes\n";
+ print " A _toks and _seen will be added to the given path.\n";
+ print "--dbusername <username> - Needed if your DBI driver requires a username.\n";
+ print "--dbpassword <password> - Needed if your DBI driver requires a password.\n";
+ print "--ok - Basically a sanity check that you understand how dangerous this script is.\n";
+ print "\n\n";
+ print "WARNING: This script is VERY rough and not well tested. You should\n";
+ print "use extreme caution when working with it. Including backing up your\n";
+ print "data and all that other good stuff.\n";
+ print "Passing of the --ok flag means you read this warning.\n";
+ print "\n";
+ exit;
+}
+
+usage() if ($opt_help);
+
+if (!$opt_ok) {
+ usage();
+}
+
+my $path = $opt_dbpath || $ENV{HOME}."/.spamassassin/bayes";
+my $dsn = $opt_dsn || "dbi:mysql:spamassassin:";
+
+my $username = $opt_username || `whoami`;
+chomp($username);
+
+my $toks_name = "${path}_toks";
+my $seen_name = "${path}_seen";
+
+print "Converting DBM bayes database to SQL database for $username.\n";
+
+my $dbh = DBI->connect($dsn, $opt_dbusername, $opt_dbpassword);
+
+unless ($dbh) {
+ print "Unable to connect to database ($dsn): ".DBI->errstr()."\n";
+ exit;
+}
+
+my ($varcount) = $dbh->selectrow_array("SELECT count(*) FROM bayes_vars WHERE username = '$username'");
+
+my ($tokcount) = $dbh->selectrow_array("SELECT count(*) FROM bayes_token WHERE username = '$username'");
+
+if ($varcount || $tokcount) {
+ print "User: $username has existing data, please remove then re-run.\n";
+ exit;
+}
+
+tie %toks_db, "DB_File", $toks_name, O_RDONLY, 0600
+ or die "Cannot open file $toks_name: $!\n";
+
+if ($toks_db{$DB_VERSION_MAGIC_TOKEN} != 2) {
+ print "This conversion script only works with version 2 bayes DBM files.\n";
+ exit;
+}
+
+my $sql = "INSERT INTO bayes_token (username, token, spam_count, ham_count, atime) values (?,?,?,?,?)";
+my $sth = $dbh->prepare($sql);
+
+my $tokens = 0;
+
+# Initalize a few variables in case we end up not finding them in the database.
+$last_atime_delta = 0;
+$last_expire = 0;
+$last_expire_reduce = 0;
+
+foreach my $key ( keys(%toks_db) ) {
+ next if ($key eq $DB_VERSION_MAGIC_TOKEN);
+ next if ($key eq $NHAM_MAGIC_TOKEN);
+ next if ($key eq $NSPAM_MAGIC_TOKEN);
+ next if ($key eq $RUNNING_EXPIRE_MAGIC_TOKEN);
+ next if ($key eq $NTOKENS_MAGIC_TOKEN);
+ next if ($key eq $LAST_JOURNAL_SYNC_MAGIC_TOKEN);
+ next if ($key eq $NEWEST_TOKEN_AGE_MAGIC_TOKEN);
+ next if ($key eq $OLDEST_TOKEN_AGE_MAGIC_TOKEN);
+
+ if ($key eq $LAST_ATIME_DELTA_MAGIC_TOKEN) {
+ $last_atime_delta = $toks_db{$LAST_ATIME_DELTA_MAGIC_TOKEN};
+ next;
+ }
+
+ if ($key eq $LAST_EXPIRE_MAGIC_TOKEN) {
+ $last_expire = $toks_db{$LAST_EXPIRE_MAGIC_TOKEN};
+ next;
+ }
+
+ if ($key eq $LAST_EXPIRE_REDUCE_MAGIC_TOKEN) {
+ $last_expire_reduce = $toks_db{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN};
+ next;
+ }
+
+ my ($spam, $ham, $atime) = &tok_unpack($toks_db{$key});
+ my $rc = $sth->execute($username, $key, $spam, $ham, $atime);
+ if ($rc) {
+ $tokens++;
+ }
+ else {
+ print "Error creating entry for: $key -- $spam -- $ham -- $atime\n";
+ }
+ $sth->finish();
+}
+
+tie %seen_db, "DB_File", $seen_name, O_RDONLY, 0600
+ or die "Cannot open file $toks_name: $!\n";
+
+$ham_count = 0;
+$spam_count = 0;
+
+$sql = "INSERT INTO bayes_seen (username, msgid, flag) VALUES (?,?,?)";
+$sth = $dbh->prepare($sql);
+
+foreach my $key (keys(%seen_db)) {
+ my $msgid = $key;
+ my $flag = $seen_db{$key};
+
+ next if ($flag ne 'h' && $flag ne 's');
+
+ my $rc = $sth->execute($username, $key, $flag);
+
+ if ($rc) {
+ if ($flag eq 'h') {
+ $ham_count++;
+ }
+ elsif ($flag eq 's') {
+ $spam_count++;
+ }
+ }
+ else {
+ print "Error creating entry for: $msgid -- $flag\n";
+ }
+ $sth->finish();
+}
+
+print "Token Count: $tokens\n";
+print "Ham Count: $ham_count\n";
+print "Spam Count: $spam_count\n";
+print "\nNOTE: It's possible that the above numbers may not match up exactly with\n";
+print "\n what an sa-learn --dump magic shows. Not sure why that is, but\n";
+print "\n as long as it's not a huge difference I wouldn't worry about it.\n";
+
+$sql = "INSERT INTO bayes_vars (username, spam_count, ham_count, last_expire, last_atime_delta, last_expire_reduce) VALUES (?,?,?,?,?,?)";
+
+$sth = $dbh->prepare($sql);
+
+my $rc = $sth->execute($username, $spam_count, $ham_count, $last_expire, $last_atime_delta, $last_expire_reduce);
+
+unless ($rc) {
+ print "Error updating bayes_vars: ".DBI->errstr()."\n";
+ exit;
+}
+
+print "Conversion done.\n";
+
+
+
+
+
+use constant FORMAT_FLAG => 0xc0; # 11000000
+use constant ONE_BYTE_FORMAT => 0xc0; # 11000000
+use constant TWO_LONGS_FORMAT => 0x00; # 00000000
+
+use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000
+use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111
+
+sub tok_unpack {
+ my ($value) = @_;
+ $value ||= 0;
+
+ my ($packed, $atime) = unpack("CV", $value);
+
+ if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
+ return (($packed & ONE_BYTE_SSS_BITS) >> 3,
+ $packed & ONE_BYTE_HHH_BITS,
+ $atime || 0);
+ }
+ elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
+ my ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
+ return ($ts || 0, $th || 0, $atime || 0);
+ }
+ # other formats would go here...
+ else {
+ warn "unknown packing format for Bayes db, please re-learn: $packed";
+ return (0, 0, 0);
+ }
+}
+