Mailing List Archive

svn commit: rev 6569 - incubator/spamassassin/trunk/tools
Author: jm
Date: Sat Feb 7 15:28:52 2004
New Revision: 6569

Modified:
incubator/spamassassin/trunk/tools/check_whitelist
Log:
added POD documentation and --clean switch to tools/check_whitelist, allowing AWL to be cleaned of single-hit old entries. fixes bug 1883: record number of emails received in AWL; bug 2103: AWL needs a way to be cleaned out

Modified: incubator/spamassassin/trunk/tools/check_whitelist
==============================================================================
--- incubator/spamassassin/trunk/tools/check_whitelist (original)
+++ incubator/spamassassin/trunk/tools/check_whitelist Sat Feb 7 15:28:52 2004
@@ -1,14 +1,32 @@
#!/usr/bin/perl
+#
+# TODO: should this be made a top-level script, called "sa-awl"?
+
+sub usage {
+ die "
+usage: check_whitelist [--clean] [--min n] [dbfile]
+";
+}

use strict;
use Fcntl;
+use Getopt::Long;
+
+use vars qw(
+ $opt_clean $opt_min $opt_help
+ );

-# must match line at top of lib/Mail/SpamAssassin/DBBasedAddrList.pm.
-# now off until 3.0
-# BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); }
+GetOptions(
+ 'clean' => \$opt_clean,
+ 'min:i' => \$opt_min,
+ 'help' => \$opt_help
+) or usage();
+$opt_help and usage();

+$opt_min ||= 2;
+
+BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); }
use AnyDBM_File ;
-use vars qw( %h $k $v ) ;

my $db;
if ($#ARGV == -1) {
@@ -17,17 +35,94 @@
$db = $ARGV[0];
}

-tie %h, "AnyDBM_File",$db, O_RDONLY,0600
+my %h;
+if ($opt_clean) {
+ tie %h, "AnyDBM_File",$db, O_RDWR,0600
+ or die "Cannot open r/w file $db: $!\n";
+} else {
+ tie %h, "AnyDBM_File",$db, O_RDONLY,0600
or die "Cannot open file $db: $!\n";
+}
+
my @k = grep(!/totscore$/,keys(%h));
for my $key (@k)
{
- my $t = $h{"$key|totscore"};
- my $v = $h{$key};
- if(defined($t)) {
- printf "% 8.1f %15s -- %s\n",
- $t/$v, (sprintf "(%.1f/%d)",$t/$v,$v),
+ my $totscore = $h{"$key|totscore"};
+ my $count = $h{$key};
+ next unless defined($totscore);
+
+ if ($opt_clean) {
+ if ($count >= $opt_min) { next; }
+ print "cleaning: ";
+ }
+
+ printf "% 8.1f %15s -- %s\n",
+ $totscore/$count, (sprintf "(%.1f/%d)",$totscore,$count),
$key;
+
+ if ($opt_clean) {
+ delete $h{"$key|totscore"};
+ delete $h{$key};
}
}
untie %h;
+
+=head1 NAME
+
+check_whitelist - examine and manipulate SpamAssassin's auto-whitelist db
+
+=head1 SYNOPSIS
+
+B<check_whitelist> [--clean] [--min n] [dbfile]
+
+=head1 DESCRIPTION
+
+Check or clean a SpamAssassin auto-whitelist (AWL) database file.
+
+The name of the file is specified after any options, as C<dbfile>.
+The default is C<$HOME/.spamassassin/auto-whitelist>.
+
+=head1 OPTIONS
+
+=over 4
+
+=item --clean
+
+Clean out infrequently-used AWL entries. The C<--min> switch can be
+used to select the threshold at which entries are kept or deleted.
+
+=item --min n
+
+Select the threshold at which entries are kept or deleted when C<--clean> is
+used. The default is C<2>, so entries that have only been seen once are
+deleted.
+
+=back
+
+=head1 OUTPUT
+
+The output looks like this:
+
+ AVG (TOTSCORE/COUNT) -- EMAIL|ip=IPBASE
+
+For example:
+
+ 0.0 (0.0/7) -- dawson@example.com|ip=208.192
+ 21.8 (43.7/2) -- mcdaniel_2s2000@example.com|ip=200.106
+
+C<AVG> is the average score; C<TOTSCORE> is the total score of all mails seen
+so far; C<COUNT> is the number of messages seen from that sender; C<EMAIL> is
+the sender's email address, and C<IPBASE> is the B<AWL base IP address>.
+
+B<AWL base IP address> is a way to identify the sender's IP address they
+frequently send from, in an approximate way, but remaining hard for spammers to
+spoof. The algorithm is as follows:
+
+ - take the last Received header that contains a public IP address -- namely
+ one which is not in private, unrouted IP space.
+
+ - chop off the last two octets, assuming that the user may be in an ISP's
+ dynamic address pool.
+
+=cut
+