Mailing List Archive

svn commit: rev 6756 - in incubator/spamassassin/trunk: . lib/Mail/SpamAssassin
Author: jm
Date: Wed Feb 18 18:55:42 2004
New Revision: 6756

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
incubator/spamassassin/trunk/sa-learn.raw
Log:
bug 2273: bayes_ignore_from, bayes_ignore_to added, thanks to David Koppelman

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm Wed Feb 18 18:55:42 2004
@@ -229,6 +229,7 @@
'main' => $main,
'conf' => $main->{conf},
'log_raw_counts' => 0,
+ 'use_ignores' => 1,
'tz' => Mail::SpamAssassin::Util::local_tz(),
};
bless ($self, $class);
@@ -645,11 +646,37 @@

###########################################################################

+sub ignore_message {
+ my ($Bayes,$PMS) = @_;
+
+ return 0 unless $Bayes->{use_ignores};
+
+ my $ignore = $PMS->check_from_in_list('bayes_ignore_from')
+ || $PMS->check_to_in_list('bayes_ignore_to');
+
+ dbg("Not using Bayes, bayes_ignore_from or _to rule") if $ignore;
+
+ return $ignore;
+}
+
+###########################################################################
+
sub learn {
my ($self, $isspam, $msg, $id) = @_;

if (!$self->{conf}->{use_bayes}) { return; }
if (!defined $msg) { return; }
+
+ if( $self->{use_ignores} ) # Remove test when PerMsgStatus available.
+ {
+ # DMK, koppel@ece.lsu.edu: Hoping that the ultimate fix to bug 2263 will
+ # make it unnecessary to construct a PerMsgStatus here.
+ my $PMS = new Mail::SpamAssassin::PerMsgStatus $self->{main}, $msg;
+ my $ignore = $self->ignore_message($PMS);
+ $PMS->finish();
+ return if $ignore;
+ }
+
my $body = $self->get_body_from_msg ($msg);
my $ret;

@@ -1028,9 +1055,11 @@
sub scan {
my ($self, $permsgstatus, $msg, $body) = @_;

- if ( !$self->is_scan_available() ) {
+ if( $self->ignore_message($permsgstatus) ) {
goto skip;
}
+
+ goto skip unless $self->is_scan_available();

my ($ns, $nn) = $self->{store}->nspam_nham_get();


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm Wed Feb 18 18:55:42 2004
@@ -39,6 +39,7 @@
my ($opts) = shift;

%opt = ( 'force-expire' => 0,
+ 'use-ignores' => 0,
'norebuild' => 0,
);

@@ -60,6 +61,7 @@
'no-rebuild|norebuild' => \$opt{'norebuild'},
'local|L' => \$opt{'local'},
'force-expire' => \$opt{'force-expire'},
+ 'use-ignores' => \$opt{'use-ignores'},

'stopafter=i' => \$opt{'stopafter'},
'learnprob=f' => \$opt{'learnprob'},
@@ -179,6 +181,8 @@
wait_for_lock => 1,
caller_will_untie => 1
});
+
+ $spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};

if ($rebuildonly) {
$spamtest->rebuild_learner_caches({

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Wed Feb 18 18:55:42 2004
@@ -239,6 +239,8 @@
$self->{bayes_min_ham_num} = 200;
$self->{bayes_min_spam_num} = 200;
$self->{bayes_learn_during_report} = 1;
+ $self->{bayes_ignore_from} = { };
+ $self->{bayes_ignore_to} = { };

# Allow alternate bayes storage implementation
$self->{bayes_store_module} = '';
@@ -1619,6 +1621,46 @@
$self->{check_mx_delay} = $value+0; next;
}

+=item bayes_ignore_from add@ress.com
+
+Bayesian classification and autolearning will not be performed on mail
+from the listed addresses. Program C<sa-learn> will also ignore the
+listed addresses if it is invoked using the C<--use-ignores> option.
+One or more addresses can be listed, see C<whitelist_from>.
+
+Spam messages from certain senders may contain many words that
+frequently occur in ham. For example, one might read messages from a
+preferred bookstore but also get unwanted spam messages from other
+bookstores. If the unwanted messages are learned as spam then any
+messages discussing books, including those from the preferred bookstore
+and antiquarian messages, would be in danger of being marked as spam. The
+addresses of the annoying bookstores would be listed. (Assuming they
+were halfway legitimate and didn't send you mail through myriad
+affiliates.)
+
+Conversely, those who receive legitimate messages that quote pieces of
+spam, or that otherwise contain potentially spammy words, might fear
+that learning them as ham would put some spam messages in danger of
+being marked as ham. The addresses of the mailing lists,
+correspondents, etc. that send such messages would be listed.
+
+=cut
+
+
+ if (/^bayes_ignore_from\s+(.+)$/) {
+ $self->add_to_addrlist ('bayes_ignore_from', split (' ', $1)); next;
+ }
+
+=item bayes_ignore_to add@ress.com
+
+Bayesian classification and autolearning will not be performed on mail
+to the listed addresses. See C<bayes_ignore_from> for details.
+
+=cut
+
+ if (/^bayes_ignore_to\s+(.+)$/) {
+ $self->add_to_addrlist ('bayes_ignore_to', split (' ', $1)); next;
+ }

=item dns_available { yes | test[: name1 name2...] | no } (default: test)


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Wed Feb 18 18:55:42 2004
@@ -913,6 +913,35 @@

###########################################################################

+sub check_from_in_list {
+ my ($self,$list) = @_;
+ my $list_ref = $self->{conf}{$list};
+ warn "Could not find list $list" unless defined $list_ref;
+
+ foreach my $addr ( all_from_addrs $self ) {
+ return 1 if _check_whitelist $self $list_ref, $addr;
+ }
+
+ return 0;
+}
+
+###########################################################################
+
+sub check_to_in_list {
+ my ($self,$list) = @_;
+ my $list_ref = $self->{conf}{$list};
+ warn "Could not find list $list" unless defined $list_ref;
+
+ foreach my $addr ( all_to_addrs $self ) {
+ return 1 if _check_whitelist $self $list_ref, $addr;
+ }
+
+ return 0;
+}
+
+
+###########################################################################
+
sub check_from_in_whitelist {
my ($self) = @_;
local ($_);

Modified: incubator/spamassassin/trunk/sa-learn.raw
==============================================================================
--- incubator/spamassassin/trunk/sa-learn.raw (original)
+++ incubator/spamassassin/trunk/sa-learn.raw Wed Feb 18 18:55:42 2004
@@ -76,6 +76,7 @@
--ham Learn messages as ham (non-spam)
--spam Learn messages as spam
--forget Forget a message
+ --use-ignores Use bayes_ignore_from and bayes_ignore_to
--rebuild Rebuild the database if needed
--force-expire Force an expiry run, rebuild every time
--dbpath <path> Allows commandline override (in bayes_path form)
@@ -334,6 +335,14 @@
them this time around. If the messages have already been filtered through
SpamAssassin, the learner will ignore any modifications SpamAssassin may have
made.
+
+=item B<--use-ignores>
+
+Don't learn the message if a from address matches configuration file
+item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>.
+The option might be used when learning from a large file of messages
+from which the hammy spam messages or spammy ham messages have not
+been removed.

=item B<--rebuild>