Mailing List Archive

svn commit: rev 6733 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin t
Author: felicity
Date: Wed Feb 18 15:26:01 2004
New Revision: 6733

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
incubator/spamassassin/trunk/t/bayesdbm.t
Log:
bug 3055: spammers are using the same message id to get around bayes being able to learn different messages. make the hash message-id the default now, but be backwards compatible with the seen db.

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm Wed Feb 18 15:26:01 2004
@@ -684,35 +684,49 @@
# this function is trapped by the wrapper above
sub learn_trapped {
my ($self, $isspam, $msg, $body, $msgid) = @_;
+ my @msgid = ( $msgid );

- $msgid ||= $self->get_msgid ($msg);
- my $seen = $self->{store}->seen_get ($msgid);
- if (defined ($seen)) {
- if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
- dbg ("$msgid: already learnt correctly, not learning twice");
- return 0;
- } elsif ($seen !~ /^[hs]$/) {
- warn ("db_seen corrupt: value='$seen' for $msgid. ignored");
- } else {
- dbg ("$msgid: already learnt as opposite, forgetting first");
-
- # kluge so that forget() won't untie the db on us ...
- my $orig = $self->{main}->{learn_caller_will_untie};
- $self->{main}->{learn_caller_will_untie} = 1;
-
- my $fatal = !defined $self->forget ($msg);
+ if (!defined $msgid) {
+ @msgid = $self->get_msgid($msg);
+ }

- # reset the value post-forget() ...
- $self->{main}->{learn_caller_will_untie} = $orig;
+ foreach $msgid ( @msgid ) {
+ my $seen = $self->{store}->seen_get ($msgid);

- # forget() gave us a fatal error, so propagate that up
- if ($fatal) {
- dbg("forget() returned a fatal error, so learn() will too");
- return;
+ if (defined ($seen)) {
+ if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
+ dbg ("$msgid: already learnt correctly, not learning twice");
+ return 0;
+ } elsif ($seen !~ /^[hs]$/) {
+ warn ("db_seen corrupt: value='$seen' for $msgid. ignored");
+ } else {
+ dbg ("$msgid: already learnt as opposite, forgetting first");
+
+ # kluge so that forget() won't untie the db on us ...
+ my $orig = $self->{main}->{learn_caller_will_untie};
+ $self->{main}->{learn_caller_will_untie} = 1;
+
+ my $fatal = !defined $self->forget ($msg);
+
+ # reset the value post-forget() ...
+ $self->{main}->{learn_caller_will_untie} = $orig;
+
+ # forget() gave us a fatal error, so propagate that up
+ if ($fatal) {
+ dbg("forget() returned a fatal error, so learn() will too");
+ return;
+ }
}
+
+ # we're only going to have seen this once, so stop if it's been
+ # seen already
+ last;
}
}

+ # Now that we're sure we haven't seen this message before ...
+ $msgid = $msgid[0];
+
if ($isspam) {
$self->{store}->nspam_nham_change (1, 0);
} else {
@@ -790,27 +804,44 @@
# this function is trapped by the wrapper above
sub forget_trapped {
my ($self, $msg, $body, $msgid) = @_;
-
- $msgid ||= $self->get_msgid ($msg);
- my $seen = $self->{store}->seen_get ($msgid);
+ my @msgid = ( $msgid );
my $isspam;
- if (defined ($seen)) {
- if ($seen eq 's') {
- $isspam = 1;
- } elsif ($seen eq 'h') {
- $isspam = 0;
- } else {
- dbg ("forget: message $msgid seen entry is neither ham nor spam, ignored");
- return 0;
+
+ if (!defined $msgid) {
+ @msgid = $self->get_msgid($msg);
+ }
+
+ while( $msgid = shift @msgid ) {
+ my $seen = $self->{store}->seen_get ($msgid);
+
+ if (defined ($seen)) {
+ if ($seen eq 's') {
+ $isspam = 1;
+ } elsif ($seen eq 'h') {
+ $isspam = 0;
+ } else {
+ dbg ("forget: msgid $msgid seen entry is neither ham nor spam, ignored");
+ return 0;
+ }
+
+ # messages should only be learned once, so stop if we find a msgid
+ # which was seen before
+ last;
+ }
+ else {
+ dbg ("forget: msgid $msgid not learnt, ignored");
}
- } else {
- dbg ("forget: message $msgid not learnt, ignored");
- return 0;
}

- if ($isspam) {
+ # This message wasn't learnt before, so return
+ if (!defined $isspam) {
+ dbg("forget: no msgid from this message has been learnt, skipping message");
+ return 0;
+ }
+ elsif ($isspam) {
$self->{store}->nspam_nham_change (-1, 0);
- } else {
+ }
+ else {
$self->{store}->nspam_nham_change (0, -1);
}

@@ -836,32 +867,36 @@
sub get_msgid {
my ($self, $msg) = @_;

+ my @msgid = ();
+
my $msgid = $msg->get_header("Message-Id");
- if (!defined $msgid || $msgid eq '' || $msgid =~ /^\s*<\s*>.*$/) { # generate a best effort unique id
- # Use sha1(Date:, last received: and top N bytes of body)
- # where N is MIN(1024 bytes, 1/2 of body length)
- #
- my $date = $msg->get_header("Date");
- $date = "None" if (!defined $date || $date eq ''); # No Date?
+ if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
+ # remove \r and < and > prefix/suffixes
+ chomp $msgid;
+ $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
+ push(@msgid, $msgid);
+ }

- my @rcvd = $msg->get_header("Received");
- my $rcvd = $rcvd[$#rcvd];
- $rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
+ # Use sha1(Date:, last received: and top N bytes of body)
+ # where N is MIN(1024 bytes, 1/2 of body length)
+ #
+ my $date = $msg->get_header("Date");
+ $date = "None" if (!defined $date || $date eq ''); # No Date?

- my $body = $msg->get_pristine_body();
- if (length($body) > 64) { # Small Body?
- my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
- substr($body, $keep) = '';
- }
+ my @rcvd = $msg->get_header("Received");
+ my $rcvd = $rcvd[$#rcvd];
+ $rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?

- $msgid = sha1($date."\000".$rcvd."\000".$body).'@sa_generated';
+ # Make a copy since pristine_body is a reference ...
+ my $body = join('', $msg->get_pristine_body());
+ if (length($body) > 64) { # Small Body?
+ my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
+ substr($body, $keep) = '';
}

- # remove \r and < and > prefix/suffixes
- chomp $msgid;
- $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
+ unshift(@msgid, sha1($date."\000".$rcvd."\000".$body).'@sa_generated');

- $msgid;
+ return wantarray ? @msgid : $msgid[0];
}

sub add_uris_for_permsgstatus {
@@ -886,7 +921,7 @@

if (!defined $body) {
# why?!
- warn "failed to get body for ".$self->get_msgid($self->{msg})."\n";
+ warn "failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
return [ ];
}


Modified: incubator/spamassassin/trunk/t/bayesdbm.t
==============================================================================
--- incubator/spamassassin/trunk/t/bayesdbm.t (original)
+++ incubator/spamassassin/trunk/t/bayesdbm.t Wed Feb 18 15:26:01 2004
@@ -16,7 +16,7 @@
unshift(@INC, '../blib/lib');
}

- plan tests => (HAS_DB_FILE ? 43 : 0);
+ plan tests => (HAS_DB_FILE ? 44 : 0);
};

exit unless HAS_DB_FILE;
@@ -68,9 +68,11 @@

ok(scalar(@toks) > 0);

-my $msgid = $sa->{bayes_scanner}->get_msgid($mail);
+my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);

-ok($msgid eq '9PS291LhupY');
+# $msgid is the generated hash messageid, $msgid_hdr is the Message-Id header ...
+ok($msgid eq '502e12b89b9c74074744ffc18a95d80cff2effcd@sa_generated');
+ok($msgid_hdr eq '9PS291LhupY');

ok($sa->{bayes_scanner}->{store}->tie_db_writable());