Mailing List Archive: svn commit: rev 9840 - incubator/spamassassin/trunk/lib/Mail/SpamAssassin

Author: felicity
Date: Thu Apr 1 15:47:48 2004
New Revision: 9840

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
Log:
Reworked the mpart_alt_diff code a little, added more comments, and added code to handle HTML parts that contain no words and only images.

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Thu Apr 1 15:47:48 2004
@@ -3286,34 +3286,54 @@

sub _multipart_alternative_difference {
my($self) = @_;
+ $self->{madiff} = 0;

+ # Find all multipart/alternative parts in the message
my @ma = $self->{msg}->find_parts(qr@^multipart/alternative\b@i);
- my @content = $self->{msg}->content_summary();

- $self->{madiff} = 0;
+ # If there are no multipart/alternative sections, skip this test.
+ return if (!@ma);

- # Exchange meeting requests come in as m/a text/html text/calendar ...
- # Ignore any messages without a multipart/alternative section as well ...
- if ( !@ma || (@content == 3 && $content[2] eq 'text/calendar' &&
- $content[1] eq 'text/html' &&
- $content[0] eq 'multipart/alternative') ) {
+ # Figure out what the MIME content of the message looks like
+ my @content = $self->{msg}->content_summary();
+
+ # Exchange meeting requests come in as m/a text/html text/calendar,
+ # which we want to ignore because of the high FP rate it would cause.
+ #
+ if (@content == 3 && $content[2] eq 'text/calendar' &&
+ $content[1] eq 'text/html' &&
+ $content[0] eq 'multipart/alternative') {
return;
}

- # Only deal with text/plain and text/html ...
+ # Go through each of the multipart parts
foreach my $part ( @ma ) {
my %html = ();
my %text = ();

+ # limit our search to text-based parts
my @txt = $part->find_parts(qr@^text\b@i);
foreach my $text ( @txt ) {
+ # we only care about the rendered version of the part
my($type, $rnd) = $text->rendered();

+ # parse the rendered text into tokens. assume they are whitespace
+ # separated, and ignore anything that doesn't have a word-character
+ # in it (0-9a-zA-Z_) since those are probably things like bullet
+ # points, horizontal lines, etc. this assumes that punctuation
+ # in one part will be the same in other parts.
+ #
if ( $type eq 'text/html' ) {
foreach my $w ( grep(/\w/,split(/\s+/,$rnd)) ) {
#dbg("HTML: $w");
$html{$w}++;
}
+
+ # If there are no words, mark if there's at least 1 image ...
+ if (keys %html == 0 && exists $self->{html}{"inside_img"}) {
+ # Use "\n" as the mark since it can't ever occur normally
+ $html{"\n"}=1;
+ }
}
else {
foreach my $w ( grep(/\w/,split(/\s+/,$rnd)) ) {
@@ -3323,15 +3343,22 @@
}
}

+ # How many HTML tokens do we have at the start?
my $orig = keys %html;
next if ( $orig == 0 );

+ # If the token appears at least as many times in the text part as
+ # in the html part, remove it from the list of html tokens.
while( my($k,$v) = each %text ) {
delete $html{$k} if ( exists $html{$k} && $html{$k}-$text{$k} < 1 );
}

#map { dbg("LEFT: $_") } keys %html;

+ # In theory, the tokens should be the same in both text and html
+ # parts, so there would be 0 tokens left in the html token list, for
+ # a 0% difference rate. Calculate it here, and record the difference
+ # if it's been the highest so far in this message.
my $diff = scalar(keys %html)/$orig*100;
$self->{madiff} = $diff if ( $diff > $self->{madiff} );