Mailing List Archive

svn commit: rev 6629 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules
Author: quinlan
Date: Wed Feb 11 23:16:17 2004
New Revision: 6629

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
incubator/spamassassin/trunk/rules/20_body_tests.cf
incubator/spamassassin/trunk/rules/20_html_tests.cf
incubator/spamassassin/trunk/rules/20_ratware.cf
incubator/spamassassin/trunk/rules/70_testing.cf
Log:
promote T_MPART_ALT_DIFF_99 to MPART_ALT_DIFF (lowest FNs by far)
promote T_HTML_BADTAGS_* to HTML_BADTAG_*
promote T_HTML_BADTAGS_U_* to HTML_NONELEMENT_*
promote T_HTML_MESSAGE_1 to HTML_MESSAGE
add T_HTML_LINK_UNCLICKABLE (anchor with no image or text)
add HTML_OBFUSCATION_* descriptions
make MsgContainer post-rendering tests somewhat more readable
move RATWARE_HASH_DASH to 20_ratware.cf


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Wed Feb 11 23:16:17 2004
@@ -3357,14 +3357,6 @@
return exists $self->{html}{$test} && eval "qq{\Q$self->{html}{$test}\E} $expr";
}

-sub html_message {
- my ($self) = @_;
-
- return (exists $self->{html}{elements} &&
- ($self->{html}{elements} >= 8 ||
- $self->{html}{elements} >= $self->{html}{tags} / 2));
-}
-
sub html_title {
my ($self, undef, $expr) = @_;
for my $title (@{ $self->{html}{t_title} }) {

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Wed Feb 11 23:16:17 2004
@@ -175,6 +175,11 @@

$self->{html_last_tag} = $tag;
}
+ if ($num == -1) {
+ if ($tag eq "a") {
+ $self->{html}{anchor_unclickable}++ if $self->{html}{anchor_empty};
+ }
+ }

if ($tag =~ /^(?:b|i|u|strong|em|big|center|h\d)$/) {
$self->{html}{shouting} += $num;
@@ -641,7 +646,12 @@
{
$self->{html}{charsets} .= exists $self->{html}{charsets} ? " $1" : $1;
}
+ if ($tag eq "img") {
+ # might as well always clear this here
+ $self->{html}{anchor_empty} = 0;
+ }

+ $self->{html}{anchor_empty} = 1 if ($tag eq "a" && exists $attr->{href});
$self->{html}{anchor_text} ||= "" if ($tag eq "a");
}

@@ -659,6 +669,7 @@

if (exists $self->{html}{"inside_a"} && $self->{html}{"inside_a"} > 0) {
$self->{html}{anchor_text} .= " $text";
+ $self->{html}{anchor_empty} = 0;
}

if (exists $self->{html}{"inside_script"} && $self->{html}{"inside_script"} > 0)

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm Wed Feb 11 23:16:17 2004
@@ -283,8 +283,8 @@
my $text = $self->decode();
my $raw = length($text);

- # render text/html always, or any other text|text/plain part as text/html based
- # on a heuristic which simulates a certain common mail client
+ # render text/html always, or any other text|text/plain part as text/html
+ # based on a heuristic which simulates a certain common mail client
if ( $raw > 0 && (
$self->{'type'} =~ m@^text/html\b@i || (
$self->{'type'} =~ m@^text(?:$|/plain)@i &&
@@ -293,38 +293,36 @@
)
) {
$self->{'rendered_type'} = 'text/html';
- my $html = Mail::SpamAssassin::HTML->new(); # object
+ my $html = Mail::SpamAssassin::HTML->new(); # object
my @lines = @{$html->html_render($text)};
- $self->{rendered} = join('', @{$html->html_render($text)}); # rendered text
- $self->{html_results} = $html->get_results(); # needed in eval tests
+ $self->{rendered} = join('', @{$html->html_render($text)});
+ $self->{html_results} = $html->get_results(); # needed in eval tests

+ # some tests done after rendering
+ my $r = $self->{html_results}; # temporary reference for brevity
my $space = 0;
- $self->{html_results}{non_uri_len} = 0;
+ $r->{non_uri_len} = 0;
for my $line (@lines) {
$line = pack ('C0A*', $line);
$space += ($line =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
- $self->{html_results}{non_uri_len} += length($line);
+ $r->{non_uri_len} += length($line);
for my $uri ($line =~ m/\b(URI:\S+)/g) {
- $self->{html_results}{non_uri_len} -= length($uri);
+ $r->{non_uri_len} -= length($uri);
}
}
- $self->{html_results}{non_space_len} = $self->{html_results}{non_uri_len} - $space;
- $self->{html_results}{ratio} = ($raw - $self->{html_results}{non_uri_len}) / $raw;
- if (exists $self->{html_results}{total_comment_length} && $self->{html_results}{non_uri_len} > 0) {
- $self->{html_results}{total_comment_ratio} = $self->{html_results}{total_comment_length} / $self->{html_results}{non_uri_len};
+ $r->{non_space_len} = $r->{non_uri_len} - $space;
+ $r->{ratio} = ($raw - $r->{non_uri_len}) / $raw;
+ if (exists $r->{total_comment_length} && $r->{non_uri_len} > 0) {
+ $r->{total_comment_ratio} =
+ $r->{total_comment_length} / $r->{non_uri_len};
}
- if (exists $self->{html_results}{elements} &&
- exists $self->{html_results}{tags})
- {
- $self->{html_results}{t_bad_tag_ratio} = ($self->{html_results}{tags} - $self->{html_results}{elements}) / $self->{html_results}{tags};
- $self->{html_results}{t_bad_tag_count} = ($self->{html_results}{tags} - $self->{html_results}{elements});
- $self->{html_results}{t_bad_tag_unique_ratio} = ($self->{html_results}{tags_seen} - $self->{html_results}{elements_seen}) / $self->{html_results}{tags_seen};
- $self->{html_results}{t_bad_tag_unique_count} = ($self->{html_results}{tags_seen} - $self->{html_results}{elements_seen});
+ if (exists $r->{elements} && exists $r->{tags}) {
+ $r->{bad_tag_ratio} = ($r->{tags} - $r->{elements}) / $r->{tags};
+ $r->{non_element_ratio} =
+ ($r->{tags_seen} - $r->{elements_seen}) / $r->{tags_seen};
}
- if (exists $self->{html_results}{tags} &&
- exists $self->{html_results}{obfuscation})
- {
- $self->{html_results}{obfuscation_ratio} = $self->{html_results}{obfuscation} / $self->{html_results}{tags};
+ if (exists $r->{tags} && exists $r->{obfuscation}) {
+ $r->{obfuscation_ratio} = $r->{obfuscation} / $r->{tags};
}
}
else {

Modified: incubator/spamassassin/trunk/rules/20_body_tests.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/20_body_tests.cf (original)
+++ incubator/spamassassin/trunk/rules/20_body_tests.cf Wed Feb 11 23:16:17 2004
@@ -194,6 +194,11 @@
describe MIME_CHARSET_FARAWAY MIME character set indicates foreign language
tflags MIME_CHARSET_FARAWAY userconf

+# This rule uses a simple algorithm to determine if the text and html
+# parts of a multipart/alternative message are different.
+body MPART_ALT_DIFF eval:multipart_alternative_difference('99', '100')
+describe MPART_ALT_DIFF HTML and text parts are different
+
###########################################################################

body CHARSET_FARAWAY eval:check_for_faraway_charset()
@@ -207,20 +212,6 @@
body BODY_8BITS eval:check_for_body_8bits()
describe BODY_8BITS Body includes 8 consecutive 8-bit characters
tflags BODY_8BITS userconf
-
-# Send-Safe ratware (idea from Alan Curry)
-# random alphanumerics, separated into groups of 16 by dashes (the first
-# and last group may be shorter), with a lowercase "l" and a number
-# appended. The final number is the length of the whole string not
-# including the dashes or the "l<number>". Why? I have no idea. It's
-# not a tracking code - the spamware does not save it locally.
-#
-# jm: it's specifically to throw off MIME base64 encoding, to evade AOL's
-# filters.
-#
-# http://groups.google.com/groups?selm=atp1ip0n22%40enews3.newsguy.com
-rawbody RATWARE_HASH_DASH /[a-z\d]+-([a-z\d]{16}-)+[a-z\d]+(?-i:l)\d+/i
-describe RATWARE_HASH_DASH Contains a hashbuster in Send-Safe format

# duncf
body EMAIL_ROT13 /\b[a-z(\]-]+\^[a-z-]+\([a-z]{2,3}\b/

Modified: incubator/spamassassin/trunk/rules/20_html_tests.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/20_html_tests.cf (original)
+++ incubator/spamassassin/trunk/rules/20_html_tests.cf Wed Feb 11 23:16:17 2004
@@ -29,7 +29,7 @@
# please sort these by eval type then name

# HTML control test, HTML spam rules should all have better S/O than this
-body HTML_MESSAGE eval:html_message()
+body HTML_MESSAGE eval:html_test('ratio')
describe HTML_MESSAGE HTML included in message

# the HTML percentage range
@@ -223,6 +223,16 @@
body HTML_OBFUSCATION_70_80 eval:html_range('obfuscation_ratio','.7','.8')
body HTML_OBFUSCATION_80_90 eval:html_range('obfuscation_ratio','.8','.9')
body HTML_OBFUSCATION_90_100 eval:html_range('obfuscation_ratio','.9','1.0')
+describe HTML_OBFUSCATION_00_10 Message is 0% to 10% HTML obfuscation
+describe HTML_OBFUSCATION_10_20 Message is 10% to 20% HTML obfuscation
+describe HTML_OBFUSCATION_20_30 Message is 20% to 30% HTML obfuscation
+describe HTML_OBFUSCATION_30_40 Message is 30% to 40% HTML obfuscation
+describe HTML_OBFUSCATION_40_50 Message is 40% to 50% HTML obfuscation
+describe HTML_OBFUSCATION_50_60 Message is 50% to 60% HTML obfuscation
+describe HTML_OBFUSCATION_60_70 Message is 60% to 70% HTML obfuscation
+describe HTML_OBFUSCATION_70_80 Message is 70% to 80% HTML obfuscation
+describe HTML_OBFUSCATION_80_90 Message is 80% to 90% HTML obfuscation
+describe HTML_OBFUSCATION_90_100 Message is 90% to 100% HTML obfuscation

# many spammers seem to do this nowadays (and probably track
# their customers with it). (contrib: WW)
@@ -270,6 +280,50 @@

body HTML_TAG_EXISTS_TBODY eval:html_tag_exists('tbody')
describe HTML_TAG_EXISTS_TBODY HTML has "tbody" tag
+
+# percentage of tags that are not legal elements in HTML
+body HTML_BADTAG_00_10 eval:html_range('bad_tag_ratio','0.00','0.10')
+body HTML_BADTAG_10_20 eval:html_range('bad_tag_ratio','0.10','0.20')
+body HTML_BADTAG_20_30 eval:html_range('bad_tag_ratio','0.20','0.30')
+body HTML_BADTAG_30_40 eval:html_range('bad_tag_ratio','0.30','0.40')
+body HTML_BADTAG_40_50 eval:html_range('bad_tag_ratio','0.40','0.50')
+body HTML_BADTAG_50_60 eval:html_range('bad_tag_ratio','0.50','0.60')
+body HTML_BADTAG_60_70 eval:html_range('bad_tag_ratio','0.60','0.70')
+body HTML_BADTAG_70_80 eval:html_range('bad_tag_ratio','0.70','0.80')
+body HTML_BADTAG_80_90 eval:html_range('bad_tag_ratio','0.80','0.90')
+body HTML_BADTAG_90_100 eval:html_range('bad_tag_ratio','0.90','1.00')
+describe HTML_BADTAG_00_10 HTML message is 0% to 10% bad tags
+describe HTML_BADTAG_10_20 HTML message is 10% to 20% bad tags
+describe HTML_BADTAG_20_30 HTML message is 20% to 30% bad tags
+describe HTML_BADTAG_30_40 HTML message is 30% to 40% bad tags
+describe HTML_BADTAG_40_50 HTML message is 40% to 50% bad tags
+describe HTML_BADTAG_50_60 HTML message is 50% to 60% bad tags
+describe HTML_BADTAG_60_70 HTML message is 60% to 70% bad tags
+describe HTML_BADTAG_70_80 HTML message is 70% to 80% bad tags
+describe HTML_BADTAG_80_90 HTML message is 80% to 90% bad tags
+describe HTML_BADTAG_90_100 HTML message is 90% to 100% bad tags
+
+# percentage of unique non-elements in HTML
+body HTML_NONELEMENT_00_10 eval:html_range('non_element_ratio','0.00','0.10')
+body HTML_NONELEMENT_10_20 eval:html_range('non_element_ratio','0.10','0.20')
+body HTML_NONELEMENT_20_30 eval:html_range('non_element_ratio','0.20','0.30')
+body HTML_NONELEMENT_30_40 eval:html_range('non_element_ratio','0.30','0.40')
+body HTML_NONELEMENT_40_50 eval:html_range('non_element_ratio','0.40','0.50')
+body HTML_NONELEMENT_50_60 eval:html_range('non_element_ratio','0.50','0.60')
+body HTML_NONELEMENT_60_70 eval:html_range('non_element_ratio','0.60','0.70')
+body HTML_NONELEMENT_70_80 eval:html_range('non_element_ratio','0.70','0.80')
+body HTML_NONELEMENT_80_90 eval:html_range('non_element_ratio','0.80','0.90')
+body HTML_NONELEMENT_90_100 eval:html_range('non_element_ratio','0.90','1.00')
+describe HTML_NONELEMENT_00_10 0% to 10% of HTML elements are non-standard
+describe HTML_NONELEMENT_10_20 10% to 20% of HTML elements are non-standard
+describe HTML_NONELEMENT_20_30 20% to 30% of HTML elements are non-standard
+describe HTML_NONELEMENT_30_40 30% to 40% of HTML elements are non-standard
+describe HTML_NONELEMENT_40_50 40% to 50% of HTML elements are non-standard
+describe HTML_NONELEMENT_50_60 50% to 60% of HTML elements are non-standard
+describe HTML_NONELEMENT_60_70 60% to 70% of HTML elements are non-standard
+describe HTML_NONELEMENT_70_80 70% to 80% of HTML elements are non-standard
+describe HTML_NONELEMENT_80_90 80% to 90% of HTML elements are non-standard
+describe HTML_NONELEMENT_90_100 90% to 100% of HTML elements are non-standard

body HTML_TITLE_EMPTY eval:html_eval('title_text', '!~ /\S/s')
describe HTML_TITLE_EMPTY HTML title contains no text

Modified: incubator/spamassassin/trunk/rules/20_ratware.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/20_ratware.cf (original)
+++ incubator/spamassassin/trunk/rules/20_ratware.cf Wed Feb 11 23:16:17 2004
@@ -233,3 +233,16 @@
meta RATWARE_EXISCAN_FORGED (__RATWARE_EXISCAN && __RATWARE_ANTIABUSE && __HAS_MSMAIL_PRI)
describe RATWARE_EXISCAN_FORGED Headers indicate forged Exiscan message

+# Send-Safe ratware (idea from Alan Curry)
+# random alphanumerics, separated into groups of 16 by dashes (the first
+# and last group may be shorter), with a lowercase "l" and a number
+# appended. The final number is the length of the whole string not
+# including the dashes or the "l<number>". Why? I have no idea. It's
+# not a tracking code - the spamware does not save it locally.
+#
+# jm: it's specifically to throw off MIME base64 encoding, to evade AOL's
+# filters.
+#
+# http://groups.google.com/groups?selm=atp1ip0n22%40enews3.newsguy.com
+rawbody RATWARE_HASH_DASH /[a-z\d]+-([a-z\d]{16}-)+[a-z\d]+(?-i:l)\d+/i
+describe RATWARE_HASH_DASH Contains a hashbuster in Send-Safe format

Modified: incubator/spamassassin/trunk/rules/70_testing.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_testing.cf (original)
+++ incubator/spamassassin/trunk/rules/70_testing.cf Wed Feb 11 23:16:17 2004
@@ -318,34 +318,6 @@
body T_HTML_FONT_TINY eval:html_test('t_tiny_font')
describe T_HTML_FONT_TINY HTML has a tiny font

-# HTML bad tag percentage
-body T_HTML_BADTAGS_00_10 eval:html_range('t_bad_tag_ratio','0.00','0.10')
-body T_HTML_BADTAGS_10_20 eval:html_range('t_bad_tag_ratio','0.10','0.20')
-body T_HTML_BADTAGS_20_30 eval:html_range('t_bad_tag_ratio','0.20','0.30')
-body T_HTML_BADTAGS_30_40 eval:html_range('t_bad_tag_ratio','0.30','0.40')
-body T_HTML_BADTAGS_40_50 eval:html_range('t_bad_tag_ratio','0.40','0.50')
-body T_HTML_BADTAGS_50_60 eval:html_range('t_bad_tag_ratio','0.50','0.60')
-body T_HTML_BADTAGS_60_70 eval:html_range('t_bad_tag_ratio','0.60','0.70')
-body T_HTML_BADTAGS_70_80 eval:html_range('t_bad_tag_ratio','0.70','0.80')
-body T_HTML_BADTAGS_80_90 eval:html_range('t_bad_tag_ratio','0.80','0.90')
-body T_HTML_BADTAGS_90_100 eval:html_range('t_bad_tag_ratio','0.90','1.00')
-
-# HTML bad tag percentage for unique tags
-body T_HTML_BADTAGS_U_00_10 eval:html_range('t_bad_tag_unique_ratio','0.00','0.10')
-body T_HTML_BADTAGS_U_10_20 eval:html_range('t_bad_tag_unique_ratio','0.10','0.20')
-body T_HTML_BADTAGS_U_20_30 eval:html_range('t_bad_tag_unique_ratio','0.20','0.30')
-body T_HTML_BADTAGS_U_30_40 eval:html_range('t_bad_tag_unique_ratio','0.30','0.40')
-body T_HTML_BADTAGS_U_40_50 eval:html_range('t_bad_tag_unique_ratio','0.40','0.50')
-body T_HTML_BADTAGS_U_50_60 eval:html_range('t_bad_tag_unique_ratio','0.50','0.60')
-body T_HTML_BADTAGS_U_60_70 eval:html_range('t_bad_tag_unique_ratio','0.60','0.70')
-body T_HTML_BADTAGS_U_70_80 eval:html_range('t_bad_tag_unique_ratio','0.70','0.80')
-body T_HTML_BADTAGS_U_80_90 eval:html_range('t_bad_tag_unique_ratio','0.80','0.90')
-body T_HTML_BADTAGS_U_90_100 eval:html_range('t_bad_tag_unique_ratio','0.90','1.00')
-
-# possible HTML_MESSAGE replacements
-body T_HTML_MESSAGE_1 eval:html_test('ratio')
-body T_HTML_MESSAGE_2 eval:html_test('tags')
-
# more portable replacement for RCVD_NUMERIC_HELO that doesn't rely on
# Received headers using "helo=" prefix
header T_RCVD_NUMERIC_HELO X-Spam-Relays-Untrusted =~ / helo=\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} /
@@ -369,27 +341,6 @@
uri T_URI_UNPRINTABLE /%0/
describe T_URI_UNPRINTABLE URI contains unprintable characters

-# These rules use a simple algorithm to determine if the text and html
-# parts of an multipart/alternative message are different...
-# Even with the simple algorithm, it's amazing! - 2004.01.14, tvd
-#
-#OVERALL% SPAM% HAM% S/O RANK SCORE NAME
-# 140567 59829 80738 0.426 0.00 0.00 (all messages)
-#100.000 42.5626 57.4374 0.426 0.00 0.00 (all messages as %)
-# 29.308 68.8529 0.0050 1.000 1.00 0.01 T_MPART_ALT_DIFF_99
-# 29.353 68.9549 0.0074 1.000 1.00 0.01 T_MPART_ALT_DIFF_98
-# 29.429 69.1220 0.0149 1.000 1.00 0.01 T_MPART_ALT_DIFF_97
-# 29.470 69.2123 0.0198 1.000 1.00 0.01 T_MPART_ALT_DIFF_96
-# 29.483 69.2323 0.0285 1.000 1.00 0.01 T_MPART_ALT_DIFF_95
-# 29.771 69.8892 0.0421 0.999 1.00 0.01 T_MPART_ALT_DIFF_90
-#
-body T_MPART_ALT_DIFF_90 eval:multipart_alternative_difference('90', '100')
-body T_MPART_ALT_DIFF_95 eval:multipart_alternative_difference('95', '100')
-body T_MPART_ALT_DIFF_96 eval:multipart_alternative_difference('96', '100')
-body T_MPART_ALT_DIFF_97 eval:multipart_alternative_difference('97', '100')
-body T_MPART_ALT_DIFF_98 eval:multipart_alternative_difference('98', '100')
-body T_MPART_ALT_DIFF_99 eval:multipart_alternative_difference('99', '100')
-
# 0 nonspam hits, hundreds of spam hits. Serious problems there
uri T_TERRA_ES /terra\.es\//i
describe T_TERRA_ES Contains images or links to pages hosted at 'terra.es'
@@ -749,3 +700,6 @@
body T_OPEN_UNDEF_BODY eval:html_order('open', 'undef', 'body')

########################################################################
+
+body T_HTML_LINK_UNCLICKABLE eval:html_test('anchor_unclickable')
+describe T_HTML_LINK_UNCLICKABLE HTML link is not clickable