Mailing List Archive

svn commit: rev 9849 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules
Author: quinlan
Date: Fri Apr 2 17:13:59 2004
New Revision: 9849

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
incubator/spamassassin/trunk/rules/20_body_tests.cf
incubator/spamassassin/trunk/rules/70_testing.cf
Log:
revisions of the new HTML rules
remove the T_LONGWORDS tests


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Fri Apr 2 17:13:59 2004
@@ -3192,16 +3192,6 @@
$self->{html}{length} <= $max);
}

-sub html_image_only3 {
- my ($self, undef, $min, $max) = @_;
-
- return (exists $self->{html}{"inside_img"} &&
- exists $self->{html}{length} &&
- $self->{html}{length} > $min &&
- $self->{html}{length} <= $max &&
- $self->get('X-eGroups-Return') !~ /^sentto-.*\@returns\.groups\.yahoo\.com$/);
-}
-
sub html_charset_faraway {
my ($self) = @_;


Modified: incubator/spamassassin/trunk/rules/20_body_tests.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/20_body_tests.cf (original)
+++ incubator/spamassassin/trunk/rules/20_body_tests.cf Fri Apr 2 17:13:59 2004
@@ -201,10 +201,12 @@
body DOMAIN_RATIO eval:check_domain_ratio('0.026')
describe DOMAIN_RATIO Message body mentions many internet domains

+# If these are too expensive as a whole, then delete __LONGWORDS_B and
+# __LONGWORDS_C and replace with (__LONGWORDS_D || __LONGWORDS_A) which
+# is very close in quality.
body __LONGWORDS_A /\b(?:[a-z]{8,}\s+){6}/
body __LONGWORDS_B /\b(?:[a-z]{7,}\s+){8}/
body __LONGWORDS_C /\b(?:[a-z]{6,}\s+){9}/
body __LONGWORDS_D /\b(?:[a-z]{5,}\s+){10}/
-
meta LONGWORDS (__LONGWORDS_A || __LONGWORDS_B || __LONGWORDS_C || __LONGWORDS_D)
describe LONGWORDS Long string of long words

Modified: incubator/spamassassin/trunk/rules/70_testing.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_testing.cf (original)
+++ incubator/spamassassin/trunk/rules/70_testing.cf Fri Apr 2 17:13:59 2004
@@ -492,14 +492,6 @@
header __MAILMAN_21 X-Mailman-Version =~ /\d/
meta T_FORGED_MUA_THEBAT_BOUN (__THEBAT_MUA && !__THEBAT_MUA_V2 && __CTYPE_HAS_BOUNDARY && !__BAT_BOUNDARY && !__MAILMAN_21)

-# some test replacements for meta rule
-meta T_LONGWORDS_1 (__LONGWORDS_D || (__LONGWORDS_A + __LONGWORDS_B + __LONGWORDS_C > 1))
-meta T_LONGWORDS_2 (__LONGWORDS_D || (__LONGWORDS_A && __LONGWORDS_B && __LONGWORDS_C))
-meta T_LONGWORDS_3 (__LONGWORDS_A + __LONGWORDS_B + __LONGWORDS_C + __LONGWORDS_D > 1)
-meta T_LONGWORDS_4 (__LONGWORDS_D || __LONGWORDS_A)
-meta T_LONGWORDS_5 (__LONGWORDS_D || __LONGWORDS_B)
-meta T_LONGWORDS_6 (__LONGWORDS_D || __LONGWORDS_C)
-
# bug 3209
# tvd: 6.754 6.8409 0.0000 1.000 1.00 0.01 T_MISSING_SUBJECT
header __HAS_SUBJECT exists:Subject
@@ -513,6 +505,7 @@
body T_HTML_LENGTH_256 eval:html_eval('length', '< 256')
body T_HTML_LENGTH_384 eval:html_eval('length', '< 384')
body T_HTML_LENGTH_512 eval:html_eval('length', '< 512')
+body T_HTML_LENGTH_768 eval:html_eval('length', '< 768')
body T_HTML_LENGTH_1024 eval:html_eval('length', '< 1024')
# probe a bit more in the sweet spot between 128 and 256
body T_HTML_LENGTH_130 eval:html_eval('length', '< 130')
@@ -532,24 +525,19 @@

# rules combining short length with some element that is not
# typical of short raw HTML
-
-body __HTML_COMMENT1 eval:html_text('comment', '=~ /<!.*?>/')
-meta T_HTML_SHORT_COMMENT1_32 T_HTML_LENGTH_32 && __HTML_COMMENT1
-meta T_HTML_SHORT_COMMENT1_64 T_HTML_LENGTH_64 && __HTML_COMMENT1
-meta T_HTML_SHORT_COMMENT1_128 T_HTML_LENGTH_128 && __HTML_COMMENT1
-meta T_HTML_SHORT_COMMENT1_256 T_HTML_LENGTH_256 && __HTML_COMMENT1
-meta T_HTML_SHORT_COMMENT1_384 T_HTML_LENGTH_384 && __HTML_COMMENT1
-meta T_HTML_SHORT_COMMENT1_512 T_HTML_LENGTH_512 && __HTML_COMMENT1
-meta T_HTML_SHORT_COMMENT1_1024 T_HTML_LENGTH_1024 && __HTML_COMMENT1
-
-body __HTML_COMMENT2 eval:html_text('comment', '=~ /<!(?!doctype).*?>/i')
-meta T_HTML_SHORT_COMMENT2_32 T_HTML_LENGTH_32 && __HTML_COMMENT2
-meta T_HTML_SHORT_COMMENT2_64 T_HTML_LENGTH_64 && __HTML_COMMENT2
-meta T_HTML_SHORT_COMMENT2_128 T_HTML_LENGTH_128 && __HTML_COMMENT2
-meta T_HTML_SHORT_COMMENT2_256 T_HTML_LENGTH_256 && __HTML_COMMENT2
-meta T_HTML_SHORT_COMMENT2_384 T_HTML_LENGTH_384 && __HTML_COMMENT2
-meta T_HTML_SHORT_COMMENT2_512 T_HTML_LENGTH_512 && __HTML_COMMENT2
-meta T_HTML_SHORT_COMMENT2_1024 T_HTML_LENGTH_1024 && __HTML_COMMENT2
+# COMMENT looks best so far
+# IMG is good, FPs due to web bugs like http://www.msgtag.com/
+# CENTER is good too, FPs seem more random (inline attachments)
+
+body __COMMENT_EXISTS eval:html_text('comment', '=~ /<!.*?>/')
+meta T_HTML_SHORT_COMMENT_32 T_HTML_LENGTH_32 && __COMMENT_EXISTS
+meta T_HTML_SHORT_COMMENT_64 T_HTML_LENGTH_64 && __COMMENT_EXISTS
+meta T_HTML_SHORT_COMMENT_128 T_HTML_LENGTH_128 && __COMMENT_EXISTS
+meta T_HTML_SHORT_COMMENT_256 T_HTML_LENGTH_256 && __COMMENT_EXISTS
+meta T_HTML_SHORT_COMMENT_384 T_HTML_LENGTH_384 && __COMMENT_EXISTS
+meta T_HTML_SHORT_COMMENT_512 T_HTML_LENGTH_512 && __COMMENT_EXISTS
+meta T_HTML_SHORT_COMMENT_768 T_HTML_LENGTH_768 && __COMMENT_EXISTS
+meta T_HTML_SHORT_COMMENT_1024 T_HTML_LENGTH_1024 && __COMMENT_EXISTS

body __TAG_EXISTS_IMG eval:html_tag_exists('img')
meta T_HTML_SHORT_IMG_32 T_HTML_LENGTH_32 && __TAG_EXISTS_IMG
@@ -558,6 +546,7 @@
meta T_HTML_SHORT_IMG_256 T_HTML_LENGTH_256 && __TAG_EXISTS_IMG
meta T_HTML_SHORT_IMG_384 T_HTML_LENGTH_384 && __TAG_EXISTS_IMG
meta T_HTML_SHORT_IMG_512 T_HTML_LENGTH_512 && __TAG_EXISTS_IMG
+meta T_HTML_SHORT_IMG_768 T_HTML_LENGTH_768 && __TAG_EXISTS_IMG
meta T_HTML_SHORT_IMG_1024 T_HTML_LENGTH_1024 && __TAG_EXISTS_IMG

body __TAG_EXISTS_CENTER eval:html_tag_exists('center')
@@ -567,65 +556,20 @@
meta T_HTML_SHORT_CENTER_256 T_HTML_LENGTH_256 && __TAG_EXISTS_CENTER
meta T_HTML_SHORT_CENTER_384 T_HTML_LENGTH_384 && __TAG_EXISTS_CENTER
meta T_HTML_SHORT_CENTER_512 T_HTML_LENGTH_512 && __TAG_EXISTS_CENTER
+meta T_HTML_SHORT_CENTER_768 T_HTML_LENGTH_768 && __TAG_EXISTS_CENTER
meta T_HTML_SHORT_CENTER_1024 T_HTML_LENGTH_1024 && __TAG_EXISTS_CENTER

-meta T_HTML_SHORT_BODY_32 T_HTML_LENGTH_32 && __TAG_EXISTS_BODY
-meta T_HTML_SHORT_BODY_64 T_HTML_LENGTH_64 && __TAG_EXISTS_BODY
-meta T_HTML_SHORT_BODY_128 T_HTML_LENGTH_128 && __TAG_EXISTS_BODY
-meta T_HTML_SHORT_BODY_256 T_HTML_LENGTH_256 && __TAG_EXISTS_BODY
-meta T_HTML_SHORT_BODY_384 T_HTML_LENGTH_384 && __TAG_EXISTS_BODY
-meta T_HTML_SHORT_BODY_512 T_HTML_LENGTH_512 && __TAG_EXISTS_BODY
-meta T_HTML_SHORT_BODY_1024 T_HTML_LENGTH_1024 && __TAG_EXISTS_BODY
-
-meta T_HTML_SHORT_HTML_32 T_HTML_LENGTH_32 && __TAG_EXISTS_HTML
-meta T_HTML_SHORT_HTML_64 T_HTML_LENGTH_64 && __TAG_EXISTS_HTML
-meta T_HTML_SHORT_HTML_128 T_HTML_LENGTH_128 && __TAG_EXISTS_HTML
-meta T_HTML_SHORT_HTML_256 T_HTML_LENGTH_256 && __TAG_EXISTS_HTML
-meta T_HTML_SHORT_HTML_384 T_HTML_LENGTH_384 && __TAG_EXISTS_HTML
-meta T_HTML_SHORT_HTML_512 T_HTML_LENGTH_512 && __TAG_EXISTS_HTML
-meta T_HTML_SHORT_HTML_1024 T_HTML_LENGTH_1024 && __TAG_EXISTS_HTML
-
-body __TAG_EXISTS_BR eval:html_tag_exists('br')
-meta T_HTML_SHORT_BR_32 T_HTML_LENGTH_32 && __TAG_EXISTS_BR
-meta T_HTML_SHORT_BR_64 T_HTML_LENGTH_64 && __TAG_EXISTS_BR
-meta T_HTML_SHORT_BR_128 T_HTML_LENGTH_128 && __TAG_EXISTS_BR
-meta T_HTML_SHORT_BR_256 T_HTML_LENGTH_256 && __TAG_EXISTS_BR
-meta T_HTML_SHORT_BR_384 T_HTML_LENGTH_384 && __TAG_EXISTS_BR
-meta T_HTML_SHORT_BR_512 T_HTML_LENGTH_512 && __TAG_EXISTS_BR
-meta T_HTML_SHORT_BR_1024 T_HTML_LENGTH_1024 && __TAG_EXISTS_BR
-
-body __TAG_EXISTS_A eval:html_tag_exists('a')
-meta T_HTML_SHORT_A_32 T_HTML_LENGTH_32 && __TAG_EXISTS_A
-meta T_HTML_SHORT_A_64 T_HTML_LENGTH_64 && __TAG_EXISTS_A
-meta T_HTML_SHORT_A_128 T_HTML_LENGTH_128 && __TAG_EXISTS_A
-meta T_HTML_SHORT_A_256 T_HTML_LENGTH_256 && __TAG_EXISTS_A
-meta T_HTML_SHORT_A_384 T_HTML_LENGTH_384 && __TAG_EXISTS_A
-meta T_HTML_SHORT_A_512 T_HTML_LENGTH_512 && __TAG_EXISTS_A
-meta T_HTML_SHORT_A_1024 T_HTML_LENGTH_1024 && __TAG_EXISTS_A
-
# HTML with very few tags
body T_HTML_TAGS_2 eval:html_eval('tags', '< 2')
body T_HTML_TAGS_4 eval:html_eval('tags', '< 4')
+body T_HTML_TAGS_6 eval:html_eval('tags', '< 6')
body T_HTML_TAGS_8 eval:html_eval('tags', '< 8')
-body T_HTML_TAGS_10 eval:html_eval('tags', '< 10')
-body T_HTML_TAGS_12 eval:html_eval('tags', '< 12')
-body T_HTML_TAGS_14 eval:html_eval('tags', '< 14')
-body T_HTML_TAGS_16 eval:html_eval('tags', '< 16')
-body T_HTML_TAGS_32 eval:html_eval('tags', '< 32')
-body T_HTML_TAGS_64 eval:html_eval('tags', '< 64')

-# possible replacements for HTML_IMAGE_ONLY
+# looking-good-so-far replacements for HTML_IMAGE_ONLY
# these use raw HTML length rather than rendered word length
-body T_HTML_IMAGE_ONLY2_02 eval:html_image_only2('0000','0200')
-body T_HTML_IMAGE_ONLY2_04 eval:html_image_only2('0200','0400')
-body T_HTML_IMAGE_ONLY2_06 eval:html_image_only2('0400','0600')
-body T_HTML_IMAGE_ONLY2_08 eval:html_image_only2('0600','0800')
-body T_HTML_IMAGE_ONLY2_10 eval:html_image_only2('0800','1000')
-body T_HTML_IMAGE_ONLY2_12 eval:html_image_only2('1000','1200')
-
-body T_HTML_IMAGE_ONLY3_02 eval:html_image_only3('0000','0200')
-body T_HTML_IMAGE_ONLY3_04 eval:html_image_only3('0200','0400')
-body T_HTML_IMAGE_ONLY3_06 eval:html_image_only3('0400','0600')
-body T_HTML_IMAGE_ONLY3_08 eval:html_image_only3('0600','0800')
-body T_HTML_IMAGE_ONLY3_10 eval:html_image_only3('0800','1000')
-body T_HTML_IMAGE_ONLY3_12 eval:html_image_only3('1000','1200')
+body T_HTML_IMAGE_ONLY2_04 eval:html_image_only2('0000','0400')
+body T_HTML_IMAGE_ONLY2_08 eval:html_image_only2('0400','0800')
+body T_HTML_IMAGE_ONLY2_12 eval:html_image_only2('0800','1200')
+body T_HTML_IMAGE_ONLY2_16 eval:html_image_only2('1200','1600')
+body T_HTML_IMAGE_ONLY2_20 eval:html_image_only2('1600','2000')
+body T_HTML_IMAGE_ONLY2_24 eval:html_image_only2('2000','2400')