Mailing List Archive

svn commit: rev 9858 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules
Author: quinlan
Date: Sun Apr 4 00:46:16 2004
New Revision: 9858

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
incubator/spamassassin/trunk/rules/20_html_tests.cf
incubator/spamassassin/trunk/rules/70_testing.cf
Log:
replace HTML_IMAGE_ONLY* with new rules
promote new HTML_SHORT_* rules
remove T_HTML_TAGS_* for poor performance
remove T_HTML_SHORT_IMG_* since it is the same as HTML_IMAGE_ONLY


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Sun Apr 4 00:46:16 2004
@@ -3167,10 +3167,9 @@
my ($self, undef, $min, $max) = @_;

return (exists $self->{html}{"inside_img"} &&
- exists $self->{html}{non_space_len} &&
- $self->{html}{non_space_len} > $min &&
- $self->{html}{non_space_len} <= $max &&
- $self->get('X-eGroups-Return') !~ /^sentto-.*\@returns\.groups\.yahoo\.com$/);
+ exists $self->{html}{length} &&
+ $self->{html}{length} > $min &&
+ $self->{html}{length} <= $max);
}

sub html_image_ratio {
@@ -3181,15 +3180,6 @@
$self->{html}{image_area} > 0);
my $ratio = $self->{html}{non_space_len} / $self->{html}{image_area};
return ($ratio > $min && $ratio <= $max);
-}
-
-sub html_image_only2 {
- my ($self, undef, $min, $max) = @_;
-
- return (exists $self->{html}{"inside_img"} &&
- exists $self->{html}{length} &&
- $self->{html}{length} > $min &&
- $self->{html}{length} <= $max);
}

sub html_charset_faraway {

Modified: incubator/spamassassin/trunk/rules/20_html_tests.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/20_html_tests.cf (original)
+++ incubator/spamassassin/trunk/rules/20_html_tests.cf Sun Apr 4 00:46:16 2004
@@ -167,19 +167,19 @@
describe HTML_IMAGE_AREA_08 HTML has 8-9 kilopixels of images
describe HTML_IMAGE_AREA_09 HTML has over 9 kilopixels of images

-# HTML_IMAGE_ONLY - not much text with images (absolute)
-body HTML_IMAGE_ONLY_02 eval:html_image_only('0000','0200')
-body HTML_IMAGE_ONLY_04 eval:html_image_only('0200','0400')
-body HTML_IMAGE_ONLY_06 eval:html_image_only('0400','0600')
-body HTML_IMAGE_ONLY_08 eval:html_image_only('0600','0800')
-body HTML_IMAGE_ONLY_10 eval:html_image_only('0800','1000')
-body HTML_IMAGE_ONLY_12 eval:html_image_only('1000','1200')
-describe HTML_IMAGE_ONLY_02 HTML: images with 0-200 bytes of words
-describe HTML_IMAGE_ONLY_04 HTML: images with 200-400 bytes of words
-describe HTML_IMAGE_ONLY_06 HTML: images with 400-600 bytes of words
-describe HTML_IMAGE_ONLY_08 HTML: images with 600-800 bytes of words
-describe HTML_IMAGE_ONLY_10 HTML: images with 800-1000 bytes of words
-describe HTML_IMAGE_ONLY_12 HTML: images with 1000-1200 bytes of words
+# HTML_IMAGE_ONLY - not much raw HTML with images (absolute)
+body HTML_IMAGE_ONLY_04 eval:html_image_only('0000','0400')
+body HTML_IMAGE_ONLY_08 eval:html_image_only('0400','0800')
+body HTML_IMAGE_ONLY_12 eval:html_image_only('0800','1200')
+body HTML_IMAGE_ONLY_16 eval:html_image_only('1200','1600')
+body HTML_IMAGE_ONLY_20 eval:html_image_only('1600','2000')
+body HTML_IMAGE_ONLY_24 eval:html_image_only('2000','2400')
+describe HTML_IMAGE_ONLY_04 HTML: images with 0-400 bytes of words
+describe HTML_IMAGE_ONLY_08 HTML: images with 400-800 bytes of words
+describe HTML_IMAGE_ONLY_12 HTML: images with 800-1200 bytes of words
+describe HTML_IMAGE_ONLY_16 HTML: images with 1200-1600 bytes of words
+describe HTML_IMAGE_ONLY_20 HTML: images with 1600-2000 bytes of words
+describe HTML_IMAGE_ONLY_24 HTML: images with 2000-2400 bytes of words

# HTML_IMAGE_RATIO - more image area than text (ratio)
body HTML_IMAGE_RATIO_02 eval:html_image_ratio('0.000','0.002')
@@ -312,6 +312,20 @@
describe HTML_NONELEMENT_70_80 70% to 80% of HTML elements are non-standard
describe HTML_NONELEMENT_80_90 80% to 90% of HTML elements are non-standard
describe HTML_NONELEMENT_90_100 90% to 100% of HTML elements are non-standard
+
+# short HTML messages with certain attributes
+body HTML_SHORT_LENGTH eval:html_eval('length', '< 170')
+describe HTML_SHORT_LENGTH HTML is extremely short
+
+body __HTML_LENGTH_512 eval:html_eval('length', '< 512')
+body __COMMENT_EXISTS eval:html_text('comment', '=~ /<!.*?>/')
+meta HTML_SHORT_COMMENT (__HTML_LENGTH_512 && __COMMENT_EXISTS)
+describe HTML_SHORT_COMMENT HTML is very short with HTML comments
+
+body __HTML_LENGTH_384 eval:html_eval('length', '< 384')
+body __TAG_EXISTS_CENTER eval:html_tag_exists('center')
+meta HTML_SHORT_CENTER (__HTML_LENGTH_384 && __TAG_EXISTS_CENTER)
+describe HTML_SHORT_CENTER HTML is very short with CENTER tag

body HTML_TITLE_EMPTY eval:html_text('title', '!~ /\S/s')
describe HTML_TITLE_EMPTY HTML title contains no text

Modified: incubator/spamassassin/trunk/rules/70_testing.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_testing.cf (original)
+++ incubator/spamassassin/trunk/rules/70_testing.cf Sun Apr 4 00:46:16 2004
@@ -499,79 +499,3 @@
header __HAS_SUBJECT exists:Subject
meta T_MISSING_SUBJECT !__HAS_SUBJECT

-# short raw HTML length
-# these are used below
-body T_HTML_LENGTH_32 eval:html_eval('length', '< 32')
-body T_HTML_LENGTH_64 eval:html_eval('length', '< 64')
-body T_HTML_LENGTH_128 eval:html_eval('length', '< 128')
-body T_HTML_LENGTH_256 eval:html_eval('length', '< 256')
-body T_HTML_LENGTH_384 eval:html_eval('length', '< 384')
-body T_HTML_LENGTH_512 eval:html_eval('length', '< 512')
-body T_HTML_LENGTH_768 eval:html_eval('length', '< 768')
-body T_HTML_LENGTH_1024 eval:html_eval('length', '< 1024')
-# probe a bit more in the sweet spot between 128 and 256
-body T_HTML_LENGTH_130 eval:html_eval('length', '< 130')
-body T_HTML_LENGTH_140 eval:html_eval('length', '< 140')
-body T_HTML_LENGTH_150 eval:html_eval('length', '< 150')
-body T_HTML_LENGTH_160 eval:html_eval('length', '< 160')
-body T_HTML_LENGTH_170 eval:html_eval('length', '< 170')
-body T_HTML_LENGTH_180 eval:html_eval('length', '< 180')
-body T_HTML_LENGTH_180 eval:html_eval('length', '< 180')
-body T_HTML_LENGTH_190 eval:html_eval('length', '< 190')
-body T_HTML_LENGTH_200 eval:html_eval('length', '< 200')
-body T_HTML_LENGTH_210 eval:html_eval('length', '< 210')
-body T_HTML_LENGTH_220 eval:html_eval('length', '< 220')
-body T_HTML_LENGTH_230 eval:html_eval('length', '< 230')
-body T_HTML_LENGTH_240 eval:html_eval('length', '< 240')
-body T_HTML_LENGTH_250 eval:html_eval('length', '< 250')
-
-# rules combining short length with some element that is not
-# typical of short raw HTML
-# COMMENT looks best so far
-# IMG is good, FPs due to web bugs like http://www.msgtag.com/
-# CENTER is good too, FPs seem more random (inline attachments)
-
-body __COMMENT_EXISTS eval:html_text('comment', '=~ /<!.*?>/')
-meta T_HTML_SHORT_COMMENT_32 T_HTML_LENGTH_32 && __COMMENT_EXISTS
-meta T_HTML_SHORT_COMMENT_64 T_HTML_LENGTH_64 && __COMMENT_EXISTS
-meta T_HTML_SHORT_COMMENT_128 T_HTML_LENGTH_128 && __COMMENT_EXISTS
-meta T_HTML_SHORT_COMMENT_256 T_HTML_LENGTH_256 && __COMMENT_EXISTS
-meta T_HTML_SHORT_COMMENT_384 T_HTML_LENGTH_384 && __COMMENT_EXISTS
-meta T_HTML_SHORT_COMMENT_512 T_HTML_LENGTH_512 && __COMMENT_EXISTS
-meta T_HTML_SHORT_COMMENT_768 T_HTML_LENGTH_768 && __COMMENT_EXISTS
-meta T_HTML_SHORT_COMMENT_1024 T_HTML_LENGTH_1024 && __COMMENT_EXISTS
-
-body __TAG_EXISTS_IMG eval:html_tag_exists('img')
-meta T_HTML_SHORT_IMG_32 T_HTML_LENGTH_32 && __TAG_EXISTS_IMG
-meta T_HTML_SHORT_IMG_64 T_HTML_LENGTH_64 && __TAG_EXISTS_IMG
-meta T_HTML_SHORT_IMG_128 T_HTML_LENGTH_128 && __TAG_EXISTS_IMG
-meta T_HTML_SHORT_IMG_256 T_HTML_LENGTH_256 && __TAG_EXISTS_IMG
-meta T_HTML_SHORT_IMG_384 T_HTML_LENGTH_384 && __TAG_EXISTS_IMG
-meta T_HTML_SHORT_IMG_512 T_HTML_LENGTH_512 && __TAG_EXISTS_IMG
-meta T_HTML_SHORT_IMG_768 T_HTML_LENGTH_768 && __TAG_EXISTS_IMG
-meta T_HTML_SHORT_IMG_1024 T_HTML_LENGTH_1024 && __TAG_EXISTS_IMG
-
-body __TAG_EXISTS_CENTER eval:html_tag_exists('center')
-meta T_HTML_SHORT_CENTER_32 T_HTML_LENGTH_32 && __TAG_EXISTS_CENTER
-meta T_HTML_SHORT_CENTER_64 T_HTML_LENGTH_64 && __TAG_EXISTS_CENTER
-meta T_HTML_SHORT_CENTER_128 T_HTML_LENGTH_128 && __TAG_EXISTS_CENTER
-meta T_HTML_SHORT_CENTER_256 T_HTML_LENGTH_256 && __TAG_EXISTS_CENTER
-meta T_HTML_SHORT_CENTER_384 T_HTML_LENGTH_384 && __TAG_EXISTS_CENTER
-meta T_HTML_SHORT_CENTER_512 T_HTML_LENGTH_512 && __TAG_EXISTS_CENTER
-meta T_HTML_SHORT_CENTER_768 T_HTML_LENGTH_768 && __TAG_EXISTS_CENTER
-meta T_HTML_SHORT_CENTER_1024 T_HTML_LENGTH_1024 && __TAG_EXISTS_CENTER
-
-# HTML with very few tags
-body T_HTML_TAGS_2 eval:html_eval('tags', '< 2')
-body T_HTML_TAGS_4 eval:html_eval('tags', '< 4')
-body T_HTML_TAGS_6 eval:html_eval('tags', '< 6')
-body T_HTML_TAGS_8 eval:html_eval('tags', '< 8')
-
-# looking-good-so-far replacements for HTML_IMAGE_ONLY
-# these use raw HTML length rather than rendered word length
-body T_HTML_IMAGE_ONLY2_04 eval:html_image_only2('0000','0400')
-body T_HTML_IMAGE_ONLY2_08 eval:html_image_only2('0400','0800')
-body T_HTML_IMAGE_ONLY2_12 eval:html_image_only2('0800','1200')
-body T_HTML_IMAGE_ONLY2_16 eval:html_image_only2('1200','1600')
-body T_HTML_IMAGE_ONLY2_20 eval:html_image_only2('1600','2000')
-body T_HTML_IMAGE_ONLY2_24 eval:html_image_only2('2000','2400')