Mailing List Archive

svn commit: rev 6329 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules
Author: quinlan
Date: Tue Jan 27 14:50:11 2004
New Revision: 6329

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
Log:
bug 2211: tests to detect invalid HTML tags


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Tue Jan 27 14:50:11 2004
@@ -30,7 +30,7 @@
my @EXPORT_OK = qw();

use HTML::Parser 3.24 ();
-use vars qw($re_start $re_loose $re_strict);
+use vars qw($re_start $re_loose $re_strict $re_other);

# elements that trigger HTML rendering in text/plain in some mail clients
# (repeats ones listed in $re_strict)
@@ -43,6 +43,9 @@
# loose list of HTML events
my $events = 'on(?:activate|afterupdate|beforeactivate|beforecopy|beforecut|beforedeactivate|beforeeditfocus|beforepaste|beforeupdate|blur|change|click|contextmenu|controlselect|copy|cut|dblclick|deactivate|errorupdate|focus|focusin|focusout|help|keydown|keypress|keyup|load|losecapture|mousedown|mouseenter|mouseleave|mousemove|mouseout|mouseover|mouseup|mousewheel|move|moveend|movestart|paste|propertychange|readystatechange|reset|resize|resizeend|resizestart|select|submit|timeerror|unload)';

+# other non-standard tags that are still counted as recognized elements
+$re_other = 'o:\w+/?|x-sigsep|x-tab';
+
my %tested_colors;

sub new {
@@ -146,10 +149,15 @@
sub html_tag {
my ($self, $tag, $attr, $num) = @_;

- $self->{html}{"inside_$tag"} += $num;
-
- $self->{html}{elements}++ if $tag =~ /^(?:$re_strict|$re_loose)$/io;
+ if ($tag =~ /^(?:$re_strict|$re_loose|$re_other)$/io) {
+ $self->{html}{elements}++;
+ $self->{html}{elements_seen}++ if !exists $self->{html}{"inside_$tag"};
+ }
$self->{html}{tags}++;
+ $self->{html}{tags_seen}++ if !exists $self->{html}{"inside_$tag"};
+
+ $self->{html}{"inside_$tag"} += $num;
+ $self->{html}{"inside_$tag"} = 0 if $self->{html}{"inside_$tag"} < 0;

if ($tag =~ /^(?:body|table|tr|th|td)$/) {
$self->html_bgcolor($tag, $attr, $num);

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm Tue Jan 27 14:50:11 2004
@@ -293,6 +293,12 @@
if (exists $self->{html_results}{total_comment_length} && $self->{html_results}{non_uri_len} > 0) {
$self->{html_results}{total_comment_ratio} = $self->{html_results}{total_comment_length} / $self->{html_results}{non_uri_len};
}
+ if (exists $self->{html_results}{tags}) {
+ $self->{html_results}{t_bad_tag_ratio} = ($self->{html_results}{tags} - $self->{html_results}{elements}) / $self->{html_results}{tags};
+ $self->{html_results}{t_bad_tag_count} = ($self->{html_results}{tags} - $self->{html_results}{elements});
+ $self->{html_results}{t_bad_tag_unique_ratio} = ($self->{html_results}{tags_seen} - $self->{html_results}{elements_seen}) / $self->{html_results}{tags_seen};
+ $self->{html_results}{t_bad_tag_unique_count} = ($self->{html_results}{tags_seen} - $self->{html_results}{elements_seen});
+ }
}
else {
$self->{'rendered_type'} = $self->{'type'};

Modified: incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf (original)
+++ incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf Tue Jan 27 14:50:11 2004
@@ -317,6 +317,46 @@
body T_HTML_FONT_TINY eval:html_test('t_tiny_font')
describe T_HTML_FONT_TINY HTML has a tiny font

+# HTML bad tag ratio: (tags - recognized elements) / tags, in 0.10-wide buckets
+body T_HTML_BADTAGS_00_10 eval:html_range('t_bad_tag_ratio','0.00','0.10')
+body T_HTML_BADTAGS_10_20 eval:html_range('t_bad_tag_ratio','0.10','0.20')
+body T_HTML_BADTAGS_20_30 eval:html_range('t_bad_tag_ratio','0.20','0.30')
+body T_HTML_BADTAGS_30_40 eval:html_range('t_bad_tag_ratio','0.30','0.40')
+body T_HTML_BADTAGS_40_50 eval:html_range('t_bad_tag_ratio','0.40','0.50')
+body T_HTML_BADTAGS_50_60 eval:html_range('t_bad_tag_ratio','0.50','0.60')
+body T_HTML_BADTAGS_60_70 eval:html_range('t_bad_tag_ratio','0.60','0.70')
+body T_HTML_BADTAGS_70_80 eval:html_range('t_bad_tag_ratio','0.70','0.80')
+body T_HTML_BADTAGS_80_90 eval:html_range('t_bad_tag_ratio','0.80','0.90')
+body T_HTML_BADTAGS_90_100 eval:html_range('t_bad_tag_ratio','0.90','1.00')
+
+# HTML bad tag count: tags - recognized elements, in doubling buckets (GT_n = bucket starting at n)
+body T_HTML_BADTAGS_GT_0 eval:html_range('t_bad_tag_count','0','4')
+body T_HTML_BADTAGS_GT_4 eval:html_range('t_bad_tag_count','4','8')
+body T_HTML_BADTAGS_GT_8 eval:html_range('t_bad_tag_count','8','16')
+body T_HTML_BADTAGS_GT_16 eval:html_range('t_bad_tag_count','16','32')
+body T_HTML_BADTAGS_GT_32 eval:html_range('t_bad_tag_count','32','64')
+body T_HTML_BADTAGS_GT_64 eval:html_range('t_bad_tag_count','64','inf')
+
+# HTML bad tag ratio for unique tags (each distinct tag name counted once), in 0.10-wide buckets
+body T_HTML_BADTAGS_U_00_10 eval:html_range('t_bad_tag_unique_ratio','0.00','0.10')
+body T_HTML_BADTAGS_U_10_20 eval:html_range('t_bad_tag_unique_ratio','0.10','0.20')
+body T_HTML_BADTAGS_U_20_30 eval:html_range('t_bad_tag_unique_ratio','0.20','0.30')
+body T_HTML_BADTAGS_U_30_40 eval:html_range('t_bad_tag_unique_ratio','0.30','0.40')
+body T_HTML_BADTAGS_U_40_50 eval:html_range('t_bad_tag_unique_ratio','0.40','0.50')
+body T_HTML_BADTAGS_U_50_60 eval:html_range('t_bad_tag_unique_ratio','0.50','0.60')
+body T_HTML_BADTAGS_U_60_70 eval:html_range('t_bad_tag_unique_ratio','0.60','0.70')
+body T_HTML_BADTAGS_U_70_80 eval:html_range('t_bad_tag_unique_ratio','0.70','0.80')
+body T_HTML_BADTAGS_U_80_90 eval:html_range('t_bad_tag_unique_ratio','0.80','0.90')
+body T_HTML_BADTAGS_U_90_100 eval:html_range('t_bad_tag_unique_ratio','0.90','1.00')
+
+# HTML bad tag count for unique tags (each distinct tag name counted once), in doubling buckets
+body T_HTML_BADTAGS_U_GT_0 eval:html_range('t_bad_tag_unique_count','0','4')
+body T_HTML_BADTAGS_U_GT_4 eval:html_range('t_bad_tag_unique_count','4','8')
+body T_HTML_BADTAGS_U_GT_8 eval:html_range('t_bad_tag_unique_count','8','16')
+body T_HTML_BADTAGS_U_GT_16 eval:html_range('t_bad_tag_unique_count','16','32')
+body T_HTML_BADTAGS_U_GT_32 eval:html_range('t_bad_tag_unique_count','32','64')
+body T_HTML_BADTAGS_U_GT_64 eval:html_range('t_bad_tag_unique_count','64','inf')
+
# more portable replacement for RCVD_NUMERIC_HELO that doesn't rely on
# Received headers using "helo=" prefix
header T_RCVD_NUMERIC_HELO X-Spam-Relays-Untrusted =~ / helo=\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} /