Mailing List Archive

svn commit: rev 6710 - incubator/spamassassin/trunk/lib/Mail/SpamAssassin
Author: quinlan
Date: Tue Feb 17 12:55:39 2004
New Revision: 6710

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
Log:
rewrite font attribute handling:
- fix invisibility FPs in Theo's corpus
- add handling of text attributes in addition to bgcolor and fgcolor


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Tue Feb 17 12:55:39 2004
@@ -46,6 +46,19 @@
# other non-standard tags
$re_other = 'o:\w+/?|x-sigsep|x-tab';

+# style attributes
+my %ok_attribute = (
+ text => [qw(body)],
+ color => [qw(basefont font)],
+ bgcolor => [qw(body table tr td th marquee)],
+ face => [qw(basefont font)],
+ size => [qw(basefont font)],
+ link => [qw(body)],
+ alink => [qw(body)],
+ vlink => [qw(body)],
+ background => [qw(body marquee)],
+ );
+
my %tested_colors;

sub new {
@@ -62,13 +75,12 @@
sub html_init {
my ($self) = @_;

- push @{ $self->{bgcolor_color} }, "#ffffff";
- push @{ $self->{bgcolor_tag} }, "default";
- push @{ $self->{fgcolor_color} }, "#000000";
- push @{ $self->{fgcolor_tag} }, "default";
- undef %tested_colors;
-
- return $self;
+ undef $self->{text_style};
+ my %default = (tag => "default",
+ fgcolor => "#000000",
+ bgcolor => "#ffffff",
+ size => 3);
+ push @{ $self->{text_style} }, \%default;
}

sub get_results {
@@ -159,11 +171,9 @@
$self->{html}{"inside_$tag"} += $num;
$self->{html}{"inside_$tag"} = 0 if $self->{html}{"inside_$tag"} < 0;

- if ($tag =~ /^(?:body|table|tr|th|td)$/) {
- $self->html_bgcolor($tag, $attr, $num);
- }
- if ($tag =~ /^(?:body|font)$/) {
- $self->html_fgcolor($tag, $attr, $num);
+ # TODO: cover other changes
+ if ($tag =~ /^(?:body|font|table|tr|th|td|big|small)$/) {
+ $self->text_style($tag, $attr, $num);
}

if ($num == 1) {
@@ -452,131 +462,184 @@
return $html_color{$_[0]} || $name_color{$_[0]} || $_[0];
}

-sub pop_bgcolor {
+# this might not be quite right, may need to pay attention to table nesting
+sub close_tag_tr {
my ($self) = @_;

- pop @{ $self->{bgcolor_color} };
- pop @{ $self->{bgcolor_tag} };
-}
-
-sub html_bgcolor {
- my ($self, $tag, $attr, $num) = @_;
+ # don't close if never opened
+ return if !grep { $_->{tag} eq "tr" } @{ $self->{text_style} };

- if ($num == 1) {
- # close elements with optional end tags
- if ($tag eq "body") {
- # compromise between HTML browsers generally only using first
- # body and some messages including multiple HTML attachments:
- # pop everything except first body color
- while ($self->{bgcolor_tag}[-1] !~ /^(?:default|body)$/) {
- $self->pop_bgcolor();
- }
- }
- if ($tag eq "tr") {
- while ($self->{bgcolor_tag}[-1] =~ /^t[hd]$/) {
- $self->pop_bgcolor();
- }
- $self->pop_bgcolor() if $self->{bgcolor_tag}[-1] eq "tr";
- }
- elsif ($tag =~ /^t[hd]$/) {
- $self->pop_bgcolor() if $self->{bgcolor_tag}[-1] =~ /^t[hd]$/;
- }
- # figure out new bgcolor
- my $bgcolor;
- if (exists $attr->{bgcolor}) {
- $bgcolor = name_to_rgb(lc($attr->{bgcolor}));
+ my $tag;
+ while (@{ $self->{text_style} } && ($tag = $self->{text_style}[-1]->{tag})) {
+ if ($tag =~ /^(?:font|td|tr)$/) {
+ pop @{ $self->{text_style} };
}
else {
- $bgcolor = $self->{bgcolor_color}[-1];
+ last;
}
- # tests
- if ($tag eq "body" && $bgcolor !~ /^\#?ffffff$/) {
- $self->{html}{bgcolor_nonwhite} = 1;
- }
- # push new bgcolor
- push @{ $self->{bgcolor_color} }, $bgcolor;
- push @{ $self->{bgcolor_tag} }, $tag;
}
- else {
- # close elements
- if ($tag eq "body") {
- $self->pop_bgcolor() if $self->{bgcolor_tag}[-1] eq "body";
- }
- elsif ($tag eq "table") {
- while ($self->{bgcolor_tag}[-1] =~ /^t[rhd]$/) {
- $self->pop_bgcolor();
- }
- $self->pop_bgcolor() if $self->{bgcolor_tag}[-1] eq "table";
- }
- elsif ($tag eq "tr") {
- while ($self->{bgcolor_tag}[-1] =~ /^t[hd]$/) {
- $self->pop_bgcolor();
- }
- $self->pop_bgcolor() if $self->{bgcolor_tag}[-1] eq "tr";
+}
+
+# this might not be quite right, may need to pay attention to table nesting
+sub close_tag_td {
+ my ($self) = @_;
+
+ # don't close if never opened
+ return if !grep { $_->{tag} eq "td" } @{ $self->{text_style} };
+
+ my $tag;
+ while (@{ $self->{text_style} } && ($tag = $self->{text_style}[-1]->{tag})) {
+ if ($tag =~ /^(?:font|td)$/) {
+ pop @{ $self->{text_style} };
}
- elsif ($tag =~ /^t[hd]$/) {
- $self->pop_bgcolor() if $self->{bgcolor_tag}[-1] =~ /^t[hd]$/;
+ else {
+ last;
}
}
}

-sub pop_fgcolor {
- my ($self) = @_;
+sub close_tag {
+ my ($self, $tag) = @_;

- pop @{ $self->{fgcolor_color} };
- pop @{ $self->{fgcolor_tag} };
+ # don't close if never opened
+ return if !grep { $_->{tag} eq $tag } @{ $self->{text_style} };
+
+ # close everything up to and including tag
+ while (my %current = %{ pop @{ $self->{text_style} } }) {
+ last if $current{tag} eq $tag;
+ }
}

-sub html_fgcolor {
+# body, font, table, tr, th, td, big, small
+# TODO: implement <basefont> support
+sub text_style {
my ($self, $tag, $attr, $num) = @_;

+ # treat <th> as <td>
+ $tag = "td" if $tag eq "th";
+
+ # open
if ($num == 1) {
+ # HTML browsers generally only use first <body> for colors,
+ # so only push if we haven't seen a body tag yet
if ($tag eq "body") {
- # compromise between HTML browsers generally only using first
- # body and some messages including multiple HTML attachments:
- # pop everything except first body color
- while ($self->{fgcolor_tag}[-1] !~ /^(?:default|body)$/) {
- $self->pop_fgcolor();
+ # TODO: skip if we've already seen body
+ }
+
+ # close elements with optional end tags
+ $self->close_tag_tr() if $tag eq "tr";
+ $self->close_tag_td() if $tag eq "td";
+
+ # copy current text state
+ my %new = %{ $self->{text_style}[-1] };
+
+ # change tag name!
+ $new{tag} = $tag;
+
+ # big and small tags
+ if ($tag eq "big") {
+ $new{size} += 1;
+ push @{ $self->{text_style} }, \%new;
+ return;
+ }
+ if ($tag eq "small") {
+ $new{size} -= 1;
+ push @{ $self->{text_style} }, \%new;
+ return;
+ }
+
+ # tag attributes
+ for my $name (keys %$attr) {
+ next unless (grep { $_ eq $tag } @{ $ok_attribute{$name} });
+ if ($name =~ /^(?:text|color)$/) {
+ # two different names for text color
+ $new{fgcolor} = name_to_rgb(lc($attr->{$name}));
+ $self->html_font_color_tests($attr->{$name});
+ }
+ elsif ($name eq "size" && $attr->{size} =~ /^\s*([+-]\d+)/) {
+ # relative font size
+ $new{size} += $1;
+ }
+ else {
+ # overwrite
+ if ($name eq "bgcolor") {
+ $attr->{bgcolor} = name_to_rgb(lc($attr->{bgcolor}));
+ # one test
+ if ($tag eq "body" && $attr->{bgcolor} !~ /^\#?ffffff$/) {
+ $self->{html}{bgcolor_nonwhite} = 1;
+ }
+ }
+ if ($name eq "size" && $attr->{size} !~ /^\s*([+-])(\d+)/) {
+ # attribute is malformed
+ }
+ else {
+ # attribute is probably okay
+ $new{$name} = $attr->{$name};
+ }
}
}
- # figure out new fgcolor
- my $fgcolor;
- if ($tag eq "font" && exists $attr->{color}) {
- $fgcolor = name_to_rgb(lc($attr->{color}));
+ push @{ $self->{text_style} }, \%new;
+ }
+ # explicitly close a tag
+ else {
+ if ($tag ne "body") {
+ # don't close body since browsers seem to render text after </body>
+ $self->close_tag($tag);
}
- elsif ($tag eq "body" && exists $attr->{text}) {
- $fgcolor = name_to_rgb(lc($attr->{text}));
+ }
+}
+
+sub html_font_color_tests {
+ my ($self, $color) = @_;
+
+ my $bg = $self->{text_style}[-1]->{fgcolor};
+ my $fg = lc($color);
+
+ if ($fg =~ /^\#?[0-9a-f]{6}$/ && $fg !~ /^\#?(?:00|33|66|80|99|cc|ff){3}$/) {
+ $self->{html}{font_color_unsafe} = 1;
+ }
+ if ($fg !~ /^\#?[0-9a-f]{6}$/ && !exists $html_color{$fg}) {
+ $self->{html}{font_color_name} = 1;
+ }
+ if ($fg =~ /^\#?([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/) {
+ my ($h, $s, $v) = rgb_to_hsv(hex($1), hex($2), hex($3));
+ if (!defined($h)) {
+ $self->{html}{font_gray} = 1 unless ($v == 0 || $v == 255);
}
- else {
- $fgcolor = $self->{fgcolor_color}[-1];
+ elsif ($h < 30 || $h >= 330) {
+ $self->{html}{font_red} = 1;
}
- # push new fgcolor
- push @{ $self->{fgcolor_color} }, $fgcolor;
- push @{ $self->{fgcolor_tag} }, $tag;
- }
- else {
- # close elements
- if ($tag eq "body") {
- $self->pop_fgcolor() if $self->{fgcolor_tag}[-1] eq "body";
+ elsif ($h < 90) {
+ $self->{html}{font_yellow} = 1;
+ }
+ elsif ($h < 150) {
+ $self->{html}{font_green} = 1;
+ }
+ elsif ($h < 210) {
+ $self->{html}{font_cyan} = 1;
}
- if ($tag eq "font") {
- $self->pop_fgcolor() if $self->{fgcolor_tag}[-1] eq "font";
+ elsif ($h < 270) {
+ $self->{html}{font_blue} = 1;
+ }
+ elsif ($h < 330) {
+ $self->{html}{font_magenta} = 1;
}
}
+ else {
+ $self->{html}{font_color_unknown} = 1;
+ }
}

sub html_font_invisible {
my ($self, $text) = @_;

- my $fg = $self->{fgcolor_color}[-1];
- my $bg = $self->{bgcolor_color}[-1];
-
- return if exists $tested_colors{"$fg\000$bg"};
- $tested_colors{"$fg\000$bg"}++;
+ my $fg = $self->{text_style}[-1]->{fgcolor};
+ my $bg = $self->{text_style}[-1]->{bgcolor};

# invisibility
if (substr($fg,-6) eq substr($bg,-6)) {
$self->{html}{font_invisible} = 1;
+ return 0;
}
# near-invisibility
elsif ($fg =~ /^\#?([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/) {
@@ -602,6 +665,7 @@
}
}
}
+ return 1;
}

sub html_tests {
@@ -636,45 +700,6 @@
($size =~ /\-(\d+)/ && $1 >= 3));
$self->{html}{big_font} = 1 if (($size =~ /^\s*(\d+)/ && $1 > 3) ||
($size =~ /\+(\d+)/ && $1 >= 1));
- }
- if ($tag eq "font" && exists $attr->{color}) {
- my $bg = $self->{bgcolor_color}[-1];
- my $fg = lc($attr->{color});
- if ($fg =~ /^\#?[0-9a-f]{6}$/ && $fg !~ /^\#?(?:00|33|66|80|99|cc|ff){3}$/)
- {
- $self->{html}{font_color_unsafe} = 1;
- }
- if ($fg !~ /^\#?[0-9a-f]{6}$/ && !exists $html_color{$fg})
- {
- $self->{html}{font_color_name} = 1;
- }
- if ($fg =~ /^\#?([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/) {
- my ($h, $s, $v) = rgb_to_hsv(hex($1), hex($2), hex($3));
- if (!defined($h)) {
- $self->{html}{font_gray} = 1 unless ($v == 0 || $v == 255);
- }
- elsif ($h < 30 || $h >= 330) {
- $self->{html}{font_red} = 1;
- }
- elsif ($h < 90) {
- $self->{html}{font_yellow} = 1;
- }
- elsif ($h < 150) {
- $self->{html}{font_green} = 1;
- }
- elsif ($h < 210) {
- $self->{html}{font_cyan} = 1;
- }
- elsif ($h < 270) {
- $self->{html}{font_blue} = 1;
- }
- elsif ($h < 330) {
- $self->{html}{font_magenta} = 1;
- }
- }
- else {
- $self->{html}{font_color_unknown} = 1;
- }
}
if ($tag eq "font" && exists $attr->{face}) {
#print STDERR "FONT " . $attr->{face} . "\n";