Mailing List Archive

svn commit: r440453 - in /spamassassin/trunk/lib/Mail: SpamAssassin.pm SpamAssassin/Message.pm SpamAssassin/Message/Node.pm SpamAssassin/Plugin/BodyEval.pm SpamAssassin/Util.pm
Author: felicity
Date: Tue Sep 5 12:30:37 2006
New Revision: 440453

URL: http://svn.apache.org/viewvc?view=rev&rev=440453
Log:
Bug 5069: add a post_message_parse plugin call, and support for non-text parts that have rendered text. This will allow an OCR plugin to put the rendered text into the part and have it used automatically in body tests. Also, handle certain non-standard content-types, and limit internal rendering to just text/plain and text/html.

Modified:
spamassassin/trunk/lib/Mail/SpamAssassin.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin.pm?view=diff&rev=440453&r1=440452&r2=440453
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin.pm Tue Sep 5 12:30:37 2006
@@ -396,6 +396,12 @@
my($self, $message, $parsenow) = @_;
$self->init(1);
my $msg = Mail::SpamAssassin::Message->new({message=>$message, parsenow=>$parsenow, normalize=>$self->{conf}->{normalize_charset}});
+
+ # bug 5069: The goal here is to get rendering plugins to do things
+ # like OCR, convert doc and pdf to text, etc, though it could be anything
+ # that wants to process the message after it's been parsed.
+ $self->call_plugins("post_message_parse", { message => $msg });
+
return $msg;
}


Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm?view=diff&rev=440453&r1=440452&r2=440453
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm Tue Sep 5 12:30:37 2006
@@ -821,7 +821,7 @@
$self->{text_rendered} = [];

# Find all parts which are leaves
- my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
+ my @parts = $self->find_parts(qr/./,1);
return $self->{text_rendered} unless @parts;

# the html metadata may have already been set, so let's not bother if it's
@@ -833,10 +833,6 @@
for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
my $p = $parts[$pt];

- # bug 4843: skip text/calendar parts since they're usually an attachment
- # and not displayed
- next if ($p->{'type'} eq 'text/calendar');
-
# put a blank line between parts ...
$text .= "\n";

@@ -853,9 +849,6 @@
$self->{metadata}->{html} = $p->{html_results};
}
}
- else {
- $text .= $p->decode();
- }
}

# whitespace handling (warning: small changes have large effects!)
@@ -885,7 +878,7 @@
$self->{text_visible_rendered} = [];

# Find all parts which are leaves
- my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
+ my @parts = $self->find_parts(qr/./,1);
return $self->{text_visible_rendered} unless @parts;

# the html metadata may have already been set, so let's not bother if it's
@@ -897,10 +890,6 @@
for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
my $p = $parts[$pt];

- # bug 4843: skip text/calendar parts since they're usually an attachment
- # and not displayed
- next if ($p->{'type'} eq 'text/calendar');
-
# put a blank line between parts ...
$text .= "\n";

@@ -917,9 +906,6 @@
$self->{metadata}->{html} = $p->{html_results};
}
}
- else {
- $text .= $p->decode();
- }
}

# whitespace handling (warning: small changes have large effects!)
@@ -943,7 +929,7 @@
$self->{text_invisible_rendered} = [];

# Find all parts which are leaves
- my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
+ my @parts = $self->find_parts(qr/./,1);
return $self->{text_invisible_rendered} unless @parts;

# the html metadata may have already been set, so let's not bother if it's
@@ -954,10 +940,6 @@
my $text = '';
for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
my $p = $parts[$pt];
-
- # bug 4843: skip text/calendar parts since they're usually an attachment
- # and not displayed
- next if ($p->{'type'} eq 'text/calendar');

# put a blank line between parts ...
$text .= "\n" if ( $text );

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm?view=diff&rev=440453&r1=440452&r2=440453
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm Tue Sep 5 12:30:37 2006
@@ -381,17 +381,20 @@
sub rendered {
my ($self) = @_;

- # We don't render anything except text
- return(undef,undef) unless ( $self->{'type'} =~ /^text\b/i );
-
if (!exists $self->{rendered}) {
+ # We only know how to render text/plain and text/html ...
+ # Note: for bug 4843, make sure to skip text/calendar parts
+ # we also want to skip things like text/x-vcard
+ # text/x-aol is ignored here, but looks like text/html ...
+ return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
+
my $text = $self->_normalize($self->decode(), $self->{charset});
my $raw = length($text);

# render text/html always, or any other text|text/plain part as text/html
# based on a heuristic which simulates a certain common mail client
- if ($raw > 0 && ($self->{'type'} =~ m@^text/html\b@i ||
- ($self->{'type'} =~ m@^text(?:$|/plain)@i &&
+ if ($raw > 0 && ($self->{'type'} =~ m@^text/html$@i ||
+ ($self->{'type'} =~ m@^text/plain$@i &&
_html_render(substr($text, 0, 23)))))
{
$self->{rendered_type} = 'text/html';
@@ -416,8 +419,16 @@
}
else {
$self->{rendered_type} = $self->{type};
- $self->{rendered} = $self->{visible_rendered} = $text;
+ $self->{rendered} = $text;
}
+ }
+
+ # If these weren't set by anything else, go ahead and set them now...
+ if (!exists $self->{'visible_rendered'}) {
+ $self->{'visible_rendered'} = $self->{'rendered'};
+ }
+ if (!exists $self->{'invisible_rendered'}) {
+ $self->{'invisible_rendered'} = '';
}

return ($self->{rendered_type}, $self->{rendered});

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm?view=diff&rev=440453&r1=440452&r2=440453
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm Tue Sep 5 12:30:37 2006
@@ -133,6 +133,7 @@
foreach my $text (@txt) {
# we only care about the rendered version of the part
my ($type, $rnd) = $text->rendered();
+ next unless defined $type;

# parse the rendered text into tokens. assume they are whitespace
# separated, and ignore anything that doesn't have a word-character

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?view=diff&rev=440453&r1=440452&r2=440453
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Tue Sep 5 12:30:37 2006
@@ -794,16 +794,23 @@
# Get the actual MIME type out ...
# Note: the header content may not be whitespace unfolded, so make sure the
# REs do /s when appropriate.
+ # correct:
+ # Content-type: text/plain; charset=us-ascii
+ # missing a semi-colon, CT shouldn't have whitespace anyway:
+ # Content-type: text/plain charset=us-ascii
#
- $ct =~ s/^\s+//; # strip leading whitespace
- $ct =~ s/;.*$//s; # strip everything after first ';'
- $ct =~ s@^([^/]+(?:/[^/]*)?).*$@$1@s; # only something/something ...
+ $ct =~ s/^\s+//; # strip leading whitespace
+ $ct =~ s/;.*$//s; # strip everything after first ';'
+ $ct =~ s@^([^/]+(?:/[^/\s]*)?).*$@$1@s; # only something/something ...
# strip inappropriate chars
$ct =~ tr/\000-\040\177-\377\042\050\051\054\056\072-\077\100\133-\135//d;
$ct = lc $ct;

# bug 4298: If at this point we don't have a content-type, assume text/plain
- $ct ||= "text/plain";
+ # also, if the content-type is simply "text" or "text/", assume text/plain
+ if (!$ct || $ct =~ /^text\/?$/) {
+ $ct = "text/plain";
+ }

# Now that the header has been parsed, return the requested information.
# In scalar context, just the MIME type, in array context the