Mailing List Archive

svn commit: rev 6716 - in incubator/spamassassin/trunk: lib/Mail lib/Mail/SpamAssassin t
Author: felicity
Date: Tue Feb 17 22:47:19 2004
New Revision: 6716

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
incubator/spamassassin/trunk/t/mimeparse.t
Log:
make M::SA::parse() not do the full recursive mime tree generation. let M::SA::MsgContainer::find_parts() do it as necessary. this makes check() run at about the same speed, but things like 'spamassassin -d' should run a little faster due to the reduced amount of work that needs to occur.

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm Tue Feb 17 22:47:19 2004
@@ -308,22 +308,28 @@
of the message with 1 line per array element, or a file glob with the
entire contents of the message.

-The procedure used to parse a message is recursive and ends up
-generating a tree of M::SA::MsgContainer objects. parse() will generate
-the parent node of the tree, then pass the body of the message to
-M::SA::MsgParser->parse_body() which begins the recursive process.
+This function will return a base MsgContainer object with just the headers
+being parsed. M::SA::MsgContainer->find_parts() will end up doing a
+full recursive mime parse of the message as necessary. That procedure is
+recursive and ends up generating a tree of M::SA::MsgContainer objects.
+parse() will generate the parent node of the tree, then pass the body of
+the message to M::SA::MsgParser->parse_body() which begins the recursive
+process.

=cut

+# NOTE: This function is allowed (in bad OO form) to modify the
+# MsgContainer object directly as MsgContainer doesn't really have a
+# constructor in the traditional OO way of things.
+
sub parse {
my($self, $message) = @_;
$message ||= \*STDIN;

- dbg("---- MIME PARSER START ----");
-
# protect it from abuse ...
local $_;

+ # Figure out how the message was passed to us, and deal with it.
my @message;
if (ref $message eq 'ARRAY') {
@message = @{$message};
@@ -338,7 +344,7 @@
}

# Generate the main object and parse the appropriate MIME-related headers into it.
- my $msg = Mail::SpamAssassin::MsgContainer->new();
+ my $msg = Mail::SpamAssassin::MsgContainer->new(already_parsed => 0);
my $header = '';
$msg->{'pristine_headers'} = '';

@@ -371,7 +377,8 @@
last if ( $last =~ /^\r?$/m );
}

- # Store the pristine body for later -- store as a copy since @message will get modified below
+ # Store the pristine body for later -- store as a copy since @message
+ # will get modified below
$msg->{'pristine_body'} = join('', @message);

# CRLF -> LF
@@ -379,15 +386,11 @@
s/\r\n/\n/;
}

- # Figure out the boundary
- my ($boundary);
- ($msg->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
- dbg("main message type: ".$msg->{'type'});
-
- # Make the tree
- Mail::SpamAssassin::MsgParser->parse_body( $msg, $msg, $boundary, \@message, 1 );
-
- dbg("---- MIME PARSER END ----");
+ # If the message does need to get parsed, save off a copy of the body
+ # in a format we can easily parse later so we don't have to rip from
+ # pristine_body ...
+ #
+ $msg->{'toparse'} = \@message;

return $msg;
}

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm Tue Feb 17 22:47:19 2004
@@ -50,7 +50,7 @@
sub new {
my $class = shift;
$class = ref($class) || $class;
-# my %opts = @_;
+ my %opts = @_;

my $self = {
headers => {},
@@ -58,18 +58,52 @@
metadata => {},
body_parts => [],
header_order => [],
+ already_parsed => 1,
};

-# # allow callers to set certain options ...
-# foreach ( 'noexit' ) {
-# $self->{$_} = $opts{$_} if ( exists $opts{$_} );
-# }
+ # allow callers to set certain options ...
+ foreach ( 'already_parsed' ) {
+ $self->{$_} = $opts{$_} if ( exists $opts{$_} );
+ }

bless($self,$class);

$self;
}

+=item _do_parse()
+
+Non-Public function which will initiate a MIME part part (generates
+a tree) of the current message. Typically called by find_parts()
+as necessary.
+
+=cut
+
+sub _do_parse {
+ my($self) = @_;
+
+ # If we're called when we don't need to be, then just go ahead and return.
+ return if ($self->{'already_parsed'});
+
+ my $toparse = $self->{'toparse'};
+ delete $self->{'toparse'};
+
+ dbg("---- MIME PARSER START ----");
+
+ # Figure out the boundary
+ my ($boundary);
+ ($self->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($self->header('content-type'));
+ dbg("main message type: ".$self->{'type'});
+
+ # Make the tree
+ Mail::SpamAssassin::MsgParser->parse_body( $self, $self, $boundary, $toparse, 1 );
+ $self->{'already_parsed'} = 1;
+
+ dbg("---- MIME PARSER END ----");
+
+ return;
+}
+
=item find_parts()

=cut
@@ -86,10 +120,23 @@

$onlyleaves = 0 unless defined $onlyleaves;
$recursive = 1 unless defined $recursive;
+
+ # ok, we need to do the parsing now...
+ $self->_do_parse() if (!$self->{'already_parsed'});
+
+ return $self->_find_parts($re, $onlyleaves, $recursive);
+}
+
+# We have 2 functions in find_parts() to optimize out the penalty of
+# 'already_parsed'... It also lets us avoid checking $onlyleaves, $re,
+# and $recursive over and over again.
+#
+sub _find_parts {
+ my ($self, $re, $onlyleaves, $recursive) = @_;
my @ret = ();

# If this object matches, mark it for return.
- my $amialeaf = !exists $self->{'body_parts'};
+ my $amialeaf = $self->is_leaf();

if ( $self->{'type'} =~ /$re/ && (!$onlyleaves || $amialeaf) ) {
push(@ret, $self);
@@ -99,7 +146,7 @@
# This object is a subtree root. Search all children.
foreach my $parts ( @{$self->{'body_parts'}} ) {
# Add the recursive results to our results
- push(@ret, $parts->find_parts($re));
+ push(@ret, $parts->_find_parts($re));
}
}

@@ -183,10 +230,15 @@

=item is_leaf()

+Returns true if the tree node in question is a leaf of the tree (ie:
+has no children of its own). Note: This function may return odd results
+unless the message has been mime parsed via _do_parse()!
+
=cut

sub is_leaf {
- return exists $_[0]->{'raw'};
+ my($self) = @_;
+ return !exists $self->{'body_parts'};
}

=item raw()
@@ -333,6 +385,10 @@
}

=item content_summary()
+
+Returns an array of scalars describing the mime parts of the message.
+Note: This function requires that the message be parsed first via
+_do_parse()!

=cut


Modified: incubator/spamassassin/trunk/t/mimeparse.t
==============================================================================
--- incubator/spamassassin/trunk/t/mimeparse.t (original)
+++ incubator/spamassassin/trunk/t/mimeparse.t Tue Feb 17 22:47:19 2004
@@ -90,6 +90,10 @@
open(INP, $k) || die "Can't find $k:$!";
my $mail = Mail::SpamAssassin->parse(\*INP);
close(INP);
+
+ # We know it's not parsed, so deal with it!
+ $mail->_do_parse();
+
my $res = join("\n",$mail->content_summary());
#print "---\n$res\n---\n";
ok( $res eq shift @{$files{$k}} );