Mailing List Archive

svn commit: rev 6713 - in incubator/spamassassin/trunk: . lib/Mail lib/Mail/SpamAssassin masses spamd t
Author: felicity
Date: Tue Feb 17 18:41:34 2004
New Revision: 6713

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgParser.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgLearner.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
incubator/spamassassin/trunk/masses/mass-check
incubator/spamassassin/trunk/spamassassin.raw
incubator/spamassassin/trunk/spamd/spamd.raw
incubator/spamassassin/trunk/t/bayesdbm.t
incubator/spamassassin/trunk/t/bayessql.t
incubator/spamassassin/trunk/t/mimeparse.t
incubator/spamassassin/trunk/t/rule_tests.t
Log:
API CHANGE! Moving from MsgParser->parse() to M::SA->parse(), so future changes to how we parse do not mean a change in the backend. Everything else is the same for now.

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm Tue Feb 17 18:41:34 2004
@@ -20,9 +20,8 @@

=head1 SYNOPSIS

- my $mail = Mail::SpamAssassin::MsgParser->parse();
-
my $spamtest = Mail::SpamAssassin->new();
+ my $mail = $spamtest->parse();
my $status = $spamtest->check ($mail);

if ($status->is_spam ()) {
@@ -301,6 +300,100 @@

###########################################################################

+=item parse()
+
+Parse will return a Mail::SpamAssassin::MsgContainer object. To use it,
+simply call C<< Mail::SpamAssassin->parse($msg) >>, where $msg is either undef
+(will use STDIN), a scalar of the entire message, an array reference
+of the message with 1 line per array element, or a file glob with the
+entire contents of the message.
+
+The procedure used to parse a message is recursive and ends up
+generating a tree of M::SA::MsgContainer objects. parse() will generate
+the parent node of the tree, then pass the body of the message to
+M::SA::MsgParser->parse_body() which begins the recursive process.
+
+=cut
+
+sub parse {
+ my($self, $message) = @_;
+ $message ||= \*STDIN;
+
+ dbg("---- MIME PARSER START ----");
+
+ # protect it from abuse ...
+ local $_;
+
+ my @message;
+ if (ref $message eq 'ARRAY') {
+ @message = @{$message};
+ }
+ elsif (ref $message eq 'GLOB') {
+ if (defined fileno $message) {
+ @message = <$message>;
+ }
+ }
+ else {
+ @message = split ( /^/m, $message );
+ }
+
+ # Generate the main object and parse the appropriate MIME-related headers into it.
+ my $msg = Mail::SpamAssassin::MsgContainer->new();
+ my $header = '';
+ $msg->{'pristine_headers'} = '';
+
+ # Go through all the headers of the message
+ while ( my $last = shift @message ) {
+ # Store the non-modified headers in a scalar
+ $msg->{'pristine_headers'} .= $last;
+
+ if ( $last =~ /^From\s/ ) {
+ $msg->{'mbox_sep'} = $last;
+ next;
+ }
+
+ # NB: Really need to figure out special folding rules here!
+ if ( $last =~ /^[ \t]+/ ) { # if its a continuation
+ $header .= $last; # fold continuations
+ next;
+ }
+
+ # Ok, there's a header here, let's go ahead and add it in.
+ if ($header) {
+ my ( $key, $value ) = split ( /:\s*/, $header, 2 );
+ $msg->header( $key, $value );
+ }
+
+ # not a continuation...
+ $header = $last;
+
+ # Ok, we found the header/body blank line ...
+ last if ( $last =~ /^\r?$/m );
+ }
+
+ # Store the pristine body for later -- store as a copy since @message will get modified below
+ $msg->{'pristine_body'} = join('', @message);
+
+ # CRLF -> LF
+ for ( @message ) {
+ s/\r\n/\n/;
+ }
+
+ # Figure out the boundary
+ my ($boundary);
+ ($msg->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
+ dbg("main message type: ".$msg->{'type'});
+
+ # Make the tree
+ Mail::SpamAssassin::MsgParser->parse_body( $msg, $msg, $boundary, \@message, 1 );
+
+ dbg("---- MIME PARSER END ----");
+
+ return $msg;
+}
+
+###########################################################################
+
=item $f->trim_rules ($regexp)

Remove all rules that don't match the given regexp (or are sub-rules of
@@ -600,8 +693,7 @@

sub check_message_text {
my $self = shift;
- my @lines = split (/^/m, $_[0]);
- my $mail_obj = Mail::SpamAssassin::MsgParser->parse (\@lines);
+ my $mail_obj = $self->parse (shift);
return $self->check ($mail_obj);
}

@@ -646,8 +738,7 @@
$self->init(1);

# Let's make sure the markup was removed first ...
- my @msg = split (/^/m, $self->remove_spamassassin_markup($mail));
- $mail = Mail::SpamAssassin::MsgParser->parse (\@msg);
+ $mail = $self->parse ($self->remove_spamassassin_markup($mail));

# learn as spam if enabled
if ( $self->{conf}->{bayes_learn_during_report} ) {
@@ -690,8 +781,7 @@
$self->init(1);

# Let's make sure the markup was removed first ...
- my @msg = split (/^/m, $self->remove_spamassassin_markup($mail));
- $mail = Mail::SpamAssassin::MsgParser->parse (\@msg);
+ $mail = $self->parse ($self->remove_spamassassin_markup($mail));

# learn as nonspam
$self->learn ($mail, undef, 0, 0);
@@ -1106,7 +1196,7 @@
dbg ("ignore: test message to precompile patterns and load modules");
$self->init($use_user_prefs);

- my $mail = Mail::SpamAssassin::MsgParser->parse(\@testmsg);
+ my $mail = $self->parse(\@testmsg);
my $status = Mail::SpamAssassin::PerMsgStatus->new($self, $mail,
{ disable_auto_learning => 1 } );
$status->word_is_in_dictionary("aba"); # load triplets.txt into memory
@@ -1153,7 +1243,7 @@
$self->init(1);
$self->{syntax_errors} += $self->{conf}->{errors};

- my $mail = Mail::SpamAssassin::MsgParser->parse(\@testmsg);
+ my $mail = $self->parse(\@testmsg);
my $status = Mail::SpamAssassin::PerMsgStatus->new($self, $mail,
{ disable_auto_learning => 1 } );
$status->check();

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm Tue Feb 17 18:41:34 2004
@@ -21,7 +21,7 @@

use Mail::SpamAssassin;
use Mail::SpamAssassin::ArchiveIterator;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
use Mail::SpamAssassin::PerMsgLearner;

use Getopt::Long;
@@ -317,13 +317,13 @@
{ die 'HITLIMIT'; }

$messagecount++;
- my $ma = Mail::SpamAssassin::MsgParser->parse ($dataref);
+ my $ma = Mail::SpamAssassin->parse ($dataref);

if ($ma->get_header ("X-Spam-Checker-Version")) {
my $newtext = $spamtest->remove_spamassassin_markup($ma);
my @newtext = split (/^/m, $newtext);
$dataref = \@newtext;
- $ma = Mail::SpamAssassin::MsgParser->parse ($dataref);
+ $ma = Mail::SpamAssassin->parse ($dataref);
}

my $status = $spamtest->learn ($ma, undef, $isspam, $forget);

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgParser.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgParser.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgParser.pm Tue Feb 17 18:41:34 2004
@@ -43,117 +43,22 @@

use constant MAX_BODY_LINE_LENGTH => 2048;

-=item parse()
+=item parse_body()

-Unlike most modules, Mail::SpamAssassin::MsgParser will not return
-an object of the same type, but rather a Mail::SpamAssassin::MsgContainer
-object. To use it, simply call
-C<Mail::SpamAssassin::MsgParser->parse($msg)>, where $msg is either
-a scalar, an array reference, or a glob, with the entire contents
-of the mesage.
-
-The procedure used to parse a message is recursive and ends up generating
-a tree of M::SA::MsgContainer objects. parse() will generate the parent node
-of the tree, then pass the body of the message to _parse_body() which begins
-the recursive process.
-
-=cut
-
-sub parse {
- my($self,$message) = @_;
- $message ||= \*STDIN;
-
- dbg("---- MIME PARSER START ----");
-
- # protect it from abuse ...
- local $_;
-
- my @message;
- if (ref $message eq 'ARRAY') {
- @message = @{$message};
- }
- elsif (ref $message eq 'GLOB') {
- if (defined fileno $message) {
- @message = <$message>;
- }
- }
- else {
- @message = split ( /^/m, $message );
- }
-
- # Generate the main object and parse the appropriate MIME-related headers into it.
- my $msg = Mail::SpamAssassin::MsgContainer->new();
- my $header = '';
- $msg->{'pristine_headers'} = '';
-
- # Go through all the headers of the message
- while ( my $last = shift @message ) {
- # Store the non-modified headers in a scalar
- $msg->{'pristine_headers'} .= $last;
-
- if ( $last =~ /^From\s/ ) {
- $msg->{'mbox_sep'} = $last;
- next;
- }
-
- # NB: Really need to figure out special folding rules here!
- if ( $last =~ /^[ \t]+/ ) { # if its a continuation
- $header .= $last; # fold continuations
- next;
- }
-
- # Ok, there's a header here, let's go ahead and add it in.
- if ($header) {
- my ( $key, $value ) = split ( /:\s*/, $header, 2 );
- $msg->header( $key, $value );
- }
-
- # not a continuation...
- $header = $last;
-
- # Ok, we found the header/body blank line ...
- last if ( $last =~ /^\r?$/m );
- }
-
- # Store the pristine body for later -- store as a copy since @message will get modified below
- $msg->{'pristine_body'} = join('', @message);
-
- # CRLF -> LF
- for ( @message ) {
- s/\r\n/\n/;
- }
-
- # Figure out the boundary
- my ($boundary);
- ($msg->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
- dbg("main message type: ".$msg->{'type'});
-
- # Make the tree
- $self->_parse_body( $msg, $msg, $boundary, \@message, 1 );
-
- dbg("---- MIME PARSER END ----");
-
- return $msg;
-}
-
-=head1 NON-PUBLIC METHODS
-
-=item _parse_body()
-
-_parse_body() passes the body part that was passed in onto the
+parse_body() passes the body part that was passed in onto the
correct part parser, either _parse_multipart() for multipart/* parts,
or _parse_normal() for everything else. Multipart sections become the
root of sub-trees, while everything else becomes a leaf in the tree.

-For multipart messages, the first call to _parse_body() doesn't create a
+For multipart messages, the first call to parse_body() doesn't create a
new sub-tree and just uses the parent node to contain children. All other
-calls to _parse_body() will cause a new sub-tree root to be created and
+calls to parse_body() will cause a new sub-tree root to be created and
children will exist underneath that root. (this is just so the tree
doesn't have a root node which points at the actual root node ...)

=cut

-sub _parse_body {
+sub parse_body {
my($self, $msg, $_msg, $boundary, $body, $initial) = @_;

# Figure out the simple content-type, or set it to text/plain
@@ -182,9 +87,11 @@
}
}

+=head1 NON-PUBLIC METHODS
+
=item _parse_multipart()

-Generate a root node, and for each child part call _parse_body()
+Generate a root node, and for each child part call parse_body()
to generate the tree.

=cut
@@ -237,7 +144,7 @@
($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
$p_boundary ||= $boundary;
dbg("found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));
- $self->_parse_body( $msg, $part_msg, $p_boundary, $part_array, 0 );
+ $self->parse_body( $msg, $part_msg, $p_boundary, $part_array, 0 );
}

last if (defined $boundary && $line =~ /^\-\-\Q${boundary}\E\-\-$/);

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgLearner.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgLearner.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgLearner.pm Tue Feb 17 18:41:34 2004
@@ -24,7 +24,7 @@
'rules_filename' => '/etc/spamassassin.rules',
'userprefs_filename' => $ENV{HOME}.'/.spamassassin.cf'
});
- my $mail = Mail::SpamAssassin::MsgParser->parse();
+ my $mail = $spamtest->parse();

my $status = $spamtest->learn ($mail);
...

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Tue Feb 17 18:41:34 2004
@@ -24,7 +24,7 @@
'rules_filename' => '/etc/spamassassin.rules',
'userprefs_filename' => $ENV{HOME}.'/.spamassassin.cf'
});
- my $mail = Mail::SpamAssassin::MsgParser->parse();
+ my $mail = $spamtest->parse();

my $status = $spamtest->check ($mail);
if ($status->is_spam()) {
@@ -57,7 +57,7 @@
use Mail::SpamAssassin::Conf;
use Mail::SpamAssassin::Received;
use Mail::SpamAssassin::Util;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;

use constant MAX_BODY_LINE_LENGTH => 2048;


Modified: incubator/spamassassin/trunk/masses/mass-check
==============================================================================
--- incubator/spamassassin/trunk/masses/mass-check (original)
+++ incubator/spamassassin/trunk/masses/mass-check Tue Feb 17 18:41:34 2004
@@ -77,7 +77,7 @@
eval "use bytes";
use Mail::SpamAssassin::ArchiveIterator;
use Mail::SpamAssassin;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
use Getopt::Long;
use POSIX qw(strftime);
use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
@@ -242,12 +242,12 @@
my ($id, $time, $dataref) = @_;
my $out;

- my $ma = Mail::SpamAssassin::MsgParser->parse($dataref);
+ my $ma = $spamtest->parse($dataref);

# remove SpamAssassin markup, if present and the mail was spam
$_ = $ma->get_header ("X-Spam-Status");
if (defined($_) && /^Yes, hits=/) {
- $ma = Mail::SpamAssassin::MsgParser->parse ($spamtest->remove_spamassassin_markup($ma));
+ $ma = $spamtest->parse ($spamtest->remove_spamassassin_markup($ma));
}

my $status = $spamtest->check($ma);

Modified: incubator/spamassassin/trunk/spamassassin.raw
==============================================================================
--- incubator/spamassassin/trunk/spamassassin.raw (original)
+++ incubator/spamassassin/trunk/spamassassin.raw Tue Feb 17 18:41:34 2004
@@ -80,7 +80,7 @@

eval {
require Mail::SpamAssassin;
- require Mail::SpamAssassin::MsgParser;
+ require Mail::SpamAssassin::MsgContainer;

# gnu_getopt is not available in Getopt::Long 2.24, see bug 732
# gnu_compat neither.
@@ -168,7 +168,7 @@
# incoming message
#
if (!$doing_address_only_whitelisting) {
- $mail = Mail::SpamAssassin::MsgParser->parse ();
+ $mail = $spamtest->parse ();
}

# handle removing reports
@@ -183,7 +183,7 @@
# go ahead and remove the markup, then fake that the clean version
# was what was sent in
#
- $mail = Mail::SpamAssassin::MsgParser->parse ($spamtest->remove_spamassassin_markup ($mail));
+ $mail = $spamtest->parse ($spamtest->remove_spamassassin_markup ($mail));
}
}


Modified: incubator/spamassassin/trunk/spamd/spamd.raw
==============================================================================
--- incubator/spamassassin/trunk/spamd/spamd.raw (original)
+++ incubator/spamassassin/trunk/spamd/spamd.raw Tue Feb 17 18:41:34 2004
@@ -38,7 +38,7 @@
use IO::Pipe;

use Mail::SpamAssassin;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
use Mail::SpamAssassin::NetSet;

use Getopt::Long;
@@ -888,7 +888,7 @@
"."
);

- my $mail = Mail::SpamAssassin::MsgParser->parse (\@msglines);
+ my $mail = $spamtest->parse (\@msglines);

# Check length if we're supposed to
if($expected_length && ($actual_length != $expected_length)) {

Modified: incubator/spamassassin/trunk/t/bayesdbm.t
==============================================================================
--- incubator/spamassassin/trunk/t/bayesdbm.t (original)
+++ incubator/spamassassin/trunk/t/bayesdbm.t Tue Feb 17 18:41:34 2004
@@ -26,7 +26,7 @@
");

use Mail::SpamAssassin;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;

my $sa = create_saobj();

@@ -54,7 +54,7 @@
push(@msg, $line);
}

-my $mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+my $mail = $sa->parse( \@msg );

ok($mail);

@@ -215,7 +215,7 @@
push(@msg, $line);
}

-$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+$mail = $sa->parse( \@msg );

$body = $sa->{bayes_scanner}->get_body_from_msg($mail);

@@ -246,7 +246,7 @@
push(@msg, $line);
}

-$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+$mail = $sa->parse( \@msg );

$body = $sa->{bayes_scanner}->get_body_from_msg($mail);


Modified: incubator/spamassassin/trunk/t/bayessql.t
==============================================================================
--- incubator/spamassassin/trunk/t/bayessql.t (original)
+++ incubator/spamassassin/trunk/t/bayessql.t Tue Feb 17 18:41:34 2004
@@ -59,7 +59,7 @@
");

use Mail::SpamAssassin;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;

my $sa = create_saobj();

@@ -87,7 +87,7 @@
push(@msg, $line);
}

-my $mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+my $mail = $sa->parse( \@msg );

ok($mail);

@@ -220,7 +220,7 @@
push(@msg, $line);
}

-$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+$mail = $sa->parse( \@msg );

$body = $sa->{bayes_scanner}->get_body_from_msg($mail);

@@ -250,7 +250,7 @@
push(@msg, $line);
}

-$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+$mail = $sa->parse( \@msg );

$body = $sa->{bayes_scanner}->get_body_from_msg($mail);


Modified: incubator/spamassassin/trunk/t/mimeparse.t
==============================================================================
--- incubator/spamassassin/trunk/t/mimeparse.t (original)
+++ incubator/spamassassin/trunk/t/mimeparse.t Tue Feb 17 18:41:34 2004
@@ -17,7 +17,7 @@

use strict;
use Test;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
use Mail::SpamAssassin::SHA1;

my %files = (
@@ -88,7 +88,7 @@

foreach my $k ( sort keys %files ) {
open(INP, $k) || die "Can't find $k:$!";
- my $mail = Mail::SpamAssassin::MsgParser->parse(\*INP);
+ my $mail = Mail::SpamAssassin->parse(\*INP);
close(INP);
my $res = join("\n",$mail->content_summary());
#print "---\n$res\n---\n";

Modified: incubator/spamassassin/trunk/t/rule_tests.t
==============================================================================
--- incubator/spamassassin/trunk/t/rule_tests.t (original)
+++ incubator/spamassassin/trunk/t/rule_tests.t Tue Feb 17 18:41:34 2004
@@ -18,7 +18,7 @@
use strict;
use Test;
use Mail::SpamAssassin;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
use vars qw($num_tests);

$num_tests = 1;
@@ -62,7 +62,7 @@
my $test_string = $sa->{conf}->{head_tests}->{$symbol} || $sa->{conf}->{head_evals}->{$symbol};
my ($header_name) = $test_string =~ /^(\S+)/;
# warn("got header name: $header_name - setting to: $string\n");
- $mail = Mail::SpamAssassin::MsgParser->parse(["${header_name}: $string\n","\n","\n"]);
+ $mail = $sa->parse(["${header_name}: $string\n","\n","\n"]);
}
else {
# warn("setting body: $string\n");
@@ -73,7 +73,7 @@
if ( $string =~ /<[^>]*>/ ) {
$type = "text/html";
}
- $mail = Mail::SpamAssassin::MsgParser->parse(["Content-type: $type\n","\n","$string\n"]);
+ $mail = $sa->parse(["Content-type: $type\n","\n","$string\n"]);
}

my $msg = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);