Mailing List Archive

svn commit: rev 6779 - in incubator/spamassassin/trunk/lib/Mail: . SpamAssassin
Author: jm
Date: Thu Feb 19 10:41:47 2004
New Revision: 6779

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgMetadata.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
Log:
some cleanup of last night's metadata code

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm Thu Feb 19 10:41:47 2004
@@ -348,6 +348,11 @@
my $header = '';
$msg->{'pristine_headers'} = '';

+ # inform the node that it's a message root, so that it knows that
+ # it can have stuff that only root nodes have. TODO: IMO, we should
+ # probably just have a subclass of MsgContainer for root nodes!
+ $msg->_set_is_root();
+
# Go through all the headers of the message
while ( my $last = shift @message ) {
# Store the non-modified headers in a scalar

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Thu Feb 19 10:41:47 2004
@@ -2268,17 +2268,18 @@
}

# map of languages that are very often mistaken for another, perhaps with
- # more than 0.02% false positives
+ # more than 0.02% false positives. only used for text < 2048 bytes in
+ # length
my %mistakable = ('sco' => 'en');

# see if any matches are okay
foreach my $match (@matches) {
$match =~ s/\..*//;
- if (exists $mistakable{$match}) {
+ if ($self->{languages_body_len} < 2048 && exists $mistakable{$match}) {
$match = $mistakable{$match};
}
foreach my $language (@languages) {
- if (exists $mistakable{$language}) {
+ if ($self->{languages_body_len} < 2048 && exists $mistakable{$language}) {
$language = $mistakable{$language};
}
if ($match eq $language) {

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm Thu Feb 19 10:41:47 2004
@@ -56,7 +56,6 @@
my $self = {
headers => {},
raw_headers => {},
- meta_strings => {},
body_parts => [],
header_order => [],
already_parsed => 1,
@@ -72,6 +71,23 @@
$self;
}

+=item _set_is_root()
+
+Non-Public function to inform this node that it's the root, and
+can hold stuff that only a root should do.
+
+(TODO: IMO, we should just have a subclass of MsgContainer for
+root nodes.)
+
+=cut
+
+sub _set_is_root {
+ my($self) = @_;
+
+ # create the metadata holder class
+ $self->{metadata} = Mail::SpamAssassin::MsgMetadata->new($self);
+}
+
=item _do_parse()

Non-Public function which will initiate a MIME part part (generates
@@ -589,11 +605,10 @@
sub extract_message_metadata {
my ($self, $main) = @_;

- # do this only once
+ # do this only once per message, it can be expensive
if ($self->{already_extracted_metadata}) { return; }
$self->{already_extracted_metadata} = 1;

- $self->{metadata} = Mail::SpamAssassin::MsgMetadata->new($self);
$self->{metadata}->extract ($self, $main);
}

@@ -605,7 +620,7 @@

sub get_metadata {
my ($self, $hdr) = @_;
- $self->{meta_strings}->{$hdr};
+ $self->{metadata}->{strings}->{$hdr};
}

=item put_metadata($hdr, $text)
@@ -614,7 +629,7 @@

sub put_metadata {
my ($self, $hdr, $text) = @_;
- $self->{meta_strings}->{$hdr} = $text;
+ $self->{metadata}->{strings}->{$hdr} = $text;
}

=item delete_metadata($hdr)
@@ -623,7 +638,7 @@

sub delete_metadata {
my ($self, $hdr) = @_;
- delete $self->{meta_strings}->{$hdr};
+ delete $self->{metadata}->{strings}->{$hdr};
}

=item $str = get_all_metadata()
@@ -634,13 +649,27 @@
my ($self) = @_;

my @ret = ();
- foreach my $key (sort keys %{$self->{meta_strings}}) {
- push (@ret, $key, ": ", $self->{meta_strings}->{$key}, "\n");
+ foreach my $key (sort keys %{$self->{metadata}->{strings}}) {
+ push (@ret, $key, ": ", $self->{metadata}->{strings}->{$key}, "\n");
}
return join ("", @ret);
}

# ---------------------------------------------------------------------------
+
+=item finish_metadata()
+
+Destroys the metadata for this message. Once a message has been
+scanned fully, the metadata is no longer required. Destroying
+this will free up some memory.
+
+=cut
+
+sub finish_metadata {
+ my ($self) = @_;
+ $self->{metadata}->finish();
+ delete $self->{metadata};
+}

=item finish()


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgMetadata.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgMetadata.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgMetadata.pm Thu Feb 19 10:41:47 2004
@@ -24,7 +24,22 @@

=head1 DESCRIPTION

-This module will extract metadata from an email message.
+This class is tasked with extracting "metadata" from messages for use as
+Bayes tokens, fodder for eval tests, or other rules. Metadata is
+supplemental data inferred from the message, like the examples below.
+
+It is held in two forms:
+
+1. as name-value pairs of strings, presented in mail header format. For
+ example, "X-Language" => "en". This is the general form for simple
+ metadata that's useful as Bayes tokens, can be added to marked-up
+ messages using "add_header", etc., such as the trusted-relay inference
+ and language detection.
+
+2. as more complex data structures on the $msg->{metadata} object. This
+ is the form used for metadata like the HTML parse data, which is stored
+ there for access by eval rule code. Because it's not simple strings,
+ it's not added as a Bayes token by default (Bayes needs simple strings).

=head1 PUBLIC METHODS

@@ -48,9 +63,12 @@
sub new {
my ($class, $msg) = @_;
$class = ref($class) || $class;
+
my $self = {
- msg => $msg
+ msg => $msg,
+ strings => { }
};
+
bless($self,$class);
$self;
}
@@ -99,8 +117,12 @@
$body = join ("\n", @{$body});
$body =~ s/^Subject://i;

+ # note body text length, since the check_languages() eval rule also
+ # uses it
+ $self->{languages_body_len} = length($body);
+
# need about 256 bytes for reasonably accurate match (experimentally derived)
- if (length($body) < 256)
+ if ($self->{languages_body_len} < 256)
{
dbg("Message too short for language analysis");
return;

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Thu Feb 19 10:41:47 2004
@@ -196,9 +196,9 @@
$self->{score} += $self->{learned_points};
}

+ # delete temporary storage and memory allocation used during checking
$self->delete_fulltext_tmpfile();

-
# Round the score to 3 decimal places to avoid rounding issues
# We assume required_score to be properly rounded already.
# add 0 to force it back to numeric representation instead of string.
@@ -221,6 +221,7 @@

$report =~ s/\n*$/\n\n/s;
$self->{report} = $report;
+ $self->{msg}->finish_metadata();

$self->{main}->call_plugins ("check_end", { permsgstatus => $self });
}