Mailing List Archive

svn commit: r434016 - in /spamassassin/branches/3.1: MANIFEST lib/Mail/SpamAssassin/AICache.pm lib/Mail/SpamAssassin/ArchiveIterator.pm
Author: jm
Date: Wed Aug 23 04:50:18 2006
New Revision: 434016

URL: http://svn.apache.org/viewvc?rev=434016&view=rev
Log:
bug 5060: backport ArchiveIterator changes for rule-QA system, namely: the AICache caching subsystem for faster mass-checks, support for combined --head/--tail semantics, more verbose 'showdots' to avoid buildbot timeouts, and a new intra-child IPC mechanism for multi-child mass-checks

Added:
spamassassin/branches/3.1/lib/Mail/SpamAssassin/AICache.pm
Modified:
spamassassin/branches/3.1/MANIFEST
spamassassin/branches/3.1/lib/Mail/SpamAssassin/ArchiveIterator.pm

Modified: spamassassin/branches/3.1/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.1/MANIFEST?rev=434016&r1=434015&r2=434016&view=diff
==============================================================================
--- spamassassin/branches/3.1/MANIFEST (original)
+++ spamassassin/branches/3.1/MANIFEST Wed Aug 23 04:50:18 2006
@@ -28,6 +28,7 @@
ldap/sa_test.ldif
lib/Mail/SpamAssassin.pm
lib/Mail/SpamAssassin/ArchiveIterator.pm
+lib/Mail/SpamAssassin/AICache.pm
lib/Mail/SpamAssassin/AutoWhitelist.pm
lib/Mail/SpamAssassin/Bayes.pm
lib/Mail/SpamAssassin/Bayes/CombineChi.pm

Added: spamassassin/branches/3.1/lib/Mail/SpamAssassin/AICache.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.1/lib/Mail/SpamAssassin/AICache.pm?rev=434016&view=auto
==============================================================================
--- spamassassin/branches/3.1/lib/Mail/SpamAssassin/AICache.pm (added)
+++ spamassassin/branches/3.1/lib/Mail/SpamAssassin/AICache.pm Wed Aug 23 04:50:18 2006
@@ -0,0 +1,179 @@
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::AICache - provide access to cached information for
+ArchiveIterator
+
+=head1 SYNOPSIS
+
+=head1 DESCRIPTION
+
+This module allows ArchiveIterator to use cached atime information instead of
+having to read every message separately.
+
+=head1 PUBLIC METHODS
+
+=over 4
+
+=cut
+
+package Mail::SpamAssassin::AICache;
+
+use File::Spec;
+use File::Path;
+use File::Basename;
+
+use strict;
+use warnings;
+
+=item new()
+
+Generates a new cache object.
+
+=cut
+
+sub new {
+ my $class = shift;
+ $class = ref($class) || $class;
+
+ my $self = shift;
+ if (!defined $self) { $self = {}; }
+
+ $self->{cache} = {};
+ $self->{dirty} = 0;
+ $self->{prefix} ||= '/';
+
+ my $use_cache = 1;
+
+ # be sure to use rel2abs() here, since otherwise relative paths
+ # are broken by the prefix stuff
+ if ($self->{type} eq 'dir') {
+ $self->{cache_file} = File::Spec->catdir(
+ $self->{prefix},
+ File::Spec->rel2abs($self->{path}),
+ '.spamassassin_cache');
+
+ $self->{cache_mtime} = (stat($self->{cache_file}))[9] || 0;
+ }
+ else {
+ my @split = File::Spec->splitpath($self->{path});
+ $self->{cache_file} = File::Spec->catdir(
+ $self->{prefix},
+ File::Spec->rel2abs($split[1]),
+ join('_', '.spamassassin_cache', $self->{type}, $split[2]));
+
+ $self->{cache_mtime} = (stat($self->{cache_file}))[9] || 0;
+
+ # for mbox and mbx, verify whether mtime on cache file is >= mtime of
+ # messages file. if it is, use it, otherwise don't.
+ if ((stat($self->{path}))[9] > $self->{cache_mtime}) {
+ $use_cache = 0;
+ }
+ }
+ $self->{cache_file} = File::Spec->canonpath($self->{cache_file});
+
+ # go ahead and read in the cache information
+ if ($use_cache && open(CACHE, $self->{cache_file})) {
+ while(defined($_=<CACHE>)) {
+ my($k,$v) = split(/\t/, $_);
+ next unless (defined $k && defined $v);
+ $self->{cache}->{$k} = $v;
+ }
+ close(CACHE);
+ }
+
+ bless($self,$class);
+ $self;
+}
+
+sub count {
+ my ($self) = @_;
+ return keys %{$self->{cache}};
+}
+
+sub check {
+ my ($self, $name) = @_;
+
+ return $self->{cache} unless $name;
+
+ return if ($self->{type} eq 'dir' && (stat($name))[9] > $self->{cache_mtime});
+
+ $name = $self->canon($name);
+ return $self->{cache}->{$name};
+}
+
+sub update {
+ my ($self, $name, $date) = @_;
+
+ return unless $name;
+ $name = $self->canon($name);
+
+ # if information is different than cached version, set dirty and update
+ if (!exists $self->{cache}->{$name} || $self->{cache}->{$name} != $date) {
+ $self->{cache}->{$name} = $date;
+ $self->{dirty} = 1;
+ }
+}
+
+sub finish {
+ my ($self) = @_;
+
+ # Cache is dirty, so write out new file
+ if ($self->{dirty})
+ {
+ # create enclosing dir tree, if required
+ eval {
+ mkpath(dirname($self->{cache_file}));
+ };
+ if ($@) {
+ warn "Can't mkpath for AI cache file (".$self->{cache_file}."): $@ $!";
+ }
+
+ if (open(CACHE, ">" . $self->{cache_file})) {
+ while(my($k,$v) = each %{$self->{cache}}) {
+ print CACHE "$k\t$v\n";
+ }
+ close(CACHE);
+ }
+ else {
+ warn "Can't write AI cache file (".$self->{cache_file}."): $!";
+ }
+ }
+
+ return undef;
+}
+
+sub canon {
+ my ($self, $name) = @_;
+
+ if ($self->{type} eq 'dir') {
+ # strip off dirs, just look at filename
+ $name = (File::Spec->splitpath($name))[2];
+ }
+ else {
+ # we may get in a "/path/mbox.offset", so trim to just offset as necessary
+ $name =~ s/^.+\.(\d+)$/$1/;
+ }
+ return $name;
+}
+
+# ---------------------------------------------------------------------------
+
+1;
+__END__

Modified: spamassassin/branches/3.1/lib/Mail/SpamAssassin/ArchiveIterator.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.1/lib/Mail/SpamAssassin/ArchiveIterator.pm?rev=434016&r1=434015&r2=434016&view=diff
==============================================================================
--- spamassassin/branches/3.1/lib/Mail/SpamAssassin/ArchiveIterator.pm (original)
+++ spamassassin/branches/3.1/lib/Mail/SpamAssassin/ArchiveIterator.pm Wed Aug 23 04:50:18 2006
@@ -28,12 +28,14 @@
use Mail::SpamAssassin::Util;
use Mail::SpamAssassin::Constants qw(:sa);
use Mail::SpamAssassin::Logger;
+use Mail::SpamAssassin::AICache;

use constant BIG_BYTES => 256*1024; # 256k is a big email
use constant BIG_LINES => BIG_BYTES/65; # 65 bytes/line is a good approximation

use vars qw {
$MESSAGES
+ $AICache
};

my @ISA = qw($MESSAGES);
@@ -46,9 +48,10 @@

my $iter = new Mail::SpamAssassin::ArchiveIterator(
{
- 'opt_j' => 0,
- 'opt_n' => 1,
- 'opt_all' => 1,
+ 'opt_j' => 0,
+ 'opt_n' => 1,
+ 'opt_all' => 1,
+ 'opt_cache' => 1,
}
);

@@ -141,6 +144,10 @@
Only use the last N ham and N spam (or if the value is -N, only use the last
N total messages regardless of class).

+If both C<opt_head> and C<opt_tail> are specified, then the C<opt_head> value
+specifies a subset of the C<opt_tail> selection to use; in other words, the
+C<opt_tail> splice is applied first.
+
=item opt_before

Only use messages which are received after the given time_t value.
@@ -155,7 +162,20 @@
=item opt_want_date

Set to 1 (default) if you want the received date to be filled in
-in the C<wanted_sub> callback below.
+in the C<wanted_sub> callback below. Set this to 0 to avoid this;
+it's a good idea to set this to 0 if you can, as it imposes a performance
+hit.
+
+=item opt_cache
+
+Set to 0 (default) if you don't want to use cached information to help speed
+up ArchiveIterator. Set to 1 to enable.
+
+=item opt_cachedir
+
+Set to the path of a directory where you wish to store cached information for
+C<opt_cache>, if you don't want to mix them with the input files (as is the
+default). The directory must be both readable and writable.

=item wanted_sub

@@ -178,6 +198,12 @@
Note that if C<opt_want_date> is set to 0, the received date scalar will be
undefined.

+=item scan_progress_sub
+
+Reference to a subroutine which will be called intermittently during
+the 'scan' phase of the mass-check. No guarantees are made as to
+how frequently this may happen, mind you.
+
=back

=cut
@@ -193,6 +219,7 @@
$self->{opt_head} = 0 unless (defined $self->{opt_head});
$self->{opt_tail} = 0 unless (defined $self->{opt_tail});
$self->{opt_want_date} = 1 unless (defined $self->{opt_want_date});
+ $self->{opt_cache} = 0 unless (defined $self->{opt_cache});

# If any of these options are set, we need to figure out the message's
# receive date at scan time. opt_n == 0, opt_after, opt_before
@@ -251,8 +278,9 @@
files are individual messages, C<file> a file with a single message,
C<mbox> an mbox formatted file, or C<mbx> for an mbx formatted directory.

-C<detect> can also be used; assumes C<file> for STDIN and anything that is not
-a directory, or C<directory> otherwise.
+C<detect> can also be used. This assumes C<mbox> for any file whose path
+contains the pattern C</\.mbox/i>, C<file> for STDIN and anything that is
+not a directory, or C<directory> otherwise.

=item raw_location

@@ -290,8 +318,9 @@
# forking model (generally mass-check), avoid extended memory usage
else {
my $tmpf;
- ($tmpf, $self->{messageh}) = Mail::SpamAssassin::Util::secure_tmpfile();
- unlink $tmpf;
+ ($tmpf, $self->{messageh}) = Mail::SpamAssassin::Util::secure_tmpfile()
+ or die 'archive-iterator: failed to create temp file';
+ unlink $tmpf or die "archive-iterator: unlink '$tmpf': $!";
undef $tmpf;

# forked child process scans messages
@@ -346,66 +375,53 @@
# feed childen, make them work for it, repeat
while ($select->count()) {
foreach my $socket ($select->can_read()) {
- my $result = '';
- my $line;
- while ($line = readline $socket) {
- if ($line =~ /^RESULT (.+)$/) {
- my ($date,$class,$type) = run_index_unpack($1);
- #warn ">> RESULT: $class, $type, $date\n";
-
- if (defined $self->{opt_restart} &&
- ($total_count % $self->{opt_restart}) == 0)
- {
- $needs_restart = 1;
- }
-
- # if messages remain, and we don't need to restart, send message
- if (($MESSAGES > $total_count) && !$needs_restart) {
- print { $socket } $self->next_message() . "\n";
- $total_count++;
- #warn ">> recv: $MESSAGES $total_count\n";
- }
- else {
- # stop listening on this child since we're done with it
- #warn ">> removeresult: $needs_restart $MESSAGES $total_count\n";
- $select->remove($socket);
- }
-
- # deal with the result we received
- if ($result) {
- chop $result; # need to chop the \n before RESULT
- &{$self->{result_sub}}($class, $result, $date);
- }
+ my $line = $self->read_line($socket);
+
+ # some error happened during the read!
+ if (!defined $line) {
+ $needs_restart = 1;
+ warn "archive-iterator: readline failed, attempting to recover\n";
+ $select->remove($socket);
+ }
+ elsif ($line =~ /^([^\0]+)\0RESULT (.+)$/s) {
+ my $result = $1;
+ my ($date,$class,$type) = index_unpack($2);
+ #warn ">> RESULT: $class, $type, $date\n";

- last; # this will avoid the read for this client
+ if (defined $self->{opt_restart} && ($total_count % $self->{opt_restart}) == 0) {
+ $needs_restart = 1;
}
- elsif ($line eq "START\n") {
- if ($MESSAGES > $total_count) {
- # we still have messages, send one to child
- print { $socket } $self->next_message() . "\n";
- $total_count++;
- #warn ">> new: $MESSAGES $total_count\n";
- }
- else {
- # no more messages, so stop listening on this child
- #warn ">> removestart: $needs_restart $MESSAGES $total_count\n";
- $select->remove($socket);
- }

- last; # this will avoid the read for this client
+ # if messages remain, and we don't need to restart, send message
+ if (($MESSAGES > $total_count) && !$needs_restart) {
+ $self->send_line($socket, $self->next_message());
+ $total_count++;
+ #warn ">> recv: $MESSAGES $total_count\n";
}
else {
- # result line, remember it
- $result .= $line;
+ # stop listening on this child since we're done with it
+ #warn ">> removeresult: $needs_restart $MESSAGES $total_count\n";
+ $select->remove($socket);
}
- }

- # some error happened during the read!
- if (!defined $line || !$line) {
- $needs_restart = 1;
- warn "archive-iterator: readline failed, attempting to recover\n";
- $select->remove($socket);
- }
+ # deal with the result we received
+ if ($result) {
+ &{$self->{result_sub}}($class, $result, $date);
+ }
+ }
+ elsif ($line eq "START") {
+ if ($MESSAGES > $total_count) {
+ # we still have messages, send one to child
+ $self->send_line($socket, $self->next_message());
+ $total_count++;
+ #warn ">> new: $MESSAGES $total_count\n";
+ }
+ else {
+ # no more messages, so stop listening on this child
+ #warn ">> removestart: $needs_restart $MESSAGES $total_count\n";
+ $select->remove($socket);
+ }
+ }
}

#warn ">> out of loop, $MESSAGES $total_count $needs_restart ".$select->count()."\n";
@@ -442,7 +458,7 @@
sub run_message {
my ($self, $msg) = @_;

- my ($date, $class, $format, $mail) = run_index_unpack($msg);
+ my ($date, $class, $format, $mail) = index_unpack($msg);

if ($format eq 'f') {
return $self->run_file($class, $format, $mail, $date);
@@ -470,18 +486,17 @@
return;
}
my @msg;
- my $header = '';
+ my $header;
while (<INPUT>) {
- if (!$header && /^\s*$/) {
- $header = join('', @msg);
- }
-
push(@msg, $_);
+ if (!defined $header && /^\s*$/) {
+ $header = $#msg;
+ }
}
close INPUT;

if ($date == AI_TIME_UNKNOWN && $self->{determine_receive_date}) {
- $date = Mail::SpamAssassin::Util::receive_date($header);
+ $date = Mail::SpamAssassin::Util::receive_date(join('', splice(@msg, 0, $header)));
}

return($class, $format, $date, $where, &{$self->{wanted_sub}}($class, $where, $date, \@msg, $format));
@@ -492,20 +507,15 @@

my ($file, $offset) = ($where =~ m/(.*)\.(\d+)$/);
my @msg;
- my $header = '';
+ my $header;
if (!mail_open($file)) {
$self->{access_problem} = 1;
return;
}
seek(INPUT,$offset,0);
- my $past = 0;
while (<INPUT>) {
- if ($past) {
- last if substr($_,0,5) eq "From ";
- }
- else {
- $past = 1;
- }
+ last if (substr($_,0,5) eq "From " && @msg);
+ push (@msg, $_);

# skip too-big mails
if (! $self->{opt_all} && @msg > BIG_LINES) {
@@ -514,16 +524,14 @@
return;
}

- if (!$header && /^\s*$/) {
- $header = join('', @msg);
+ if (!defined $header && /^\s*$/) {
+ $header = $#msg;
}
-
- push (@msg, $_);
}
close INPUT;

- if ($date == AI_TIME_UNKNOWN) {
- $date = Mail::SpamAssassin::Util::receive_date($header);
+ if ($date == AI_TIME_UNKNOWN && $self->{determine_receive_date}) {
+ $date = Mail::SpamAssassin::Util::receive_date(join('', splice(@msg, 0, $header)));
}

return($class, $format, $date, $where, &{$self->{wanted_sub}}($class, $where, $date, \@msg, $format));
@@ -534,17 +542,19 @@

my ($file, $offset) = ($where =~ m/(.*)\.(\d+)$/);
my @msg;
- my $header = '';
+ my $header;

if (!mail_open($file)) {
$self->{access_problem} = 1;
return;
}
+
seek(INPUT, $offset, 0);

while (<INPUT>) {
last if ($_ =~ MBX_SEPARATOR);
-
+ push (@msg, $_);
+
# skip mails that are too big
if (! $self->{opt_all} && @msg > BIG_LINES) {
info("archive-iterator: skipping large message\n");
@@ -552,16 +562,14 @@
return;
}

- if (!$header && /^\s*$/) {
- $header = join('', @msg);
+ if (!defined $header && /^\s*$/) {
+ $header = $#msg;
}
-
- push (@msg, $_);
}
close INPUT;

- if ($date == AI_TIME_UNKNOWN) {
- $date = Mail::SpamAssassin::Util::receive_date($header);
+ if ($date == AI_TIME_UNKNOWN && $self->{determine_receive_date}) {
+ $date = Mail::SpamAssassin::Util::receive_date(join('', splice(@msg, 0, $header)));
}

return($class, $format, $date, $where, &{$self->{wanted_sub}}($class, $where, $date, \@msg, $format));
@@ -573,8 +581,7 @@

sub next_message {
my ($self) = @_;
- my $line = readline $self->{messageh};
- chomp $line if defined $line;
+ my $line = $self->read_line($self->{messageh});
return $line;
}

@@ -613,11 +620,9 @@
close $child->[$i];
select($parent);
$| = 1; # print to parent by default, turn off buffering
- print "START\n";
- while ($line = readline $parent) {
- chomp $line;
+ $self->send_line($parent,"START");
+ while ($line = $self->read_line($parent)) {
if ($line eq "exit") {
- print "END\n";
close $parent;
exit;
}
@@ -630,10 +635,10 @@
# the packed version if possible ... use defined for date since
# it could == 0.
if (!$self->{determine_receive_date} && $class && $format && defined $date && $where) {
- $line = run_index_pack($date, $class, $format, $where);
+ $line = index_pack($date, $class, $format, $where);
}

- print "$result\nRESULT $line\n";
+ $self->send_line($parent,"$result\0RESULT $line");
}
exit;
}
@@ -656,8 +661,7 @@

for (my $i = 0; $i < $count; $i++) {
#warn "debug: killing child $i (pid ",$pid->[$i],")\n";
- print { $socket->[$i] } "exit\n"; # tell the child to die.
- my $line = readline $socket->[$i]; # read its END statement.
+ $self->send_line($socket->[$i],"exit"); # tell the child to die.
close $socket->[$i];
waitpid($pid->[$i], 0); # wait for the signal ...
}
@@ -665,17 +669,34 @@

############################################################################

-# 0 850852128 atime
-# 1 h class
-# 2 m format
-# 3 ./ham/goodmsgs.0 path
+# four bytes in network/vax format (little endian) as length of message
+# the rest is the actual message

-sub run_index_pack {
- return join("\000", @_);
+sub read_line {
+ my($self, $fd) = @_;
+ my($length,$msg);
+
+ # read in the 4 byte length and unpack
+ sysread($fd, $length, 4);
+ $length = unpack("V", $length);
+# warn "<< $$ $length\n";
+ return unless $length;
+
+ # read in the rest of the single message
+ sysread($fd, $msg, $length);
+# warn "<< $$ $msg\n";
+ return $msg;
}

-sub run_index_unpack {
- return split(/\000/, $_[0]);
+sub send_line {
+ my $self = shift;
+ my $fd = shift;
+
+ foreach ( @_ ) {
+ my $length = pack("V", length $_);
+# warn ">> $$ ".length($_)." $_\n";
+ syswrite($fd, $length . $_);
+ }
}

############################################################################
@@ -716,9 +737,13 @@

if ($format eq 'detect') {
# detect the format
- if ($location eq '-' || !(-d $location)) {
+ if (!-d $location && $location =~ /\.mbox/i) {
+ # filename indicates mbox
+ $method = \&scan_mailbox;
+ }
+ elsif ($location eq '-' || !(-d $location)) {
# stdin is considered a file if not passed as mbox
- $method = \&scan_file;
+ $method = \&scan_file;
}
else {
# it's a directory
@@ -754,14 +779,14 @@
# OPT_N == 1 means don't bother sorting on message receive date

# head or tail > 0 means crop each list
- if ($self->{opt_head} > 0) {
- splice(@{$self->{s}}, $self->{opt_head});
- splice(@{$self->{h}}, $self->{opt_head});
- }
if ($self->{opt_tail} > 0) {
splice(@{$self->{s}}, 0, -$self->{opt_tail});
splice(@{$self->{h}}, 0, -$self->{opt_tail});
}
+ if ($self->{opt_head} > 0) {
+ splice(@{$self->{s}}, min ($self->{opt_head}, scalar @{$self->{s}}));
+ splice(@{$self->{h}}, min ($self->{opt_head}, scalar @{$self->{h}}));
+ }

@messages = ( @{$self->{s}}, @{$self->{h}} );
undef $self->{s};
@@ -777,14 +802,14 @@
undef $self->{h};

# head or tail > 0 means crop each list
- if ($self->{opt_head} > 0) {
- splice(@s, $self->{opt_head});
- splice(@h, $self->{opt_head});
- }
if ($self->{opt_tail} > 0) {
splice(@s, 0, -$self->{opt_tail});
splice(@h, 0, -$self->{opt_tail});
}
+ if ($self->{opt_head} > 0) {
+ splice(@s, min ($self->{opt_head}, scalar @s));
+ splice(@h, min ($self->{opt_head}, scalar @h));
+ }

# interleave ordered spam and ham
if (@s && @h) {
@@ -798,23 +823,16 @@
}

# head or tail < 0 means crop the total list, negate the value appropriately
- if ($self->{opt_head} < 0) {
- splice(@messages, -$self->{opt_head});
- }
if ($self->{opt_tail} < 0) {
splice(@messages, 0, $self->{opt_tail});
}
-
- # Convert scan index format to run index format
- # TODO: figure out a better scan index format which doesn't include newlines
- # so readline() works (or replace readline with something else ...?)
- foreach (@messages) {
- $_ = run_index_pack(scan_index_unpack($_));
+ if ($self->{opt_head} < 0) {
+ splice(@messages, -$self->{opt_head});
}

# Dump out the messages to the temp file if we're using one
if (defined $fh) {
- print { $fh } map { "$_\n" } scalar(@messages), @messages;
+ $self->send_line($fh, scalar(@messages), @messages);
return;
}

@@ -870,11 +888,11 @@

# put the date in first, big-endian packed format
# this format lets cmp easily sort by date, then class, format, and path.
-sub scan_index_pack {
+sub index_pack {
return pack("NAAA*", @_);
}

-sub scan_index_unpack {
+sub index_unpack {
return unpack("NAAA*", $_[0]);
}

@@ -900,35 +918,52 @@
@files = grep { -f } map { "$folder/$_" } @files;

if (!@files) {
- warn "archive-iterator: readdir found no mail in '$folder' directory\n";
+ # this is not a problem; no need to warn about it
+ # warn "archive-iterator: readdir found no mail in '$folder' directory\n";
return;
}

+ $self->create_cache('dir', $folder);
+
foreach my $mail (@files) {
$self->scan_file($class, $mail);
}
+
+ if (defined $AICache) {
+ $AICache = $AICache->finish();
+ }
}

sub scan_file {
my ($self, $class, $mail) = @_;

+ $self->bump_scan_progress();
if (!$self->{determine_receive_date}) {
- push(@{$self->{$class}}, scan_index_pack(AI_TIME_UNKNOWN, $class, "f", $mail));
+ push(@{$self->{$class}}, index_pack(AI_TIME_UNKNOWN, $class, "f", $mail));
return;
}
- my $header;
- if (!mail_open($mail)) {
- $self->{access_problem} = 1;
- return;
- }
- while (<INPUT>) {
- last if /^\s*$/;
- $header .= $_;
+
+ my $date;
+
+ unless (defined $AICache and $date = $AICache->check($mail)) {
+ my $header;
+ if (!mail_open($mail)) {
+ $self->{access_problem} = 1;
+ return;
+ }
+ while (<INPUT>) {
+ last if /^\s*$/;
+ $header .= $_;
+ }
+ close(INPUT);
+ $date = Mail::SpamAssassin::Util::receive_date($header);
+ if (defined $AICache) {
+ $AICache->update($mail, $date);
+ }
}
- close(INPUT);
- my $date = Mail::SpamAssassin::Util::receive_date($header);
+
return if !$self->message_is_useful_by_date($date);
- push(@{$self->{$class}}, scan_index_pack($date, $class, "f", $mail));
+ push(@{$self->{$class}}, index_pack($date, $class, "f", $mail));
}

sub scan_mailbox {
@@ -943,6 +978,7 @@
$self->{access_problem} = 1;
return;
}
+
while ($_ = readdir(DIR)) {
if(/^[^\.]\S*$/ && ! -d "$folder/$_") {
push(@files, "$folder/$_");
@@ -955,54 +991,79 @@
}

foreach my $file (@files) {
+ $self->bump_scan_progress();
if ($file =~ /\.(?:gz|bz2)$/) {
warn "archive-iterator: compressed mbox folders are not supported at this time\n";
$self->{access_problem} = 1;
next;
}

- if (!mail_open($file)) {
- $self->{access_problem} = 1;
- next;
+ my $info = {};
+ my $count;
+
+ $self->create_cache('mbox', $file);
+
+ if ($self->{opt_cache}) {
+ if ($count = $AICache->count()) {
+ $info = $AICache->check();
+ }
}
-
- my $start = 0; # start of a message
- my $where = 0; # current byte offset
- my $first = ''; # first line of message
- my $header = ''; # header text
- my $in_header = 0; # are in we a header?
- while (!eof INPUT) {
- my $offset = $start; # byte offset of this message
- my $header = $first; # remember first line
- while (<INPUT>) {
- if ($in_header) {
- if (/^\s*$/) {
- $in_header = 0;
+
+ unless ($count) {
+ if (!mail_open($file)) {
+ $self->{access_problem} = 1;
+ next;
+ }
+
+ my $start = 0; # start of a message
+ my $where = 0; # current byte offset
+ my $first = ''; # first line of message
+ my $header = ''; # header text
+ my $in_header = 0; # are in we a header?
+ while (!eof INPUT) {
+ my $offset = $start; # byte offset of this message
+ my $header = $first; # remember first line
+ while (<INPUT>) {
+ if ($in_header) {
+ if (/^$/) {
+ $in_header = 0;
+ }
+ else {
+ $header .= $_;
+ }
}
- else {
- $header .= $_;
+ if (substr($_,0,5) eq "From ") {
+ $in_header = 1;
+ $first = $_;
+ $start = $where;
+ $where = tell INPUT;
+ last;
}
- }
- if (substr($_,0,5) eq "From ") {
- $in_header = 1;
- $first = $_;
- $start = $where;
$where = tell INPUT;
- last;
+ }
+ if ($header) {
+ $self->bump_scan_progress();
+ $info->{$offset} = Mail::SpamAssassin::Util::receive_date($header);
}
- $where = tell INPUT;
}
- if ($header) {
- my $date = Mail::SpamAssassin::Util::receive_date($header);
+ close INPUT;
+ }

- if ($self->{determine_receive_date}) {
- next if !$self->message_is_useful_by_date($date);
- }
+ while(my($k,$v) = each %{$info}) {
+ if (defined $AICache && !$count) {
+ $AICache->update($k, $v);
+ }

- push(@{$self->{$class}}, scan_index_pack($date, $class, "m", "$file.$offset"));
+ if ($self->{determine_receive_date}) {
+ next if !$self->message_is_useful_by_date($v);
}
+
+ push(@{$self->{$class}}, index_pack($v, $class, "m", "$file.$k"));
+ }
+
+ if (defined $AICache) {
+ $AICache = $AICache->finish();
}
- close INPUT;
}
}

@@ -1018,6 +1079,7 @@
$self->{access_problem} = 1;
return;
}
+
while ($_ = readdir(DIR)) {
if(/^[^\.]\S*$/ && ! -d "$folder/$_") {
push(@files, "$folder/$_");
@@ -1030,54 +1092,92 @@
}

foreach my $file (@files) {
+ $self->bump_scan_progress();
+
if ($folder =~ /\.(?:gz|bz2)$/) {
warn "archive-iterator: compressed mbx folders are not supported at this time\n";
$self->{access_problem} = 1;
next;
}
- if (!mail_open($file)) {
- $self->{access_problem} = 1;
- next;
- }

- # check the mailbox is in mbx format
- $fp = <INPUT>;
- if ($fp !~ /\*mbx\*/) {
- die "archive-iterator: error: mailbox not in mbx format!\n";
+ my $info = {};
+ my $count;
+
+ $self->create_cache('mbx', $file);
+
+ if ($self->{opt_cache}) {
+ if ($count = $AICache->count()) {
+ $info = $AICache->check();
+ }
}

- # skip mbx headers to the first email...
- seek(INPUT, 2048, 0);
+ unless ($count) {
+ if (!mail_open($file)) {
+ $self->{access_problem} = 1;
+ next;
+ }

- my $sep = MBX_SEPARATOR;
+ # check the mailbox is in mbx format
+ $fp = <INPUT>;
+ if ($fp !~ /\*mbx\*/) {
+ die "archive-iterator: error: mailbox not in mbx format!\n";
+ }

- while (<INPUT>) {
- if ($_ =~ /$sep/) {
- my $offset = tell INPUT;
- my $size = $2;
-
- # gather up the headers...
- my $header = '';
- while (<INPUT>) {
- last if (/^\s*$/);
- $header .= $_;
- }
+ # skip mbx headers to the first email...
+ seek(INPUT, 2048, 0);

- my $date = Mail::SpamAssassin::Util::receive_date($header);
+ my $sep = MBX_SEPARATOR;

- if ($self->{determine_receive_date}) {
- next if !$self->message_is_useful_by_date($date);
- }
+ while (<INPUT>) {
+ if ($_ =~ /$sep/) {
+ my $offset = tell INPUT;
+ my $size = $2;
+
+ # gather up the headers...
+ my $header = '';
+ while (<INPUT>) {
+ last if (/^\s*$/);
+ $header .= $_;
+ }

- push(@{$self->{$class}}, scan_index_pack($date, $class, "b", "$file.$offset"));
+ $self->bump_scan_progress();
+ $info->{"$file.$offset"} = Mail::SpamAssassin::Util::receive_date($header);

- seek(INPUT, $offset + $size, 0);
+ # go onto the next message
+ seek(INPUT, $offset + $size, 0);
+ }
+ else {
+ die "archive-iterator: error: failure to read message body!\n";
+ }
}
- else {
- die "archive-iterator: error: failure to read message body!\n";
+ close INPUT;
+ }
+
+ while(my($k,$v) = each %{$info}) {
+ if (defined $AICache && !$count) {
+ $AICache->update($k, $v);
+ }
+
+ if ($self->{determine_receive_date}) {
+ next if !$self->message_is_useful_by_date($v);
}
+
+ push(@{$self->{$class}}, index_pack($v, $class, "b", "$file.$k"));
}
- close INPUT;
+
+ if (defined $AICache) {
+ $AICache = $AICache->finish();
+ }
+ }
+}
+
+############################################################################
+
+sub bump_scan_progress {
+ my ($self) = @_;
+ if (exists $self->{scan_progress_sub}) {
+ return unless ($self->{scan_progress_counter}++ % 50 == 0);
+ $self->{scan_progress_sub}->();
}
}

@@ -1116,6 +1216,22 @@
}
}

+sub min {
+ return ($_[0] < $_[1] ? $_[0] : $_[1]);
+}
+
+sub create_cache {
+ my ($self, $type, $path) = @_;
+
+ if ($self->{opt_cache}) {
+ $AICache = Mail::SpamAssassin::AICache->new({
+ 'type' => $type,
+ 'prefix' => $self->{opt_cachedir},
+ 'path' => $path,
+ });
+ }
+}
+
############################################################################

1;
@@ -1129,3 +1245,7 @@
C<Mail::SpamAssassin>
C<spamassassin>
C<mass-check>
+
+=cut
+
+# vim: ts=8 sw=2 et