Mailing List Archive

svn commit: r325998 - in /spamassassin/trunk: lib/Mail/SpamAssassin/AICache.pm lib/Mail/SpamAssassin/ArchiveIterator.pm masses/mass-check
Author: felicity
Date: Mon Oct 17 16:17:21 2005
New Revision: 325998

URL: http://svn.apache.org/viewcvs?rev=325998&view=rev
Log:
bug 4534: implemented a --cache option in mass-check so that checks can be done faster without recalculating the message's atime each time through

Added:
spamassassin/trunk/lib/Mail/SpamAssassin/AICache.pm
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm
spamassassin/trunk/masses/mass-check

Added: spamassassin/trunk/lib/Mail/SpamAssassin/AICache.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/AICache.pm?rev=325998&view=auto
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/AICache.pm (added)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/AICache.pm Mon Oct 17 16:17:21 2005
@@ -0,0 +1,157 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::AICache - provide access to cached information for
+ArchiveIterator
+
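+=head1 SYNOPSIS
+
+  my $cache = Mail::SpamAssassin::AICache->new({
+    'type' => 'dir',        # or 'mbox' / 'mbx'
+    'path' => $folder,
+  });
+
+  my $date = $cache->check($file);     # cached date, if still valid
+  $cache->update($file, $date);        # record a freshly computed date
+  $cache->finish();                    # write the cache back if it changed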
+
+=head1 DESCRIPTION
+
+This module allows ArchiveIterator to use cached atime information instead of
+having to read every message separately.
+
+=head1 PUBLIC METHODS
+
+=over 4
+
+=cut
+
+package Mail::SpamAssassin::AICache;
+
+use File::Spec;
+
+use strict;
+use warnings;
+
+=item new()
+
+Generates a new cache object.
+
+=cut
+
+# Build a cache object for one ArchiveIterator target.
+#
+# Arguments: the class (or an existing object), then an optional hash
+# reference which becomes the object itself.  The keys read here are:
+#   type - 'dir' for a directory of message files; any other value is
+#          treated as a single mailbox file (callers pass 'mbox' or 'mbx')
+#   path - the directory or mailbox file the cache describes
+#
+# Sets the cache, dirty, cache_file and cache_mtime members, loads the cache
+# file when it is usable, and returns the blessed object.
+sub new {
+  my $class = shift;
+  $class = ref($class) || $class;
+
+  my $self = shift;
+  if (!defined $self) { $self = {}; }
+
+  $self->{cache} = {};
+  $self->{dirty} = 0;
+
+  my $use_cache = 1;
+
+  if ($self->{type} eq 'dir') {
+    # the cache lives inside the directory it describes
+    $self->{cache_file} = File::Spec->catfile($self->{path}, '.spamassassin_cache');
+    $self->{cache_mtime} = (stat($self->{cache_file}))[9] || 0;
+  }
+  else {
+    # the cache lives next to the mailbox file.  catpath() is the inverse of
+    # splitpath(): it keeps the volume and, unlike catdir(), does not turn an
+    # empty directory part (a bare relative filename) into the root directory.
+    my ($volume, $dirs, $file) = File::Spec->splitpath($self->{path});
+    $self->{cache_file} = File::Spec->catpath($volume, $dirs,
+                join('_', '.spamassassin_cache', $self->{type}, $file));
+    $self->{cache_mtime} = (stat($self->{cache_file}))[9] || 0;
+
+    # for mbox and mbx, verify whether mtime on cache file is >= mtime of
+    # messages file.  if it is, use it, otherwise don't.  a mailbox that
+    # cannot be stat()ed is never served from the cache.
+    my $source_mtime = (stat($self->{path}))[9];
+    if (!defined $source_mtime || $source_mtime > $self->{cache_mtime}) {
+      $use_cache = 0;
+    }
+  }
+  $self->{cache_file} = File::Spec->canonpath($self->{cache_file});
+
+  # go ahead and read in the cache information.  three-arg open with a
+  # lexical handle so the filename can never be read as an open mode, and a
+  # lexical line variable so the caller's $_ is left alone.
+  if ($use_cache && open(my $fh, '<', $self->{cache_file})) {
+    while (defined(my $line = <$fh>)) {
+      # drop the line terminator, otherwise every value keeps a trailing "\n"
+      chomp $line;
+      my ($k, $v) = split(/\t/, $line);
+      next unless (defined $k && defined $v);
+      $self->{cache}->{$k} = $v;
+    }
+    close($fh);
+  }
+
+  bless($self,$class);
+  $self;
+}
+
+# Return the number of entries currently held in the cache.
+#
+# scalar() is forced so that the result is a count in every calling context;
+# a bare "keys" would hand back the list of keys when called in list context.
+sub count {
+  my ($self) = @_;
+  return scalar keys %{$self->{cache}};
+}
+
+# Look up cached information.
+#
+# With no name (undef or an empty string) the whole cache hash reference is
+# returned.  With a name, the value stored for its canonical form is
+# returned, or nothing when there is no usable entry.
+#
+# For 'dir' caches an entry is only trusted while the message file is not
+# newer than the cache file.
+sub check {
+  my ($self, $name) = @_;
+
+  # test for "no name" explicitly: "0" is a legitimate key (an mbox offset)
+  return $self->{cache} unless (defined $name && $name ne '');
+
+  if ($self->{type} eq 'dir') {
+    my $mtime = (stat($name))[9];
+    # a file that cannot be stat()ed is a cache miss, not a silent hit
+    return if (!defined $mtime || $mtime > $self->{cache_mtime});
+  }
+
+  $name = $self->canon($name);
+  return $self->{cache}->{$name};
+}
+
+# Record $date for $name (stored under its canonical form) and mark the
+# cache dirty when that adds an entry or changes an existing one.
+#
+# Does nothing when no name is given.
+sub update {
+  my ($self, $name, $date) = @_;
+
+  # only reject a missing name.  a plain truth test would also throw away
+  # "0", which is the byte offset of the first message in an mbox file.
+  return unless (defined $name && $name ne '');
+  $name = $self->canon($name);
+
+  # if information is different than cached version, set dirty and update
+  if (!exists $self->{cache}->{$name} || $self->{cache}->{$name} != $date) {
+    $self->{cache}->{$name} = $date;
+    $self->{dirty} = 1;
+  }
+}
+
+# Write the cache back to its file if anything changed since it was loaded.
+#
+# Each entry is written as "key<TAB>value" on its own line.  A failure to
+# open or fully write the file is reported with warn() and is not fatal.
+# Always returns undef, so callers can write "$cache = $cache->finish();".
+sub finish {
+  my ($self) = @_;
+
+  # Cache is dirty, so write out new file
+  if ($self->{dirty}) {
+    # three-arg open with a lexical handle: the filename can never be read
+    # as part of the open mode
+    if (open(my $fh, '>', $self->{cache_file})) {
+      while(my($k,$v) = each %{$self->{cache}}) {
+        print {$fh} "$k\t$v\n";
+      }
+      # buffered write errors are only reported when the handle is closed
+      if (close($fh)) {
+        $self->{dirty} = 0;
+      }
+      else {
+        my $err = $!;
+        # do not leave a truncated cache behind to be trusted on a later run
+        unlink($self->{cache_file});
+        warn "Can't write AI cache file (".$self->{cache_file}."): $err";
+      }
+    }
+    else {
+      warn "Can't write AI cache file (".$self->{cache_file}."): $!";
+    }
+  }
+
+  return undef;
+}
+
+# Reduce a message name to the key used inside the cache.
+#
+# 'dir' caches are keyed by the bare filename.  Every other type is keyed by
+# the numeric suffix of a "/path/mbox.offset" style name; a name without
+# such a suffix is returned unchanged.
+sub canon {
+  my ($self, $name) = @_;
+
+  unless ($self->{type} eq 'dir') {
+    # keep only the trailing offset when one is present
+    $name =~ s/^.+\.(\d+)$/$1/;
+    return $name;
+  }
+
+  # directory entry: discard volume and directories, keep the filename
+  my (undef, undef, $basename) = File::Spec->splitpath($name);
+  return $basename;
+}
+
+# ---------------------------------------------------------------------------
+
+1;
+__END__

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm?rev=325998&r1=325997&r2=325998&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm Mon Oct 17 16:17:21 2005
@@ -27,12 +27,14 @@
use Mail::SpamAssassin::Util;
use Mail::SpamAssassin::Constants qw(:sa);
use Mail::SpamAssassin::Logger;
+use Mail::SpamAssassin::AICache;

use constant BIG_BYTES => 256*1024; # 256k is a big email
use constant BIG_LINES => BIG_BYTES/65; # 65 bytes/line is a good approximation

use vars qw {
$MESSAGES
+ $AICache
};

my @ISA = qw($MESSAGES);
@@ -45,9 +47,10 @@

my $iter = new Mail::SpamAssassin::ArchiveIterator(
{
- 'opt_j' => 0,
- 'opt_n' => 1,
- 'opt_all' => 1,
+ 'opt_j' => 0,
+ 'opt_n' => 1,
+ 'opt_all' => 1,
+ 'opt_cache' => 1,
}
);

@@ -158,6 +161,11 @@
it's a good idea to set this to 0 if you can, as it imposes a performance
hit.

+=item opt_cache
+
+Set to 0 (default) if you don't want to use cached information to help speed
+up ArchiveIterator. Set to 1 to enable.
+
=item wanted_sub

Reference to a subroutine which will process message data. Usually
@@ -194,6 +202,7 @@
$self->{opt_head} = 0 unless (defined $self->{opt_head});
$self->{opt_tail} = 0 unless (defined $self->{opt_tail});
$self->{opt_want_date} = 1 unless (defined $self->{opt_want_date});
+ $self->{opt_cache} = 0 unless (defined $self->{opt_cache});

# If any of these options are set, we need to figure out the message's
# receive date at scan time. opt_n == 0, opt_after, opt_before
@@ -887,9 +896,19 @@
return;
}

+ if ($self->{opt_cache}) {
+ $AICache = Mail::SpamAssassin::AICache->new({ 'type' => 'dir',
+ 'path' => $folder,
+ });
+ }
+
foreach my $mail (@files) {
$self->scan_file($class, $mail);
}
+
+ if (defined $AICache) {
+ $AICache = $AICache->finish();
+ }
}

sub scan_file {
@@ -899,14 +918,23 @@
push(@{$self->{$class}}, scan_index_pack(AI_TIME_UNKNOWN, $class, "f", $mail));
return;
}
- my $header;
- mail_open($mail) or return;
- while (<INPUT>) {
- last if /^\s*$/;
- $header .= $_;
+
+ my $date;
+
+ unless (defined $AICache and $date = $AICache->check($mail)) {
+ my $header;
+ mail_open($mail) or return;
+ while (<INPUT>) {
+ last if /^\s*$/;
+ $header .= $_;
+ }
+ close(INPUT);
+ $date = Mail::SpamAssassin::Util::receive_date($header);
+ if (defined $AICache) {
+ $AICache->update($mail, $date);
+ }
}
- close(INPUT);
- my $date = Mail::SpamAssassin::Util::receive_date($header);
+
return if !$self->message_is_useful_by_date($date);
push(@{$self->{$class}}, scan_index_pack($date, $class, "f", $mail));
}
@@ -935,45 +963,69 @@
die "archive-iterator: compressed mbox folders are not supported at this time\n";
}

- mail_open($file) or return;
+ my $info = {};
+ my $count;
+
+ if ($self->{opt_cache}) {
+ $AICache = Mail::SpamAssassin::AICache->new({ 'type' => 'mbox',
+ 'path' => $file,
+ });
+ if ($count = $AICache->count()) {
+ $info = $AICache->check();
+ }
+ }
+
+ unless ($count) {
+ mail_open($file) or return;

- my $start = 0; # start of a message
- my $where = 0; # current byte offset
- my $first = ''; # first line of message
- my $header = ''; # header text
- my $in_header = 0; # are in we a header?
- while (!eof INPUT) {
- my $offset = $start; # byte offset of this message
- my $header = $first; # remember first line
- while (<INPUT>) {
- if ($in_header) {
- if (/^\s*$/) {
- $in_header = 0;
+ my $start = 0; # start of a message
+ my $where = 0; # current byte offset
+ my $first = ''; # first line of message
+ my $header = ''; # header text
+ my $in_header = 0; # are in we a header?
+ while (!eof INPUT) {
+ my $offset = $start; # byte offset of this message
+ my $header = $first; # remember first line
+ while (<INPUT>) {
+ if ($in_header) {
+ if (/^\s*$/) {
+ $in_header = 0;
+ }
+ else {
+ $header .= $_;
+ }
}
- else {
- $header .= $_;
+ if (substr($_,0,5) eq "From ") {
+ $in_header = 1;
+ $first = $_;
+ $start = $where;
+ $where = tell INPUT;
+ last;
}
- }
- if (substr($_,0,5) eq "From ") {
- $in_header = 1;
- $first = $_;
- $start = $where;
$where = tell INPUT;
- last;
+ }
+ if ($header) {
+ $info->{$offset} = Mail::SpamAssassin::Util::receive_date($header);
}
- $where = tell INPUT;
}
- if ($header) {
- my $date = Mail::SpamAssassin::Util::receive_date($header);
+ close INPUT;
+ }

- if ($self->{determine_receive_date}) {
- next if !$self->message_is_useful_by_date($date);
- }
+ while(my($k,$v) = each %{$info}) {
+ if (defined $AICache && !$count) {
+ $AICache->update($k, $v);
+ }

- push(@{$self->{$class}}, scan_index_pack($date, $class, "m", "$file.$offset"));
+ if ($self->{determine_receive_date}) {
+ next if !$self->message_is_useful_by_date($v);
}
+
+ push(@{$self->{$class}}, scan_index_pack($v, $class, "m", "$file.$k"));
+ }
+
+ if (defined $AICache) {
+ $AICache = $AICache->finish();
}
- close INPUT;
}
}

@@ -1000,46 +1052,72 @@
if ($folder =~ /\.(?:gz|bz2)$/) {
die "archive-iterator: compressed mbx folders are not supported at this time\n";
}
- mail_open($file) or return;

- # check the mailbox is in mbx format
- $fp = <INPUT>;
- if ($fp !~ /\*mbx\*/) {
- die "archive-iterator: error: mailbox not in mbx format!\n";
+ my $info = {};
+ my $count;
+
+ if ($self->{opt_cache}) {
+ $AICache = Mail::SpamAssassin::AICache->new({ 'type' => 'mbx',
+ 'path' => $file,
+ });
+ if ($count = $AICache->count()) {
+ $info = $AICache->check();
+ }
}

- # skip mbx headers to the first email...
- seek(INPUT, 2048, 0);
+ unless ($count) {
+ mail_open($file) or return;

- my $sep = MBX_SEPARATOR;
+ # check the mailbox is in mbx format
+ $fp = <INPUT>;
+ if ($fp !~ /\*mbx\*/) {
+ die "archive-iterator: error: mailbox not in mbx format!\n";
+ }

- while (<INPUT>) {
- if ($_ =~ /$sep/) {
- my $offset = tell INPUT;
- my $size = $2;
-
- # gather up the headers...
- my $header = '';
- while (<INPUT>) {
- last if (/^\s*$/);
- $header .= $_;
- }
+ # skip mbx headers to the first email...
+ seek(INPUT, 2048, 0);

- my $date = Mail::SpamAssassin::Util::receive_date($header);
+ my $sep = MBX_SEPARATOR;

- if ($self->{determine_receive_date}) {
- next if !$self->message_is_useful_by_date($date);
- }
+ while (<INPUT>) {
+ if ($_ =~ /$sep/) {
+ my $offset = tell INPUT;
+ my $size = $2;
+
+ # gather up the headers...
+ my $header = '';
+ while (<INPUT>) {
+ last if (/^\s*$/);
+ $header .= $_;
+ }

- push(@{$self->{$class}}, scan_index_pack($date, $class, "b", "$file.$offset"));
+ $info->{"$file.$offset"} = Mail::SpamAssassin::Util::receive_date($header);

- seek(INPUT, $offset + $size, 0);
+ # go onto the next message
+ seek(INPUT, $offset + $size, 0);
+ }
+ else {
+ die "archive-iterator: error: failure to read message body!\n";
+ }
}
- else {
- die "archive-iterator: error: failure to read message body!\n";
+ close INPUT;
+ }
+
+ while(my($k,$v) = each %{$info}) {
+ if (defined $AICache && !$count) {
+ $AICache->update($k, $v);
}
+
+ if ($self->{determine_receive_date}) {
+ next if !$self->message_is_useful_by_date($v);
+ }
+
+ push(@{$self->{$class}}, scan_index_pack($v, $class, "b", "$file.$k"));
+ }
+
+ if (defined $AICache) {
+ $AICache = $AICache->finish();
}
- close INPUT;
}
}


Modified: spamassassin/trunk/masses/mass-check
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/mass-check?rev=325998&r1=325997&r2=325998&view=diff
==============================================================================
--- spamassassin/trunk/masses/mass-check (original)
+++ spamassassin/trunk/masses/mass-check Mon Oct 17 16:17:21 2005
@@ -51,6 +51,7 @@

message selection options
-n no date sorting or spam/ham interleaving
+ --cache use cache information when selecting messages
--after=N only test mails received after time_t N (negative values
are an offset from current time, e.g. -86400 = last day)
or after date as parsed by Time::ParseDate (e.g. '-6 months')
@@ -89,7 +90,7 @@
$opt_mid $opt_net $opt_nosort $opt_progress $opt_showdots
$opt_spamlog $opt_tail $opt_rules $opt_restart $opt_loguris
$opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap
- $opt_learn $opt_reuse $opt_lint
+ $opt_learn $opt_reuse $opt_lint $opt_cache
$total_messages $statusevery
%reuse %orig_conf %reuse_conf $reuse_rules_loaded_p);

@@ -120,7 +121,7 @@
"hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
"progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",
"rules=s", "restart=i", "after=s", "before=s", "loguris",
- "deencap=s", "logmem", "learn=i", "reuse", "lint",
+ "deencap=s", "logmem", "learn=i", "reuse", "lint", "cache",
"dir" => sub { $opt_format = "dir"; },
"file" => sub { $opt_format = "file"; },
"mbox" => sub { $opt_format = "mbox"; },
@@ -269,6 +270,7 @@
'opt_all' => $opt_all,
'opt_head' => $opt_head,
'opt_tail' => $opt_tail,
+ 'opt_cache' => $opt_cache,
'opt_after' => $opt_after,
'opt_before' => $opt_before,
'opt_restart' => $opt_restart,