Mailing List Archive: svn commit: r494819 - /spamassassin/trunk/masses/logs-to-c

Author: jm
Date: Wed Jan 10 06:08:08 2007
New Revision: 494819

URL: http://svn.apache.org/viewvc?view=rev&rev=494819
Log:
Port over the fast log-parsing code from hit-frequencies to logs-to-c. The ported code also correctly handles the (very uncommon) case of a log line with no rule hits whatsoever.

Modified:
spamassassin/trunk/masses/logs-to-c

Modified: spamassassin/trunk/masses/logs-to-c
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/logs-to-c?view=diff&rev=494819&r1=494818&r2=494819
==============================================================================
--- spamassassin/trunk/masses/logs-to-c (original)
+++ spamassassin/trunk/masses/logs-to-c Wed Jan 10 06:08:08 2007
@@ -117,20 +117,6 @@
return map { $short_to_long[$_] } unpack("w*", $_[0]);
}

-# arguments are $isspam, $count, \@tests;
-sub log_line_code {
- $tests_hit[$_[1]] = freeze_tests($_[2]);
-
- if ($_[0]) {
- $num_spam++;
- vec($is_spam, $_[1], 1) = 1;
- }
- else {
- $num_ham++;
- vec($is_spam, $_[1], 1) = 0;
- }
-}
-
sub readlogs {
my $msgline;

@@ -143,22 +129,44 @@
my $isspam = ($file eq $opt_spam);
my $caught; # 1st parameter of log line
my $rules; # 4th parameter of log line
+ my $restofline; # intermediate parse buffer

while (defined($msgline = <IN>)) {
- ($caught, undef, undef, $rules) = split(' ', $msgline);
-
- # only take lines starting with Y or .
- next unless ($caught eq 'Y' || $caught eq '.') && $rules;
+ # faster log-reading code from hit-frequencies.
+ # the additional split() is for this case:
+ # ". -20 /path time=1112116980,scantime=0,format=f,reuse=no"
+ # in other words, no hits. split(' ') cannot deal with this
+ # correctly, seeing (".", "-20", "/path", "time=...etc"). Work
+ # around this by using a literal / / regexp split to discard
+ # the csv stuff we don't want out of the rest of the line.
+
+ ($caught, undef, $restofline) = split(' ', $msgline, 3);
+ next unless ($caught =~ /^[Y\.]$/ && $restofline);
+ (undef, $rules) = split(/ /, $restofline, 3);

# get tests, but ignore unknown tests and subrules
my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} }
split(/,/, $rules);

- # run handler
- log_line_code($isspam, $count, \@tests);
+ if ($isspam) {
+ $num_spam++;
+ vec($is_spam, $count, 1) = 1;
+ }
+ else {
+ $num_ham++;
+ vec($is_spam, $count, 1) = 0;
+ }
+
+ # inlined for speed.
+ # ORIGINAL: $tests_hit[$count] = freeze_tests(\@tests);
+ $tests_hit[$count] = pack("w*", map
+ {
+ $long_to_short{$_} || new_short($_);
+ } @tests);
+
+ # TODO: benchmark using foreach(), map() is often slower

- # increment line
- $count++;
+ $count++; # increment line
}
close IN;
}