Mailing List Archive

svn commit: r453586 [5/7] - in /spamassassin/branches/jm_re2c_hacks: ./ build/ build/automc/ build/buildbot/ contrib/ lib/Mail/ lib/Mail/SpamAssassin/ lib/Mail/SpamAssassin/Bayes/ lib/Mail/SpamAssassin/BayesStore/ lib/Mail/SpamAssassin/Conf/ lib/Mail/S...
Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm Fri Oct 6 05:46:56 2006
@@ -1,9 +1,10 @@
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/PluginHandler.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/PluginHandler.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/PluginHandler.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/PluginHandler.pm Fri Oct 6 05:46:56 2006
@@ -1,9 +1,10 @@
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Reporter.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Reporter.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Reporter.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Reporter.pm Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
# Mail::SpamAssassin::Reporter - report a message as spam

# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SQLBasedAddrList.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SQLBasedAddrList.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SQLBasedAddrList.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SQLBasedAddrList.pm Fri Oct 6 05:46:56 2006
@@ -1,9 +1,10 @@
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SpamdForkScaling.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SpamdForkScaling.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SpamdForkScaling.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SpamdForkScaling.pm Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
# spamd prefork scaling, using an Apache-based algorithm
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
@@ -121,11 +122,20 @@
$self->compute_lowest_child_pid();
}

+# this is called by SIGTERM and SIGHUP handlers, to ensure that new
+# kids aren't added while the main code is killing the old ones
+# and planning to exit.
+#
+sub set_exiting_flag {
+ my ($self) = @_;
+ $self->{am_exiting} = 1;
+}
+
sub child_error_kill {
my ($self, $pid, $sock) = @_;

- warn "prefork: killing failed child $pid ".
- ($sock ? "fd=".$sock->fileno : "");
+ warn "prefork: killing failed child $pid fd=".
+ ((defined $sock && defined $sock->fileno) ? $sock->fileno : "undefined");

# close the socket and remove the child from our list
$self->set_child_state ($pid, PFSTATE_KILLED);
@@ -138,7 +148,7 @@
$sock->close;
}

- warn "prefork: killed child $pid";
+ warn "prefork: killed child $pid\n";
}

sub set_child_state {
@@ -650,6 +660,9 @@

sub adapt_num_children {
my ($self) = @_;
+
+ # don't start up new kids while main is working at killing the old ones
+ return if $self->{am_exiting};

my $kids = $self->{kids};
my $statestr = '';

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SubProcBackChannel.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SubProcBackChannel.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SubProcBackChannel.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/SubProcBackChannel.pm Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
# back-channel for communication between a master and multiple slave processes.
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Timeout.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Timeout.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Timeout.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Timeout.pm Fri Oct 6 05:46:56 2006
@@ -1,9 +1,10 @@
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util.pm Fri Oct 6 05:46:56 2006
@@ -1,9 +1,10 @@
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
@@ -793,16 +794,23 @@
# Get the actual MIME type out ...
# Note: the header content may not be whitespace unfolded, so make sure the
# REs do /s when appropriate.
+ # correct:
+ # Content-type: text/plain; charset=us-ascii
+ # missing a semi-colon, CT shouldn't have whitespace anyway:
+ # Content-type: text/plain charset=us-ascii
#
- $ct =~ s/^\s+//; # strip leading whitespace
- $ct =~ s/;.*$//s; # strip everything after first ';'
- $ct =~ s@^([^/]+(?:/[^/]*)?).*$@$1@s; # only something/something ...
+ $ct =~ s/^\s+//; # strip leading whitespace
+ $ct =~ s/;.*$//s; # strip everything after first ';'
+ $ct =~ s@^([^/]+(?:/[^/\s]*)?).*$@$1@s; # only something/something ...
# strip inappropriate chars
$ct =~ tr/\000-\040\177-\377\042\050\051\054\056\072-\077\100\133-\135//d;
$ct = lc $ct;

# bug 4298: If at this point we don't have a content-type, assume text/plain
- $ct ||= "text/plain";
+ # also, if the content-type is simply "text" or "text/", assume text/plain
+ if (!$ct || $ct =~ /^text\/?$/) {
+ $ct = "text/plain";
+ }

# Now that the header has been parsed, return the requested information.
# In scalar context, just the MIME type, in array context the
@@ -876,7 +884,7 @@

###########################################################################

-=item my ($filehandle, $filepath) = secure_tmpfile();
+=item my ($filepath, $filehandle) = secure_tmpfile();

Generates a filename for a temporary file, opens it exclusively and
securely, and returns a filehandle to the open file (opened O_RDWR).

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/DependencyInfo.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/DependencyInfo.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/DependencyInfo.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/DependencyInfo.pm Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
# Helper code to debug dependencies and their versions.

# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/Progress.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/Progress.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/Progress.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/Progress.pm Fri Oct 6 05:46:56 2006
@@ -1,9 +1,10 @@
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
# The (extremely complex) rules for domain delegation.

# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/lm/build.pl
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lm/build.pl?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lm/build.pl (original)
+++ spamassassin/branches/jm_re2c_hacks/lm/build.pl Fri Oct 6 05:46:56 2006
@@ -4,11 +4,12 @@
# *.ln = new format, uses NULL as separator
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/bayes-testing/graph-accuracy-curve
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/bayes-testing/graph-accuracy-curve?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/bayes-testing/graph-accuracy-curve (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/bayes-testing/graph-accuracy-curve Fri Oct 6 05:46:56 2006
@@ -6,14 +6,15 @@
# usage: graph-accuracy-curve [--buckets=100] ...dir/results .../dir2/results ...
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

Modified: spamassassin/branches/jm_re2c_hacks/masses/bayes-testing/graph-bayes-histogram
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/bayes-testing/graph-bayes-histogram?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/bayes-testing/graph-bayes-histogram (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/bayes-testing/graph-bayes-histogram Fri Oct 6 05:46:56 2006
@@ -6,14 +6,15 @@
# usage: graph-bayes-histogram [--buckets=100] ...dir/results .../dir2/results ...
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

Modified: spamassassin/branches/jm_re2c_hacks/masses/corpora/mk-corpus-link-farm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/corpora/mk-corpus-link-farm?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/corpora/mk-corpus-link-farm (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/corpora/mk-corpus-link-farm Fri Oct 6 05:46:56 2006
@@ -7,14 +7,15 @@
# cause breakage.
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

Modified: spamassassin/branches/jm_re2c_hacks/masses/enable-all-evolved-rules
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/enable-all-evolved-rules?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/enable-all-evolved-rules (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/enable-all-evolved-rules Fri Oct 6 05:46:56 2006
@@ -12,11 +12,12 @@
# required.
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/evolve_metarule/evolve_metarule.c
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/evolve_metarule/evolve_metarule.c?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/evolve_metarule/evolve_metarule.c (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/evolve_metarule/evolve_metarule.c Fri Oct 6 05:46:56 2006
@@ -2,14 +2,15 @@
* the NIGERIAN or ADVANCE_FEE rule.
*
* <@LICENSE>
- * Copyright 2005 Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at:
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

Modified: spamassassin/branches/jm_re2c_hacks/masses/evolve_metarule/preproc.pl
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/evolve_metarule/preproc.pl?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/evolve_metarule/preproc.pl (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/evolve_metarule/preproc.pl Fri Oct 6 05:46:56 2006
@@ -1,13 +1,14 @@
#!/usr/bin/perl -w
# <@LICENSE>
-# Copyright 2005 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

Modified: spamassassin/branches/jm_re2c_hacks/masses/extract-message-from-mbox
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/extract-message-from-mbox?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/extract-message-from-mbox (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/extract-message-from-mbox Fri Oct 6 05:46:56 2006
@@ -1,10 +1,11 @@
#!/usr/bin/perl
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/find-extremes
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/find-extremes?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/find-extremes (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/find-extremes Fri Oct 6 05:46:56 2006
@@ -2,11 +2,12 @@

# hacked version of hit-frequencies - Allen
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/fp-fn-to-tcr
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/fp-fn-to-tcr?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/fp-fn-to-tcr (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/fp-fn-to-tcr Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
#!/usr/bin/perl
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/freqdiff
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/freqdiff?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/freqdiff (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/freqdiff Fri Oct 6 05:46:56 2006
@@ -3,11 +3,12 @@
# freqdiff - print frequency difference between two inputs
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/generate-translation
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/generate-translation?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/generate-translation (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/generate-translation Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
#!/usr/bin/perl -w
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
@@ -169,11 +170,12 @@
# See 'perldoc Mail::SpamAssassin::Conf' for details.
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/hit-frequencies
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/hit-frequencies?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/hit-frequencies (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/hit-frequencies Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
#!/usr/bin/perl -w
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/lint-rules-from-freqs
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/lint-rules-from-freqs?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/lint-rules-from-freqs (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/lint-rules-from-freqs Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
#!/usr/bin/perl
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/logdiff
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/logdiff?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/logdiff (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/logdiff Fri Oct 6 05:46:56 2006
@@ -1,8 +1,10 @@
#!/usr/bin/perl

my ($l1, $l2, @diffargs);
+@diffargs = ();
+
foreach my $arg (@ARGV) {
- if (/^-/) {
+ if ($arg =~ /^-/) {
push @diffargs, $arg;
} elsif (defined $l1) {
$l2 = $arg;
@@ -24,12 +26,21 @@
open (IN, "<$inf") or die "cannot open $inf";
open (OUT, ">$outf") or die "cannot open $outf";
while (<IN>) {
- s/\bscantime=\d+/scantime=N/gs; # frequently different
- s/\bAWL\b//gs; # kill AWL hits
- s/ ,/ /gs;
- s/, / /gs;
- s/,,/,/gs;
- print OUT;
+ if (!/^([\.Y]\s+\S+\s+\S+)\s+(\S+)\s+(.+)/) {
+ print OUT;
+ next;
+ }
+
+ my ($scorepath, $rules, $meta) = ($1,$2,$3);
+ my @rules = split(/,/, $rules);
+ @rules = sort grep {
+ $_ !~ /^AWL$/
+ } @rules;
+ $rules = join(',', @rules);
+
+ $meta =~ s/\bscantime=\d+/scantime=N/gs; # frequently different
+
+ print OUT "$scorepath $rules $meta\n";
}
close IN;
close OUT;

Modified: spamassassin/branches/jm_re2c_hacks/masses/logs-to-c
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/logs-to-c?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/logs-to-c (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/logs-to-c Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
#!/usr/bin/perl -w
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/mass-check
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/mass-check?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/mass-check (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/mass-check Fri Oct 6 05:46:56 2006
@@ -1,11 +1,14 @@
#!/usr/bin/perl -w
+use strict;
+
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
@@ -16,13 +19,15 @@
# limitations under the License.
# </@LICENSE>

+sub aidbg;
+
sub usage {
my $status = shift;

- my $out = $status ? STDERR : STDOUT;
+ my $out = $status ? \*STDERR : \*STDOUT;
print $out <<EOF;
usage: mass-check [options] target ...
-
+
-c=file set configuration/rules directory
-p=dir set user-prefs directory
-f=file read list of targets from <file>
@@ -41,7 +46,26 @@
were encapsulated by servers matching the regexp RE
(default = extract all SpamAssassin-encapsulated mails)
--lint check rules for syntax before running
-
+
+ client/server mode options
+ --server host:port
+ use server mode, running on the given hostname and port
+ --client host:port
+ use client mode, connecting to the given hostname and port
+ --cs_max N
+ at most, only ever request (client)/give out (server) a
+ maximum of N messages (defaults to 1000)
+ --cs_timeout N
+ in client mode, try to connect to the server every N seconds
+ defaults to 300
+ in server mode, timeout messages after N seconds
+ defaults to 60
+ --cs_paths_only
+ only used in client mode. when making requests of the
+ server, only ask for paths to the messages and not the
+ messages themselves. useful when the client and server
+ have the same paths to the corpus data.
+
log options
-o write all logs to stdout
--loghits log the text hit for patterns (useful for debugging)
@@ -49,7 +73,7 @@
--logmem log the memory delta (only on Linux)
--hamlog=log use <log> as ham log ('ham.log' is default)
--spamlog=log use <log> as spam log ('spam.log' is default)
-
+
message selection options
-n no date sorting or spam/ham interleaving
--cache use cache information when selecting messages
@@ -61,7 +85,7 @@
--all don't skip big messages
--head=N only check first N ham and N spam (N messages if -n used)
--tail=N only check last N ham and N spam (N messages if -n used)
-
+
simple target options (implies -o and no ham/spam classification)
--dir subsequent targets are directories
--file subsequent targets are files in RFC 822 format
@@ -95,12 +119,15 @@
$opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap
$opt_learn $opt_reuse $opt_lint $opt_cache $opt_noisy
$total_messages $statusevery $opt_cachedir
- %reuse %orig_conf %reuse_conf $reuse_rules_loaded_p);
+ $opt_client $opt_cs_max $opt_cs_timeout $opt_cs_paths_only
+ $opt_server %postdata %real $svn_revision
+ $tmpfd %reuse %orig_conf %reuse_conf $reuse_rules_loaded_p);

use FindBin;
use lib "$FindBin::Bin/../lib";
-use lib "$FindBin::Bin/tmp";
eval "use bytes";
+use IO::Select;
+use IO::Socket;
use Mail::SpamAssassin::ArchiveIterator;
use Mail::SpamAssassin;
use Mail::SpamAssassin::Logger;
@@ -109,6 +136,7 @@
use Getopt::Long;
use POSIX qw(strftime);
use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
+use constant HAS_IO_ZLIB => eval { require IO::Zlib; };
use Config;

# default settings
@@ -128,6 +156,8 @@
"rules=s", "restart=i", "after=s", "before=s", "loguris",
"deencap=s", "logmem", "learn=i", "reuse", "lint", "cache",
"cachedir=s", "noisy",
+ "server=s", "cs_max=i", "cs_timeout=i", "cs_paths_only",
+ "client=s",
"dir" => sub { $opt_format = "dir"; },
"file" => sub { $opt_format = "file"; },
"mbox" => sub { $opt_format = "mbox"; },
@@ -135,29 +165,28 @@
"help" => sub { usage(0); },
'<>' => \&target) or usage(1);

-# rules.pl is for the --reuse option, score set doesn't matter
-if ($opt_reuse && ! -f "$FindBin::Bin/tmp/rules.pl") {
- # some people specify paths relatively, whereas this needs an absolute path,
- # so "do the right thing"(tm).
- my $abs_opt_c = File::Spec->rel2abs($opt_c);
- system("cd $FindBin::Bin; perl parse-rules-for-masses -d $abs_opt_c");
+# We need IO::Zlib for client-server mode!
+if ( ($opt_client || $opt_server) && ! HAS_IO_ZLIB ) {
+ die "IO::Zlib required for client/server mode!\n";
}

-require "rules.pl" if $opt_reuse;
+# rules.pl is for the --reuse option, score set doesn't matter
+if ($opt_reuse) {
+ my $rules_path = "$FindBin::Bin/tmp/rules.pl";
+ if (! -f $rules_path) {
+ # some people specify paths relatively, whereas this needs an absolute path,
+ # so "do the right thing"(tm).
+ my $abs_opt_c = File::Spec->rel2abs($opt_c);
+ system("cd $FindBin::Bin; perl parse-rules-for-masses -d $abs_opt_c");
+ }
+
+ require $rules_path;
+}

if ($opt_noisy) {
$opt_progress = 1; # implies --progress
}

-# test messages for the mass-check
-my @targets;
-if ($opt_f) {
- open(F, $opt_f) || die "cannot read target $opt_f: $!";
- push(@targets, map { chomp; $_ } <F>);
- close(F);
-}
-usage(1) if !@targets;
-
$opt_debug ||= 'all' if defined $opt_debug;

my $user_prefs = "$opt_p/user_prefs";
@@ -167,7 +196,6 @@
# but since it's probably not expecting that, and we don't want
# strange things happening, create a local object.
if ($opt_lint) {
-
my $spamlint = new Mail::SpamAssassin ({
'debug' => $opt_debug,
'rules_filename' => $opt_c,
@@ -178,13 +206,12 @@
'dont_copy_prefs' => 1,
'local_tests_only' => $opt_net ? 0 : 1,
'only_these_rules' => $opt_rules,
- 'ignore_safety_expire_timeout' => 1,
- PREFIX => '',
+ 'ignore_safety_expire_timeout' => 1,
+ PREFIX => '',
DEF_RULES_DIR => $opt_c,
LOCAL_RULES_DIR => '',
});

-
$spamlint->debug_diagnostics();
my $res = $spamlint->lint_rules();
$spamlint->finish();
@@ -192,7 +219,18 @@
exit 1 if $res;
}

-$spamtest = new Mail::SpamAssassin ({
+# test messages for the mass-check
+my @targets;
+if (!$opt_server && !$opt_client) {
+ if ($opt_f) {
+ open(F, $opt_f) || die "cannot read target $opt_f: $!";
+ push(@targets, map { chomp; $_ } <F>);
+ close(F);
+ }
+ usage(1) if !@targets;
+}
+
+my $spamtest = new Mail::SpamAssassin ({
'debug' => $opt_debug,
'rules_filename' => $opt_c,
'userprefs_filename' => $user_prefs,
@@ -214,7 +252,6 @@
# generated user_prefs
if ($opt_reuse) {
# copy current prefs if it exists
-
$spamtest->copy_config(undef, \%orig_conf);

# zeroed scores to mass_prefs
@@ -239,13 +276,13 @@
chomp $where;
chomp $when;
chomp $host;
-my $revision = get_current_svn_revision();
+$svn_revision = get_current_svn_revision();
my $cmdline = join(' ',@ORIG_ARGV); $cmdline =~ s/\s+/ /gs;
my $isowhen = strftime("%Y%m%dT%H%M%SZ", gmtime(time)); # better

my $log_header = "# mass-check results from $who\@$where, on $when\n" .
"# M:SA version ".$spamtest->Version()."\n" .
- "# SVN revision: $revision\n" .
+ "# SVN revision: $svn_revision\n" .
"# Date: $isowhen\n" .
"# Perl version: $] on $Config{archname}\n" .
"# Switches: '$cmdline'\n";
@@ -260,60 +297,141 @@
my $showdots_counter = 0;
my $showdots_every = ($opt_showdots ? 1 : 20);

-# Deal with --rewrite
-if (defined $opt_rewrite) {
- my $rewrite = ($opt_rewrite ? $opt_rewrite : "/tmp/out");
- open(REWRITE, "> $rewrite") || die "open of $rewrite failed: $!";
-}
+my $AIopts = {
+ 'opt_all' => $opt_all,
+ };

-# Deal with --before and --after
-foreach my $time ($opt_before, $opt_after) {
- if ($time && $time =~ /^-\d+$/) {
- $time = time + $time;
+if (!$opt_client) {
+ # Deal with --rewrite
+ if (defined $opt_rewrite) {
+ my $rewrite = ($opt_rewrite ? $opt_rewrite : "/tmp/out");
+ open(REWRITE, "> $rewrite") || die "open of $rewrite failed: $!";
}
- elsif ($time && $time !~ /^-?\d+$/) {
- if (HAS_TIME_PARSEDATE) {
- $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1);
- }
- else {
- die "You need Time::ParseDate if you use either the --before or --after option.";
+
+ # Deal with --before and --after
+ foreach my $time ($opt_before, $opt_after) {
+ if ($time && $time =~ /^-\d+$/) {
+ $time = time + $time;
+ }
+ elsif ($time && $time !~ /^-?\d+$/) {
+ if (HAS_TIME_PARSEDATE) {
+ $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1);
+ }
+ else {
+ die "You need Time::ParseDate if you use either the --before or --after option.";
+ }
}
}
+
+ if ($opt_before && $opt_after && $opt_after >= $opt_before) {
+ die "--before ($opt_before) <= --after ($opt_after) -- conflict!";
+ }
+
+ # ArchiveIterator options for non-client mode
+ $AIopts->{'opt_n'} = $opt_n;
+ $AIopts->{'opt_head'} = $opt_head;
+ $AIopts->{'opt_tail'} = $opt_tail;
+ $AIopts->{'opt_cache'} = $opt_cache;
+ $AIopts->{'opt_cachedir'} = $opt_cachedir;
+ $AIopts->{'opt_after'} = $opt_after;
+ $AIopts->{'opt_before'} = $opt_before;
+ $AIopts->{'scan_progress_sub'} = \&showdots_blip;
+}
+else {
+ # ArchiveIterator options for client mode -- tends to be simple
+ $AIopts->{'opt_n'} = 1;
}

-if ($opt_before && $opt_after && $opt_after >= $opt_before) {
- die "--before ($opt_before) <= --after ($opt_after) -- conflict!";
+###########################################################################
+## SCAN MODE
+
+my $iter = new Mail::SpamAssassin::ArchiveIterator($AIopts);
+
+# setup the AI functions
+if ($opt_client) {
+ $iter->set_functions(\&wanted, \&result_client);
+}
+elsif ($opt_server) {
+ $iter->set_functions(\&wanted_server, \&result);
+}
+else {
+ $iter->set_functions(\&wanted, \&result);
}

-my $iter = new Mail::SpamAssassin::ArchiveIterator({
- 'opt_j' => $opt_j,
- 'opt_n' => $opt_n,
- 'opt_all' => $opt_all,
- 'opt_head' => $opt_head,
- 'opt_tail' => $opt_tail,
- 'opt_cache' => $opt_cache,
- 'opt_cachedir' => $opt_cachedir,
- 'opt_after' => $opt_after,
- 'opt_before' => $opt_before,
- 'opt_restart' => $opt_restart,
- 'scan_progress_sub' => \&scan_progress_cb
-});
+my $messages;
+
+# normal mode as well as a server do scan mode and get a temp file
+if (!$opt_client) {
+ status('starting scan stage') if ($opt_progress);
+
+ # Make a temp file and delete it
+ my $tmpf;
+ ($tmpf, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile();
+ die 'mass-check: failed to create temp file' unless $tmpf;
+ unlink $tmpf or die "mass-check: unlink '$tmpf': $!";
+
+ # having opt_j or server mode means do scan in a separate process
+ if ($opt_server || $opt_j) {
+ if ($tmpf = fork()) {
+ # parent
+ waitpid($tmpf, 0);
+ }
+ elsif (defined $tmpf) {
+ # child -- process using message_array
+ my($num, $messages) = $iter->message_array(\@targets);
+
+ # Dump out the number of messages and the message index info to
+ # the temp file
+ send_line($tmpfd, $num, @{$messages});
+
+ exit;
+ }
+ else {
+ die "mass-check: cannot fork: $!";
+ }
+ }
+ else {
+ # we get here if opt_j == 0, so scan in this process
+ my($num, $messages) = $iter->message_array(\@targets);
+
+ # Dump out the number of messages and the message index info to
+ # the temp file
+ send_line($tmpfd, $num, @{$messages});
+ }
+
+ # we now have a temporary file with the messages to process
+ seek($tmpfd, 0, 0);
+ # the first line is the number of messages
+ $total_messages = read_line($tmpfd);

-if ($opt_progress) {
- status('starting scan stage');
+ if (!$total_messages) {
+ die "mass-check: no messages to process\n";
+ }
+
+ status("completed scan stage, $total_messages messages") if ($opt_progress);
}

-sub scan_progress_cb {
- showdots_blip();
+###########################################################################
+## RUN MODE
+
+if ($opt_client) {
+ client_mode();
}
+else {
+ status('starting run stage') if ($opt_progress);

-$iter->set_functions(\&wanted, \&result);
-$iter->run(@targets);
+ if ($opt_server) {
+ server_mode();
+ }
+ else {
+ run_through_messages();
+ }

-if ($opt_progress) {
- status('completed run stage');
+ status('completed run stage') if ($opt_progress);
}

+# Even though we're about to exit, let's clean up after ourselves
+close($tmpfd) if ($tmpfd);
showdots_finish();

if (defined $opt_rewrite) {
@@ -341,25 +459,23 @@
###########################################################################

sub init_results {
+ $init_results = 1;
+
showdots_finish();

# now, showdots only happens if --showdots was used
$showdots_active = $opt_showdots;

if ($opt_progress) {
- # make it a local variable for now
- $total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;
-
# round up since 100% will be caught at end already
$statusevery = int($total_messages / $updates + 1);

# if $messages < $updates, just give a status line per msg.
$statusevery ||= 1;
-
- status("completed scan stage, $total_messages messages");
- status('starting run stage');
}

+ return if $opt_client;
+
if ($opt_o) {
autoflush STDOUT 1;
print STDOUT $log_header;
@@ -372,14 +488,13 @@
print HAM $log_header;
print SPAM $log_header;
}
- $init_results = 1;
}

sub result {
my ($class, $result, $time) = @_;

# don't open results files until we get here to avoid overwriting files
- &init_results if !$init_results;
+ init_results() if !$init_results;

if ($class eq "s") {
if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }
@@ -391,7 +506,6 @@
}

$total_count++;
-#warn ">> result: $total_count $class $time\n";

if ($opt_progress) {
progress($time);
@@ -400,10 +514,32 @@

sub wanted {
my ($class, $id, $time, $dataref, $format) = @_;
- my $out;
+ my $out = '';
+
+ # if origid is defined, it'll be the message number from server mode
+ my $origid;
+
+ # client mode is a little crazy because we need to kluge around the fact
+ # that the information needed to do the run is different than the
+ # information that goes into the results.
+ if ($opt_client) {
+ if ($opt_cs_paths_only) {
+ # the server message number
+ $origid = $real{$id};
+ }
+ else {
+ # if we're a non-paths_only client, change the format and id to the real
+ # version, make sure to remember the server's message number
+ $origid=$id;
+ $origid =~ s/^.+?(\d+)$/$1/;
+ $format = $real{$id}->[2];
+ $id = $real{$id}->[3];
+ }
+ }

memory_track_start() if ($opt_logmem);

+ # parse the message, and force it to complete
my $ma = $spamtest->parse($dataref, 1);

# remove SpamAssassin markup, if present and the mail was spam
@@ -488,7 +624,7 @@
if (defined $spam) {
my $result = ($spam ? "spam" : "ham");
my $status = $spamtest->learn($ma, undef, $spam, 0);
- $learned = $status->did_learn();
+ my $learned = $status->did_learn();
$result = "undef" if !defined $learned;
push(@extra, "learn=".$result);
}
@@ -532,6 +668,10 @@
push(@extra, "reuse=no");
}

+ if ($opt_client) {
+ push(@extra, "host=$where");
+ }
+
my $yorn;
my $score;
my $tests;
@@ -569,6 +709,12 @@

$id =~ s/\s/_/g;

+ # if we have an origid set, it'll be the server mode's message number, so
+ # attach it to our result appropriately.
+ if (defined $origid) {
+ $out = "$origid ";
+ }
+
$out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra);

if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
@@ -595,7 +741,15 @@
undef $ma; # clean 'em up
undef $status;

+ # uncomment these lines to get a Data::Dumper dump of the Mail::SpamAssassin
+ # module, written to a file after each message is scanned. This is a
+ # great way to find memory leaks...
+ ## use Data::Dumper;
+ ## open (D, ">dump.$$.$total_count"); print D Dumper($spamtest); close D;
+ ## warn "wrote memory dump: dump.$$.$total_count";
+
showdots_blip();
+# print ">>>> out = $out\n";
return $out;
}

@@ -717,7 +871,7 @@
}
}
}
- $str;
+ return $str;
}

sub get_current_svn_revision {
@@ -756,4 +910,930 @@
}

return $revision || "unknown";
+}
+
+############################################################################
+
+## children processors, start and process, used when opt_j > 1
+
+sub start_children {
+ my ($count, $child, $pid, $socket) = @_;
+
+ my $io = IO::Socket->new();
+ my $parent;
+
+ # create children
+ for (my $i = 0; $i < $count; $i++) {
+ ($child->[$i],$parent) = $io->socketpair(AF_UNIX,SOCK_STREAM,PF_UNSPEC)
+ or die "mass-check: socketpair failed: $!";
+ if ($pid->[$i] = fork) {
+ close $parent;
+
+ # disable caching for parent<->child relations
+ my ($old) = select($child->[$i]);
+ $|++;
+ select($old);
+
+ $socket->add($child->[$i]);
+ aidbg "mass-check: starting new child $i (pid ".$pid->[$i].")\n";
+ next;
+ }
+ elsif (defined $pid->[$i]) {
+ my $result;
+ my $line;
+
+ close $tmpfd if defined $tmpfd;
+
+ close $child->[$i];
+ select($parent);
+ $| = 1; # print to parent by default, turn off buffering
+ send_line($parent,"START");
+ while ($line = read_line($parent)) {
+ if ($line eq "exit") {
+ close $parent;
+ exit;
+ }
+
+ my($class, $format, $date, $where, $result) = $iter->run_message($line);
+ $result ||= '';
+
+ # If determine_receive_date is not set, the original input date
+ # wasn't calculated, but run_message would have done so, so reset
+ # the packed version if possible ... use defined for date since
+ # it could == 0.
+ if (!$iter->{determine_receive_date} && $class && $format && defined $date && $where) {
+ $line = Mail::SpamAssassin::ArchiveIterator::index_pack($date, $class, $format, $where);
+ }
+
+ send_line($parent,"$result\0RESULT $line");
+ }
+ exit;
+ }
+ else {
+ die "mass-check: cannot fork: $!";
+ }
+ }
+}
+
+## handling killing off the children
+
+sub reap_children {
+ my ($count, $socket, $pid) = @_;
+
+ # If the child died, sending it the exit will generate a SIGPIPE, but we
+ # don't really care since the readline will go undef (which is fine),
+ # then we do the waitpid which will finish it off. So we end up in the
+ # right state, in theory.
+ local $SIG{'PIPE'} = 'IGNORE';
+
+ for (my $i = 0; $i < $count; $i++) {
+ aidbg "mass-check: killing child $i (pid ",$pid->[$i],")\n";
+ send_line($socket->[$i],"exit"); # tell the child to die.
+ close $socket->[$i];
+ waitpid($pid->[$i], 0); # wait for the signal ...
+ }
+}
+
+# in server mode, this gets called to read in the HTTP request from a given
+# socket, then return the information the client sent to us.
+sub handle_http_request {
+ my $socket = shift;
+
+ my $headers = {};
+ my $postdata = {};
+
+ # read in the request (POST / HTTP/1.0)
+ my $line = $socket->getline();
+ $line ||= '';
+ $line =~ s/\r\n$//;
+
+ my ($type, $URI, $VERS) = $line =~ /^([a-zA-Z]+)\s+(\S+)(?:\s*(\S+))/;
+ unless ($type && $URI && $VERS) {
+ $type ||= '';
+ $URI ||= '';
+
+ return ($type, $URI, $headers, $postdata);
+ }
+
+ $type = uc $type;
+
+ # read in headers, "key: value" up to a blank line
+ do {
+ $line = $socket->getline();
+ last unless defined $line;
+ $line =~ s/\r\n$//;
+
+ if ($line) {
+ my ($k,$v) = split(/:\s*/, $line, 2);
+ $headers->{lc $k} = $v;
+ }
+ } while ($line !~ /^$/);
+
+ # if this is a POST request w/ content-length, there'll be a payload, deal
+ # with it.
+ if ($type eq 'POST' && $headers->{'content-length'}) {
+ my $pd;
+ $socket->read($pd, $headers->{'content-length'});
+ $pd =~ s/[\r\n]+$//; # a hack for manual requests/telnet/etc
+
+ # key1=value1&key2=value2...
+ %{$postdata} = map {
+ my($k,$v) = split(/=/, $_, 2);
+
+ # we need to decode the key and value
+ $k =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg;
+ $v =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg;
+
+ $k => $v;
+ } split(/\&/, $pd);
+ }
+
+ return($type, $URI, $headers, $postdata);
+}
+
+# in server mode, generate a gzip compressed data stream with the messages and
+# return the path to the compressed file which the server will read and pass
+# to the client.
+#
+# Input:
+# - Number of messages to generate (scalar)
+# - Hash of Arrays of outstanding requests (reference to hash of array refs)
+# timestamp# -> [ num1, num2, ... ]
+# Used to quickly find outstanding/timed out messages to send to client.
+# - Hash of outstanding messages and associated data (ref to hash of hash refs)
+# num1 -> { data => 'binary data from scan mode', timestamp => timestamp# }
+# Used later on to specify the timestamp entry to remove the entry from.
+# - Paths only? If true, include only the message number and index string in
+# the gzip file. Otherwise, also include the message content. Useful if the
+# client has the corpus available via the same paths as originally specified.
+#
+# Returns: scalar path to gzip file
+#
+sub generate_messages {
+ my($msgs, $timestamps, $msgsout, $paths_only) = @_;
+
+ # Hold the message numbers we'll be sending out
+ my @tosend = ();
+
+ # Find out if any of the messages we sent out before need to be sent out
+ # again because we haven't seen a response within the timeout.
+ my $tooold = time - $opt_cs_timeout;
+ foreach (sort { $a <=> $b } keys %{$timestamps}) {
+ # since we're going in numeric order, if the current entry is newer than
+ # the timeout value, the rest will be too, so stop looking.
+ last if ($_ > $tooold);
+
+ # how many messages do we still need to fulfill the request?
+ my $wanted = $msgs - @tosend;
+
+ if (@{$timestamps->{$_}} > $wanted) {
+ # there are more entries in the timestamp list than we want, so just
+ # grab that many off the list.
+ push(@tosend, splice @{$timestamps->{$_}}, 0, $wanted);
+ }
+ else {
+ # there are just enough, or not enough entries on the timestamp list to
+ # satisfy our request, so take them all and we'll loop around.
+ push(@tosend, @{$timestamps->{$_}});
+ delete $timestamps->{$_};
+ }
+
+ # Ok, we have enough messages so we can stop now.
+ last if (@tosend == $msgs);
+ }
+
+ # if we still have the temp file with the input messages open, we'll fill up
+ # our message output queue with messages from there.
+ if ($tmpfd) {
+ while (@tosend < $msgs) {
+ my $msg = read_line($tmpfd);
+
+ # no more messages from the temp file, close it out
+ unless ($msg) {
+ delete $msgsout->{'curnum'};
+ close $tmpfd;
+ undef $tmpfd;
+ last;
+ }
+
+ # we got a result, so assign it a number (curnum) and store the data
+ # appropriately, then add the new number to the queue.
+ my $num = $msgsout->{'curnum'}++;
+ $msgsout->{$num}->{'data'} = $msg;
+ push(@tosend, $num);
+ }
+ }
+
+ # ok, at this point, @tosend ought to have a list of numbers, pointers into
+ # %{$msgsout}. turn that into a gzip file.
+ return '' unless @tosend;
+
+ my($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile();
+ die "Can't make tempfile, exiting" unless $gzpath;
+ close($gzfd);
+
+ $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!";
+
+ # first line is the number of messages included in the file
+ send_line($gzfd, scalar @tosend) || die "mass-check: error when writing to gz temp file\n";
+
+ # Generate an archive in the temp file
+ foreach my $num (@tosend) {
+ # Archive format, gzip compressed file w/ 3 parts per message:
+ # 1- server message number in text format
+ # 2- server index string, binary packed format
+ # 3- message content -- unless paths_only
+ send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";
+
+ my $data = $msgsout->{$num}->{'data'};
+ send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";
+
+ if (!$paths_only) {
+ my $msg = ($iter->run_message($data))[4];
+ send_line($gzfd, join('', @{$msg})) ||
+ die "mass-check: error when writing to gz temp file\n";
+ }
+ }
+
+ $gzfd->close;
+
+ # update timestamp entries
+ my $ts = time;
+ foreach (@tosend) {
+ $msgsout->{$_}->{'timestamp'} = $ts;
+ }
+
+ # conveniently, this list should be the only thing sent out w/ this
+ # timestamp, so just set the reference appropriately. :)
+ $timestamps->{$ts} = \@tosend;
+
+ if ($opt_noisy) {
+ print "generated ".scalar(@tosend)." messages\n";
+ }
+
+ return $gzpath;
+}
+
+# we've gotten results posted, so clean up msgsout and timestamp hashes and
+# process result...
+sub handle_post_results {
+ my($postdata, $timestamps, $msgsout) = @_;
+
+ # local version to batch the removals
+ my %timestamps = ();
+
+ # $msgsout->{num}->{data|timestamp}
+ # $timestamp{num} = [ msgout_nums ... ]
+ # $postdata{num} = result_string
+
+ while( my($k,$v) = each %{$postdata} ) {
+ # message run results will be \d+ => log entry
+ next if ($k !~ /^\d+$/);
+
+ # if we've been waiting for this result, process it, otherwise throw it on
+ # the ground. multiple clients could have been given the same messages to
+ # process, and we take whatever the first responder sends us.
+ if (exists $msgsout->{$k}) {
+ # the result_sub will need parts of the message data, so get it ready
+ my @d = Mail::SpamAssassin::ArchiveIterator::index_unpack($msgsout->{$k}->{'data'});
+
+ # go ahead and do the result
+ &{$iter->{result_sub}}($d[1], $v, $d[0]);
+
+ # prep to get rid of the cached entries
+ $timestamps{$msgsout->{$k}->{'timestamp'}}->{$k} = 1;
+ delete $msgsout->{$k};
+ }
+ }
+
+ # if we got any results, clean out the results from the timestamp arrays
+ while ( my($k,$v) = each %timestamps ) {
+ # trim out the result list from the timestamp sent list
+ my @temp = grep(!exists $v->{$_}, @{$timestamps->{$k}});
+
+ # if there are results left for a specific timestamp, update the array
+ # pointer. otherwise, delete the timestamp entry since it's empty.
+ if (@temp) {
+ $timestamps->{$k} = \@temp;
+ }
+ else {
+ delete $timestamps->{$k};
+ }
+ }
+}
+
+# This function reads from $tmpfd and processes the message as appropriate wrt
+# $opt_j, $opt_restart, etc.
+#
+# Two strategies are visible below:
+#  - $opt_j <= 1 and $opt_restart undefined: every queued message is run in
+#    this process via $iter->run_message().
+#  - otherwise: messages are handed out one at a time to children over
+#    sockets, framed with send_line/read_line.  A child sends "START" to ask
+#    for its first message and "<result>\0RESULT <packed index>" after each
+#    message it has processed.
+# In both cases a true result is passed on to $iter->{result_sub}.
+#
+sub run_through_messages {
+ # do everything in one process
+ if ($opt_j <= 1 && !defined $opt_restart) {
+ my $message;
+ # NOTE(review): $messages is declared but never used in this branch
+ my $messages;
+ my $total_count = 0;
+
+ # stop after $total_messages records, or as soon as read_line returns a
+ # false value
+ while (($total_messages > $total_count) && ($message = read_line($tmpfd))) {
+ my($class, undef, $date, undef, $result) = $iter->run_message($message);
+ if ($result) {
+ &{$iter->{result_sub}}($class, $result, $date);
+ }
+ $total_count++;
+ }
+ }
+ # more than one process or one process with restarts
+ else {
+ my $select = IO::Select->new();
+
+ # $total_count counts messages handed out to children, not results received
+ my $total_count = 0;
+ my $needs_restart = 0;
+ my @child = ();
+ my @pid = ();
+ # NOTE(review): $messages is declared but never used in this branch
+ my $messages;
+
+ # start children processes
+ start_children($opt_j, \@child, \@pid, $select);
+
+ # feed children, make them work for it, repeat
+ while ($select->count()) {
+ foreach my $socket ($select->can_read()) {
+ my $line = read_line($socket);
+
+ # some error happened during the read!
+ if (!defined $line) {
+ $needs_restart = 1;
+ warn "mass-check: readline failed, attempting to recover\n";
+ $select->remove($socket);
+ }
+ # a finished message: $1 is the result text, $2 the packed index string
+ elsif ($line =~ /^([^\0]*)\0RESULT (.+)$/s) {
+ my $result = $1;
+ my ($date,$class,$type) = Mail::SpamAssassin::ArchiveIterator::index_unpack($2);
+ aidbg "mass-check: $class, $type, $date\n";
+
+ # whenever the dispatched count is a multiple of $opt_restart, flag
+ # that the children should be restarted
+ if (defined $opt_restart && ($total_count % $opt_restart) == 0) {
+ $needs_restart = 1;
+ }
+
+ # if messages remain, and we don't need to restart, send message
+ if (($total_messages > $total_count) && !$needs_restart) {
+ send_line($socket, read_line($tmpfd));
+ $total_count++;
+ aidbg "mass-check: $total_messages $total_count\n";
+ }
+ else {
+ # stop listening on this child since we're done with it
+ aidbg "mass-check: $needs_restart $total_messages $total_count\n";
+ $select->remove($socket);
+ }
+
+ # deal with the result we received
+ if ($result) {
+ &{$iter->{result_sub}}($class, $result, $date);
+ }
+ }
+ # a freshly started child asking for its first message
+ elsif ($line eq "START") {
+ if ($total_messages > $total_count) {
+ # we still have messages, send one to child
+ send_line($socket, read_line($tmpfd));
+ $total_count++;
+ aidbg "mass-check: $total_messages $total_count\n";
+ }
+ else {
+ # no more messages, so stop listening on this child
+ aidbg "mass-check: $needs_restart $total_messages $total_count\n";
+ $select->remove($socket);
+ }
+ }
+ # anything else is treated like a read error: drop the child and restart
+ else {
+ $needs_restart = 1;
+ warn "mass-check: bad line from readline: $line\n";
+ $select->remove($socket);
+ }
+ }
+
+ aidbg "mass-check: out of loop, $total_messages $total_count $needs_restart ".$select->count()."\n";
+
+ # If there are still messages to process, and we need to restart
+ # the children, and all of the children are idle, let's go ahead.
+ # ("idle" here means every socket has been removed from $select.)
+ if ($needs_restart && $select->count == 0 && $total_messages > $total_count) {
+ $needs_restart = 0;
+
+ aidbg "mass-check: needs restart, $total_messages total, $total_count done\n";
+ reap_children($opt_j, \@child, \@pid);
+ @child=();
+ @pid=();
+ start_children($opt_j, \@child, \@pid, $select);
+ }
+ }
+
+ # reap children
+ reap_children($opt_j, \@child, \@pid);
+ }
+}
+
+# Write a complete HTTP/1.0 response to $socket.
+#
+# $result  - status text placed after "HTTP/1.0 ", e.g. "200 OK"
+# $headers - hash ref of extra header name => value pairs; they are written
+#            after the fixed Pragma and Server headers, in hash order
+# $data    - response body, written after the blank line ending the headers
+sub http_response {
+  my($socket, $result, $headers, $data) = @_;
+
+  # status line plus the headers every response carries
+  my @head = (
+    "HTTP/1.0 $result\r\n",
+    "Pragma: no-cache\r\n",
+    "Server: mass-check/$svn_revision\r\n",
+  );
+
+  # caller-supplied headers
+  foreach my $name (keys %{$headers}) {
+    push(@head, "$name: ".$headers->{$name}."\r\n");
+  }
+
+  print $socket @head;
+  print $socket "\r\n";
+  print $socket $data;
+}
+
+# Client side of the HTTP exchange: send one request on $socket and parse the
+# server's reply.
+#
+# $socket  - connected handle to the server
+# $type    - request method, e.g. 'POST'
+# $uri     - request URI
+# $headers - hash ref of extra request header name => value pairs
+# $data    - request body
+#
+# Returns:
+#   nothing (undef / empty list) - the status line was missing, malformed, or
+#                                  carried a code other than 200
+#   'finished'                   - the reply had a true "Finished" header
+#   ''                           - 200 reply without a true Content-Length
+#   a path                       - temp file holding Content-Length bytes of
+#                                  the reply body; the caller removes it
+# Dies if the temp file cannot be created or the body cannot be read.
+# The socket is only closed explicitly on the last two return paths.
+sub http_make_request {
+  my($socket, $type, $uri, $headers, $data) = @_;
+
+  print $socket
+    "$type $uri HTTP/1.0\r\n",
+    "User-Agent: mass-check/$svn_revision\r\n",
+    map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers};
+  print $socket "\r\n";
+  print $socket $data;
+
+  # parse the response that the server sends us.  the status code may be
+  # missing or garbage if the server hung up, so validate before comparing.
+  my $line = $socket->getline() || '';
+  my(undef, $code, $string) = split(/\s+/, $line, 3);
+  return unless (defined $code && $code =~ /^\d+$/ && $code == 200);
+
+  # read headers up to the blank line.  this has to be a real while loop:
+  # "last" does not work inside a do {} while block, and would instead try to
+  # leave whatever loop the caller happens to be running.
+  my %headers = ();
+  while (defined($line = $socket->getline())) {
+    $line =~ s/\r\n$//;
+    last if ($line =~ /^$/);
+
+    my($k,$v) = split(/:\s*/, $line, 2);
+    $headers{lc $k} = $v;
+  }
+
+  # the server has sent us notification that it's going to exit, so let's
+  # follow suit.
+  return 'finished' if ($headers{'finished'});
+
+  my $gzpath = '';
+  if ($headers{'content-length'}) {
+    my $gzfd;
+    ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile();
+    die "Can't make tempfile, exiting" unless $gzpath;
+
+    # copy the body into the temp file
+    my $rd;
+    $socket->read($rd, $headers{'content-length'}) || die "mass-check: error reading in data from server\n";
+    print $gzfd $rd;
+    close $gzfd;
+  }
+
+  $socket->close();
+  return $gzpath;
+}
+
+# Percent-encode a string for use in POST data.
+# Deliberately conservative: everything except letters, digits and the
+# characters _ , . / \ - becomes %xx (two lowercase hex digits).  Spaces are
+# encoded as %20 rather than "+", so nothing has to undo that later.
+sub post_encode {
+  my($encoded) = @_;
+  $encoded =~ s{([^a-zA-Z0-9_,.\/\\-])}{ sprintf("%%%02x", unpack("C", $1)) }egx;
+  return $encoded;
+}
+
+# Delete every plain file directly inside $dir (no recursion; anything that
+# is not a plain file is left alone).
+# Returns 1 on success.  If the directory cannot be opened, or a file cannot
+# be removed, a warning is printed and nothing (undef / empty list) is
+# returned; removal stops at the first file that fails.
+sub clean_dir {
+  my($dir) = @_;
+
+  if (!opendir(DIR, $dir)) {
+    warn "error: can't opendir $dir: $!\n";
+    return;
+  }
+
+  my $failed = 0;
+  while(my $entry = readdir(DIR)) {
+    $entry =~ /^(.+)$/;   # untaint
+    my $name = $1;
+
+    my $path = File::Spec->catfile($dir, $name);
+    next unless (-f $path);
+    next if (unlink $path);
+
+    # first failure aborts the sweep
+    warn "error: can't remove file $path: $!\n";
+    $failed = 1;
+    last;
+  }
+  closedir(DIR);
+
+  return if $failed;
+  return 1;
+}
+
+############################################################################
+
+# record framing used by read_line/send_line: a four-byte length in VAX
+# (little-endian) byte order -- pack/unpack "V" -- followed by that many
+# bytes, which are the actual message
+
+# Read one length-prefixed record from $fd: a 4-byte little-endian length
+# (unpack "V") followed by that many bytes of message.
+# Returns the message, or nothing (undef / empty list) on EOF, on a read
+# error, on a zero length, or when the stream ends in the middle of a record.
+sub read_line {
+  my $fd = shift;
+  my($length,$msg);
+
+  # read in the 4 byte length and unpack.  anything other than exactly four
+  # bytes means EOF, an error, or a truncated prefix.
+  my $got = $fd->read($length, 4);
+  return unless ($got && $got == 4);
+
+  $length = unpack("V", $length);
+  return unless $length;
+
+  # read in the rest of the single message; a short read would hand the
+  # caller a truncated record, so treat it as a failure.
+  $got = $fd->read($msg, $length);
+  return unless ($got && $got == $length);
+
+  return $msg;
+}
+
+# Write each argument after $fd to $fd as one length-prefixed record: a
+# 4-byte little-endian length (pack "V") followed by the data itself.
+# Returns 1 once everything has been written, or 0 as soon as a print fails.
+sub send_line {
+  my($fd, @records) = @_;
+
+  for my $record (@records) {
+    my $framed = pack("V", length $record) . $record;
+    return 0 unless $fd->print($framed);
+  }
+
+  return 1;
+}
+
+############################################################################
+
+# this is the function that implements server mode. basically, sit and wait
+# for connections to come in. when a client sends in a request, deal with any
+# results that the client sent, then generate a response and send it back,
+# and then go back to waiting. lather, rinse, repeat.
+#
+# Listens on $opt_server.  The loop ends once the listening socket has been
+# dropped, which happens when no messages are outstanding in $msgsout and
+# $tmpfd is undefined, and no client connections remain in the select set.
+sub server_mode {
+ # defaults for options that were not given
+ # NOTE(review): $opt_cs_timeout is defaulted here but not read anywhere else
+ # in this sub -- presumably used by generate_messages; confirm.
+ $opt_cs_max ||= 1000;
+ $opt_cs_timeout ||= 60 * 5;
+
+ my $serv_socket = IO::Socket::INET->new(
+ LocalAddr => $opt_server,
+ Proto => 'tcp',
+ Listen => 5,
+ ReuseAddr => 1,
+ );
+
+ die "Could not create socket: $!\n" unless $serv_socket;
+
+ if ($opt_progress) {
+ status('server ready for connections');
+ }
+
+ # Set up our "what messages have been sent out" hashes
+ # $timestamps: send time => array ref of message numbers sent at that time
+ # $msgsout: message number => { data, timestamp }
+ my $timestamps = {};
+ my $msgsout = { 'curnum' => 0 };
+
+ # Generate an IO::Select object and put the server socket on the queue
+ my $select = IO::Select->new( $serv_socket );
+
+ # We'll keep looping while there's something to pay attention to
+ while ($select->count()) {
+ # Sit and block until there's something for us to read from
+ foreach my $socket ($select->can_read()) {
+ if ($socket == $serv_socket) {
+ # it's the server socket, go ahead and accept the connection and add
+ # it to the queue.
+ # NOTE(review): the result of accept is added without being checked
+ $select->add($serv_socket->accept);
+ }
+ else {
+ # it's some client, so deal with the request
+ my($type, $URI, $headers, $postdata) = handle_http_request($socket);
+
+ # we don't do GET, so just send something back
+ if ($type eq 'GET') {
+ if ($opt_noisy) {
+ print "GET request from ".$socket->peerhost."\n";
+ }
+
+ http_response($socket, "200 OK", {
+ 'Content-type' => 'text/plain',
+ },
+ "Your GET request came from IP Address: ".$socket->peerhost."\n");
+ }
+ elsif ($type eq 'POST') {
+ # ooh, POST. deal with any results that the client sent
+ handle_post_results($postdata, $timestamps, $msgsout);
+
+ if ($opt_noisy) {
+ print "POST request from ".$socket->peerhost."\n";
+ }
+
+ # based on the number of messages that the client requested,
+ # generate a gzip file with the appropriate data in it
+ my $messages = '';
+ if ($postdata->{'max_messages'}) {
+ # clamp the request: anything above $opt_cs_max or below 1 becomes
+ # $opt_cs_max
+ my $msgnum = $postdata->{'max_messages'};
+ if ($msgnum > $opt_cs_max || $msgnum < 1) {
+ $msgnum = $opt_cs_max;
+ }
+
+ if ($opt_noisy) {
+ print "client requested ".$postdata->{'max_messages'}." messages\n";
+ }
+
+ $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'});
+ }
+
+ # $messages will contain the path to the gzip file if there are
+ # messages to send out.
+ if ($messages && open(MSG, $messages)) {
+ binmode(MSG);
+ local $/ = undef; # go go slurp mode
+
+ # send the response
+ http_response($socket, "200 OK", {
+ 'Content-Type' => 'application/x-gzip',
+ 'Content-Encoding' => 'x-gzip',
+ "Content-Length" => (-s $messages),
+ },
+ scalar <MSG>);
+
+ close(MSG);
+
+ # we don't need the file anymore, so get rid of it
+ unlink $messages;
+ }
+ elsif (!keys %{$msgsout} && !defined $tmpfd) {
+ # we have no more outstanding messages and our original queue of
+ # messages to process is empty, so tell the client to exit.
+ # NOTE(review): $msgsout is created above with a 'curnum' key, so the
+ # keys test can only pass if that key is removed elsewhere -- confirm.
+ http_response($socket, "200 OK", {
+ "Content-type" => "text/plain",
+ "Finished" => 1,
+ },
+ 'We are all done');
+ }
+ else {
+ # when in doubt, treat this like a GET
+ http_response($socket, "200 OK", {
+ "Content-type" => "text/plain",
+ },
+ "Your POST request (sans max_messages) came from IP Address: ".$socket->peerhost."\n");
+ }
+ }
+ else {
+ # for error, "501 Not Implemented"
+ http_response($socket, '501 Not Implemented', {}, '');
+ }
+
+ # ok, we don't do keepalive, so get rid of the socket
+ $select->remove($socket);
+ $socket->close;
+ }
+ }
+
+ if ($opt_noisy) {
+ print scalar(keys %{$msgsout})." messages outstanding\n";
+ }
+
+
+#print "msgs waiting: ".join(" ", keys %{$msgsout})."\n";
+#print "tmpfd defined? ".(defined $tmpfd ? "yes" : "no")."\n";
+
+ # we're not awaiting responses and we've exhausted the input file, so
+ # drop the server socket. :)
+ $select->remove($serv_socket) if (!keys %{$msgsout} && !defined $tmpfd);
+ }
+}
+
+# this is the function that implements client mode. generally, in a loop:
+# make a request of the server for some max number of messages, and send our
+# results back at the same time. based on the results of that request, put
+# messages into a temp dir and process them. prep the results and loop.
+# lather, rinse, repeat.
+#
+# The server is taken from $opt_client, either "http://host[:port][/uri]" or
+# a bare "host:port" / ":port" (":port" means localhost).  The loop ends when
+# the connection fails, the request returns an error, or the server reports
+# that it is finished.
+#
+# NOTE(review): %postdata, %real, $tmpfd, $total_messages and the counters
+# reset below are not declared in this sub -- presumably file-level variables
+# shared with result_client and run_through_messages; confirm.
+sub client_mode {
+ # defaults for options that were not given
+ $opt_cs_max ||= 1000;
+ $opt_cs_timeout ||= 60 * 2;
+
+ my($host, $uri);
+
+ if ($opt_client =~ /^http:\/\/([^\/]+)(\/.*)?/) {
+ ($host, $uri) = ($1,$2);
+ }
+ else {
+ $host = $opt_client;
+ if ($host =~ /^:/) {
+ $host = 'localhost'.$host;
+ }
+ }
+ # the Host: header gets the host name without the port
+ # NOTE(review): $host is split here before the emptiness check just below
+ my($http_host) = split(/:/, $host);
+
+ die "No host found in opt_client" unless $host;
+ $uri ||= "/";
+
+ # use this to track how many messages we ought to be requesting
+ my $msgnum = $opt_cs_max;
+
+ my $tmpdir;
+
+ # if we're not doing paths_only, create a temp dir where we'll put the
+ # incoming messages to process.
+ if (!$opt_cs_paths_only) {
+ $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir();
+ die "Can't create tempdir" unless $tmpdir;
+ }
+
+ # keep going until something stops us.
+ while (1) {
+ # if the number of messages to request is too much, bring it down
+ $msgnum = $opt_cs_max if ($msgnum > $opt_cs_max);
+
+ # prep the POST request
+ # (%postdata may already hold results stored by result_client during the
+ # previous run; they are sent along with this request)
+ $postdata{'max_messages'} = $msgnum;
+ $postdata{'paths_only'} = 1 if ($opt_cs_paths_only);
+
+ # the actual POST data string
+ my $POSTDATA = join('&', map { post_encode($_) . '=' . post_encode($postdata{$_}) } keys %postdata);
+
+ # connect to server
+ my $socket = IO::Socket::INET->new($host);
+
+ # last if connection fails
+ last unless ($socket);
+
+ print "Requesting $msgnum messages from server\n" if ($opt_noisy);
+
+ # make request, include and then drop results if there are any
+ my $result = http_make_request($socket, 'POST', $uri, {
+ 'Host' => $http_host,
+ 'Content-Type' => 'application/x-www-form-urlencoded',
+ 'Content-Length' => length($POSTDATA),
+ },
+ $POSTDATA
+ );
+ %postdata = ();
+ undef $POSTDATA;
+
+ # If we received messages to run through, go ahead and do it.
+ # otherwise, just sleep for the timeout length and try again
+ if (!defined $result) {
+ # we got an error?!? abort!
+ last;
+ }
+ elsif ($result eq 'finished') {
+ # the server said that we're done
+ print "Server states that there is no more work, exiting.\n" if ($opt_noisy);
+ last;
+ }
+ elsif ($result eq '') {
+ # no messages means the server may give us more work down the road.
+ # sleep for client_timeout seconds and try the request again
+ print "Received no messages from server, waiting $opt_cs_timeout seconds\n" if ($opt_noisy);
+ sleep $opt_cs_timeout;
+ }
+ else {
+ # we got messages, so deal with them.
+ # here $result is the path of the gzip temp file holding the reply body
+ my $time_start = time;
+
+ # postdata will hold our results, real will hold the original message
+ # data from the server's scan mode.
+ %postdata = ();
+ %real = ();
+ $init_results = $total_count = $spam_count = $ham_count = 0;
+
+ # we got a result, so do things with it!
+ my $gzfd = IO::Zlib->new($result, "rb");
+ die "Can't open temp result file: $!" unless $gzfd;
+
+ # used for the temp queue file
+ # the name is unlinked right away; the queue is only used through $tmpfd
+ my $tmppath;
+ ($tmppath, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile();
+ die "Can't make tempfile, exiting" unless $tmppath;
+ unlink $tmppath;
+
+ # if we have a temp directory, clean it out for this run
+ clean_dir($tmpdir) if ($tmpdir);
+
+ # Archive format, gzip compressed file w/ 3 parts per message:
+ # 1- server message number in text format
+ # 2- server index string, binary packed format
+ # 3- message content, if not doing paths_only
+
+ # number of messages
+ # (the first record of the archive, ahead of the per-message parts)
+ $msgnum = $total_messages = read_line($gzfd) || die "mass-check: error reading from gzip message file\n";
+
+ status("server gave us $total_messages messages") if ($opt_progress);
+
+ # loop through and prep all of the messages the server sent
+ # (stops early, without an error, if the archive runs out of records)
+ for(my $i = 0 ; $i < $total_messages; $i++ ) {
+ my $num = read_line($gzfd);
+ last unless defined $num;
+
+ my $index = read_line($gzfd);
+ last unless defined $index;
+
+ # if we're doing paths_only, there'll be no message content
+ if (!$opt_cs_paths_only) {
+ my $msg = read_line($gzfd);
+ last unless defined $msg;
+
+ # it's going to be a dir of file formatted messages
+ # NOTE(review): $num comes straight from the server's archive and is
+ # interpolated into the path unchecked -- confirm it is always numeric.
+ if (open(OUT, ">$tmpdir/$num")) {
+ print OUT $msg;
+ close(OUT);
+
+ # this is a little tricky -- we need to process the files in the
+ # path and format we've created, but the original data is needed
+ # to create a proper result later, so deal with that here.
+ my @d = Mail::SpamAssassin::ArchiveIterator::index_unpack($index);
+ $real{"$tmpdir/$num"} = \@d;
+ send_line($tmpfd,
+ Mail::SpamAssassin::ArchiveIterator::index_pack($d[0], $d[1], 'f', "$tmpdir/$num")) ||
+ die "mass-check: error writing out temp file in client mode\n";
+ }
+ else {
+ warn "Can't create/write $tmpdir/$num: $!";
+ }
+ }
+ else {
+ # in paths_only mode, there's no kluging between formats since we're
+ # reading the same corpus, however we do still need to track server
+ # message number to message data so our results will be useable.
+ my @d = Mail::SpamAssassin::ArchiveIterator::index_unpack($index);
+ $real{$d[3]} = $num;
+ send_line($tmpfd, $index) ||
+ die "mass-check: error writing out temp file in client mode\n";
+ }
+ }
+
+ $gzfd->close;
+ unlink $result;
+
+ if ($opt_progress) {
+ status('starting run stage');
+ }
+
+ # we're about to start running, so go back to the start of the file
+ seek $tmpfd, 0, 0;
+
+ run_through_messages();
+
+ # we're done with the temp file -- bye bye
+ close($tmpfd);
+
+ # figure out new max messages, try keeping ~cs_timeout between runs
+ my $time_end = time;
+
+ # if we only requested a small number of messages, it may take <1s to
+ # run through them, so fake it and say it took 1s.
+ if ($time_end == $time_start) {
+ $time_end++;
+ }
+
+ if ($opt_progress) {
+ status('completed run stage');
+ }
+
+ print "Completed run in ".($time_end-$time_start)." seconds\n" if ($opt_noisy);
+ # scale the next request so that, at the rate just observed, a run takes
+ # about $opt_cs_timeout seconds; never ask for fewer than 1 message
+ $msgnum = int($msgnum * $opt_cs_timeout / ($time_end-$time_start)) || 1;
+ }
+ }
+
+ # if we were using a temp dir, clean it out and then remove it
+ if ($tmpdir) {
+ clean_dir($tmpdir);
+ rmdir $tmpdir;
+ }
+}
+
+############################################################################
+
+# "wanted" callback for server mode: of the arguments
+# ($class, $id, $time, $dataref, $format) only the message data reference is
+# of interest, and it is returned unchanged.
+sub wanted_server {
+  my(undef, undef, undef, $dataref) = @_;
+  return $dataref;
+}
+
+# Result callback for client mode.  Works like result(), except that the
+# result text starts with the server's message number: the number is stripped
+# off and the remaining text is stored in %postdata under that number, ready
+# to be POSTed back to the server.  Also keeps the spam/ham/total counters
+# up to date and, with $opt_progress, reports progress.
+sub result_client {
+  my ($class, $result, $time) = @_;
+
+  # don't open results files until we get here to avoid overwriting files
+  init_results() unless $init_results;
+
+  # per-class tallies; any other class only counts towards the total
+  $spam_count++ if ($class eq "s");
+  $ham_count++ if ($class eq "h");
+  $total_count++;
+
+  progress($time) if ($opt_progress);
+
+  unless ($result =~ s/^(\d+)\s+//m) {
+    warn ">> WTH!? result is not in the correct format: $result\n";
+  }
+  else {
+    $postdata{$1} = $result;
+  }
+}
+
+# pass the arguments on to dbg(), but only when would_log("dbg", "mass-check")
+# returns exactly 2
+sub aidbg {
+  dbg(@_) if (would_log("dbg", "mass-check") == 2);
}

Modified: spamassassin/branches/jm_re2c_hacks/masses/mboxget
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/mboxget?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/mboxget (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/mboxget Fri Oct 6 05:46:56 2006
@@ -9,11 +9,12 @@
# grep SUBJECT_FREQ spam.log | ./mboxget | grep Subject:
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/mk-roc-graphs
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/mk-roc-graphs?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/mk-roc-graphs (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/mk-roc-graphs Fri Oct 6 05:46:56 2006
@@ -7,11 +7,12 @@
# for details on ROC curves.
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/overlap
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/overlap?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/overlap (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/overlap Fri Oct 6 05:46:56 2006
@@ -3,11 +3,12 @@
# overlap - print overlap between test pairs
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#

Modified: spamassassin/branches/jm_re2c_hacks/masses/parse-rules-for-masses
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/parse-rules-for-masses?view=diff&rev=453586&r1=453585&r2=453586
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/parse-rules-for-masses (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/parse-rules-for-masses Fri Oct 6 05:46:56 2006
@@ -1,11 +1,12 @@
#!/usr/bin/perl -w
#
# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#