Mailing List Archive

svn commit: r485513 - /spamassassin/trunk/masses/overlap
Author: duncf
Date: Sun Dec 10 21:43:14 2006
New Revision: 485513

URL: http://svn.apache.org/viewvc?view=rev&rev=485513
Log:
Clean up overlap and document

Modified:
spamassassin/trunk/masses/overlap

Modified: spamassassin/trunk/masses/overlap
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/overlap?view=diff&rev=485513&r1=485512&r2=485513
==============================================================================
--- spamassassin/trunk/masses/overlap (original)
+++ spamassassin/trunk/masses/overlap Sun Dec 10 21:43:14 2006
@@ -1,7 +1,5 @@
#!/usr/bin/perl -w

-# overlap - print overlap between test pairs
-#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
@@ -19,23 +17,41 @@
# limitations under the License.
# </@LICENSE>

-use vars qw($opt_a $opt_h $opt_t);
-use Getopt::Std;
-getopts("aht");
+use strict;
+use warnings;

-my $prog = $0;
-$prog =~ s@.*/@@;
+use vars qw($opt_a $opt_t);
+use Getopt::Long qw(:config auto_help bundling);
+
+GetOptions("a|all" => \$opt_a,
+ "t|ignore" => \$opt_t);
+
+=head1 NAME
+
+overlap - Tool to help determine which tests overlap significantly
+
+=head1 SYNOPSIS
+
+overlap [options] [log file ...]
+
+ Options:
+ -a,--all Show all entries (including reverses of pairs)
+ -t,--ignore Ignore T_ tests (rules under testing)

-sub usage {
- my $status = shift;
+=head1 DESCRIPTION

- my $out = $status ? STDERR : STDOUT;
- print $out <<EOF;
-usage: $prog [options] [mass-check results files]
-
- -a show all entries (normally, reverses of pairs are not shown)
- -h print this help
- -t ignore T_ tests
+B<overlap> will read the mass-check results log specified and output
+pairs of tests and how frequently they occur together in absolute
+terms, and relative to their individual hit rates.
+
+The output is of the form:
+
+ COUNT PAIR/A PAIR/B A,B
+
+where C<COUNT> is the number of times the tests hit on the same
+message, C<PAIR/A> is the ratio of times that both tests hit to the
+number of times test A hits, C<PAIR/B> is the ratio of pair hits to B
+hits, and the C<A,B> column shows the names of the two tests.

Do not abuse this tool. Just because a test highly correlates with
another test does not mean you can simply remove one or merge them
@@ -44,11 +60,11 @@
Some overlap is often good, especially if the tests have different
characteristics.

-EOF
- exit($status);
-}
+=cut

-usage(0) if $opt_h;
+
+my $prog = $0;
+$prog =~ s@.*/@@;

if ($#ARGV < 0) {
push(@ARGV, "-");
@@ -57,13 +73,13 @@
my %solo;
my %pair;

-foreach $file (@ARGV) {
+foreach my $file (@ARGV) {
read_file($file);
}

print "COUNT\tPAIR/A\tPAIR/B\tA,B\n";

-foreach $k (sort { $pair{$b} <=> $pair{$a} } keys %pair) {
+foreach my $k (sort { $pair{$b} <=> $pair{$a} } keys %pair) {
my ($a, $b) = split(/ /, $k);
my $a_pct = $pair{$k} / $solo{$a};
my $b_pct = $pair{$k} / $solo{$b};