Mailing List Archive

svn commit: rev 36128 - spamassassin/trunk/masses
Author: quinlan
Date: Mon Aug 9 13:19:31 2004
New Revision: 36128

Modified:
spamassassin/trunk/masses/rewrite-cf-with-new-scores
Log:
bug 3627: rewrite to fix various bugs and improve things in general


Modified: spamassassin/trunk/masses/rewrite-cf-with-new-scores
==============================================================================
--- spamassassin/trunk/masses/rewrite-cf-with-new-scores (original)
+++ spamassassin/trunk/masses/rewrite-cf-with-new-scores Mon Aug 9 13:19:31 2004
@@ -20,144 +20,201 @@

my $NUM_SCORESETS = 4;

-my ($scoreset,$oldscores,$newscores) = @ARGV;
+# scores are broken into three regions:
+# 1. "pre" (stuff before generated mutable scores)
+# 2. "gen" (generated mutable scores)
+# 3. "end" (stuff after generated mutable scores)

+# options
+my ($scoreset, $oldscores, $newscores) = @ARGV;
$scoreset = int($scoreset) if defined $scoreset;
if (!defined $newscores || $scoreset < 0 || $scoreset >= $NUM_SCORESETS ) {
die "usage: rewrite-cf-with-new-scores scoreset oldscores.cf newscores.cf\n";
}

-system ("./parse-rules-for-masses -s $scoreset") and die;
-our %rules;
-if (-e "tmp/rules.pl") {
- # note: the spaces need to stay in front of the require to work around
- # a RPM 4.1 problem
- require "./tmp/rules.pl";
-}
-else {
- die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
-}
-
-# now read the generated scores
-my @gascoreorder = ();
-my %gascorelines = ();
-open (STDIN, "<$newscores") or die "cannot open $newscores";
-while (<STDIN>) {
- /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
- my $name = $1;
- my $score = $2;
- next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
- next if ($name =~ /^__/);
- next if ($name eq '(null)'); # er, oops ;)
-
- $gascorelines{$name} = $score;
- push (@gascoreorder, $name);
-}
-
-open (IN, "<$oldscores") or die "cannot open $oldscores";
-my $out = '';
-my $pre = '';
-my %seenscoreforrule = (
- 'AWL' => 1, # dynamic rule, skip it
-);
-
-# read until '# Start of generated scores', removing scores from our
-# new list if we come across them.
-while (<IN>) {
- if (/^\s*score\s+(\S+)\s/) {
- my $name = $1;
- $seenscoreforrule{$name} = 1;
- delete $gascorelines{$name};
- next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
- }
- $pre .= $_;
- /^# Start of generated scores/ and last;
-}
+# variables filled out in read_rules()
+our %rules; # rules data

-# now skip until '# End of generated scores'
-my %oldscores = ();
-while (<IN>) {
- if (/^\s*score\s+\S+/) {
- my($score,$name,@scores) = split;
- @{$oldscores{$name}} = @scores;
- $seenscoreforrule{$name} = 1;
- }
+# variables filled out in read_gascores()
+my %gascores = (); # generated scores

- /^# End of generated scores/ and last;
-}
-if (defined $_) {
- $out .= $_;
+# variables filled out in read_oldscores()
+my $pre = ''; # stuff before "gen" scores
+my $end = ''; # stuff after "gen" scores
+my %oldscores; # old scores
+my %comment; # "gen" rule comments
+my %fixed; # scores that are fixed (non-gen)
+
+# read stuff in
+read_rules();
+read_gascores();
+read_oldscores();
+
+# write stuff out
+print $pre;
+print_gen();
+print $end;
+
+sub read_rules {
+ system ("./parse-rules-for-masses -s $scoreset") and die;
+ if (-e "tmp/rules.pl") {
+ # note: the spaces need to stay in front of the require to work around
+ # an RPM 4.1 problem
+ require "./tmp/rules.pl";
+ }
+ else {
+ die "parse-rules-for-masses had no error but no tmp/rules.pl";
+ }
}

-# and read until EOF, again removing scores from our list as we find 'em.
-while (<IN>) {
- if (/^\s*score\s+\S+/) {
- my($score,$name,@scores) = split;
-
- next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
- if (defined $gascorelines{$name}) {
- # Set appropriate scoreset value
- $scores[$scoreset] = $gascorelines{$name};
+sub read_gascores {
+ open (STDIN, "<$newscores") or die "cannot open $newscores";
+ while (<STDIN>) {
+ next unless /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/;
+ my $name = $1;
+ my $score = $2;

- # Create new score line
- $_ = join(" ","score",$name,generate_scores($name, @scores))."\n";
+ # various things we should be concerned about
+ if (!exists $rules{$name}) {
+ warn "$name is not defined in tmp/rules.pl\n";
+ next;
+ }
+ if ($rules{$name}->{issubrule}) {
+ warn "$name is an indirect sub-rule in tmp/rules.pl\n";
+ next;
}
- delete $gascorelines{$name};
- $seenscoreforrule{$name} = 1;
+ if ($rules{$name} =~ /^__/) {
+ warn "$name has an indirect sub-rule \"__\" prefix\n";
+ next;
+ }
+ if ($name eq '(null)') {
+ warn "$name is (null)\n";
+ next;
+ }
+
+ $gascores{$name} = $score;
}
- $out .= $_;
}
-close IN;

-# and output the lot
-print $pre, "\n";
-foreach my $name (@gascoreorder) {
- $_ = $gascorelines{$name};
- next unless (defined ($_));
+sub read_oldscores {
+ open (IN, "<$oldscores") or die "cannot open $oldscores";

- next if ($rules{$name}->{lang}); # "lang es" rules etc.
- next if ($name eq 'AWL'); # dynamic score
+ # state of things
+ my $where = "pre"; # region of original scores file that we're in

- # Use the old scores if they existed
- my @scores = ();
- @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
+ # read everything in
+ while (my $line = <IN>) {
+ if ($line =~ /<\/gen:mutable>/) {
+ $where = "end";
+ }

- # Set appropriate scoreset value
- $scores[$scoreset] = $_;
- delete $oldscores{$name};
- $seenscoreforrule{$name} = 1;
+ if ($where eq "pre") {
+ readline_fix($line);
+ $pre .= $line;
+ }
+ elsif ($where eq "gen") {
+ readline_gen($line);
+ }
+ elsif ($where eq "end") {
+ readline_fix($line);
+ $end .= $line;
+ }

- # Create new score line
- print join(" ","score",$name,generate_scores($name, @scores)),"\n";
+ if ($line =~ /<gen:mutable>/) {
+ $where = "gen";
+ }
+ }
}

-# output any tests that were in the old scores file, but not in
-# the GA output
-# TODO: currently, I'm assuming that if the GA didn't use it,
-# we must not assign the rule a score.
-foreach my $name (sort keys %oldscores) {
- delete $oldscores{$name};
- $seenscoreforrule{$name} = 1;
- next if ($rules{$name}->{lang}); # "lang es" rules etc.
-
- # my @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
- my @scores = (0, 0, 0, 0);
- print join(" ","score",$name,generate_scores($name, @scores)),"\n";
+# used for both "pre" and "end"
+sub readline_fix {
+ my ($line) = @_;
+
+ my $comment;
+ if ($line =~ s/\s*#\s*(.*)//) {
+ $comment = $1;
+ }
+ if ($line =~ /^\s*score\s+(\S+)\s/) {
+ my (undef, $name, @scores) = split(' ', $line);
+ $fixed{$name}++;
+ $comment{$name} = $comment if $comment;
+ }
}

-# now do the same for what's in tmp/rules.pl
-foreach my $name (sort keys %rules) {
- next if ($seenscoreforrule{$name});
- next if ($rules{$name}->{issubrule});
- next if ($rules{$name}->{lang}); # "lang es" rules etc.
- delete $oldscores{$name};
+sub readline_gen {
+ my ($line) = @_;

- # my @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
- my @scores = (0, 0, 0, 0);
- print join(" ","score",$name,generate_scores($name, @scores)),"\n";
+ my $comment;
+ if ($line =~ s/\s*#\s*(.*)//) {
+ $comment = $1;
+ $comment =~ s/ n=$scoreset//;
+ }
+ if ($line =~ /^\s*score\s+(\S+)\s/) {
+ my (undef, $name, @scores) = split(' ', $line);
+ for (my $i = 1; $i < $NUM_SCORESETS; $i++) {
+ $scores[$i] = $scores[0] unless defined $scores[$i];
+ }
+ @{$oldscores{$name}} = @scores;
+ $comment{$name} = $comment if $comment;
+ }
}

-print "\n", $out;
+sub print_gen {
+ print "\n";
+
+ # we only consider scores for this set that are in the input or were in the
+ # "gen" region of the old scores; tmp/rules.pl is not considered here
+ my %gen; # rules to be printed in "gen" region
+ $gen{$_} = 1 for keys %gascores; # scores for this set from GA
+ $gen{$_} = 1 for keys %oldscores; # original scores in "gen" region
+
+ # remove fixed scores
+ for (keys %fixed) {
+ delete $gen{$_};
+ }
+
+ # sort all generated rules by name
+ for my $name (sort keys %gen) {
+ next if ($rules{$name}->{lang}); # "lang es" rules etc.
+ next if ($rules{$name}->{issubrule}); # indirect sub-rules
+ next if ($name eq 'AWL'); # dynamic score
+
+ my @scores = ();
+ my $comment = '';
+ $comment = $comment{$name} if defined $comment{$name};
+
+ # use the old scores if they existed
+ @scores = @{$oldscores{$name}} if exists $oldscores{$name};
+
+ # set appropriate scoreset value
+ if (defined $gascores{$name}) {
+ $scores[$scoreset] = $gascores{$name};
+ delete $oldscores{$name};
+ }
+ else {
+ # I think these are non-issues
+ if (defined $rules{$name}->{score} && !$rules{$name}->{issubrule}) {
+ #warn "$name has no GA score, but is in tmp/rules.pl\n";
+ }
+ if (defined $oldscores{$name}) {
+ $comment .= " n=$scoreset";
+ #warn "$name has no GA score, but had a score before\n";
+ }
+ }
+
+ # sort and de-duplicate comment tags
+ my %unique;
+ $unique{$_} = 1 for split(' ', $comment);
+ $comment = join(' ', sort keys %unique);
+
+ # create new score line
+ printf("score %s %s%s\n", $name,
+ join(" ", generate_scores($name, @scores)),
+ ($comment) ? ' # ' . $comment : '');
+ }
+
+ print "\n";
+}

sub generate_scores {
my ($name, @scores) = @_;
@@ -169,25 +226,31 @@
$islearn = ($rules{$name}->{tflags} =~ /\blearn\b/);
}

- # Set defaults if not already set
- $scores[0] ||= 0;
+ # set defaults if not already set
+ if (!defined $scores[0]) {
+ warn "$name does not have a default score\n";
+ $scores[0] ||= 0;
+ }
+
+ # zero for current scoreset if there was no new score
+ $scores[$scoreset] = 0 if !$gascores{$name};

my $flag = 1;
- for(my $i=1;$i<$NUM_SCORESETS;$i++) {
+ for (my $i = 1; $i < $NUM_SCORESETS; $i++) {
$scores[$i] = $scores[0] unless defined $scores[$i];
- $flag = 0 if ( $scores[$i] != $scores[$i-1] );
+ $flag = 0 if ($scores[$i] != $scores[$i-1]);
};

# enforce rule/scoreset rules.
# net rules never have a non-zero score in sets 0 and 2
- for(my $i=0;$i<$NUM_SCORESETS;$i++) {
+ for (my $i = 0; $i < $NUM_SCORESETS; $i++) {
if ($isnet && ($i & 1) == 0) {
$scores[$i] = 0;
- $flag = 0 if ( $i > 0 && $scores[$i] != $scores[$i-1] );
+ $flag = 0 if ($i > 0 && $scores[$i] != $scores[$i-1]);
}
if ($islearn && ($i & 2) == 0) {
$scores[$i] = 0;
- $flag = 0 if ( $i > 0 && $scores[$i] != $scores[$i-1] );
+ $flag = 0 if ($i > 0 && $scores[$i] != $scores[$i-1]);
}
}