Mailing List Archive

Re: svn commit: r1880308 - /spamassassin/trunk/masses/hit-frequencies
Nice!

On Sun, Jul 26, 2020, 01:50 <hege@apache.org> wrote:

> Author: hege
> Date: Sun Jul 26 05:50:00 2020
> New Revision: 1880308
>
> URL: http://svn.apache.org/viewvc?rev=1880308&view=rev
> Log:
> Tweaks to increase speed, cut runtime in half
>
> Modified:
> spamassassin/trunk/masses/hit-frequencies
>
> Modified: spamassassin/trunk/masses/hit-frequencies
> URL:
> http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?rev=1880308&r1=1880307&r2=1880308&view=diff
>
> ==============================================================================
> --- spamassassin/trunk/masses/hit-frequencies (original)
> +++ spamassassin/trunk/masses/hit-frequencies Sun Jul 26 05:50:00 2020
> @@ -805,52 +805,48 @@ sub compute_overlaps_for_rule {
> my %overlaps_ham1r = ();
> my %overlaps_spam1r = ();
>
> - foreach my $r2 (keys %hmap_spam) {
> - next if $r1 eq $r2;
> -
> - # require that both rules have at least 1 hit
> - next unless ($freq_spam{$r1} && $freq_spam{$r2});
> -
> - my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,
> - $hmap_spam{$r2}, $hmap_spam{$r1});
> -
> - if ($a1ina2 > 0)
> - {
> - $overlaps_spam1r{$r2} = $a1ina2;
> -
> - if (exists $overlaps_spam1{$a1ina2})
> - { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> - else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }
> -
> - if (exists $overlaps_spam2{$a2ina1})
> - { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; }
> - else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }
> + if ($freq_spam{$r1}) {
> + foreach my $r2 (keys %hmap_spam) {
> + next if $r1 eq $r2;
> +
> + my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,
> + $hmap_spam{$r2}, $hmap_spam{$r1});
> +
> + if ($a1ina2 > 0)
> + {
> + $overlaps_spam1r{$r2} = $a1ina2;
> +
> + if (exists $overlaps_spam1{$a1ina2})
> + { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> + else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }
> +
> + if (exists $overlaps_spam2{$a2ina1})
> + { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; }
> + else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }
> + }
> }
> -
> }
>
> - foreach my $r2 (keys %hmap_ham) {
> - next if $r1 eq $r2;
> -
> - # require that both rules have at least 1 hit
> - next unless ($freq_ham{$r1} && $freq_ham{$r2});
> -
> - my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,
> - $hmap_ham{$r2}, $hmap_ham{$r1});
> -
> - if ($a1ina2 > 0)
> - {
> - $overlaps_ham1r{$r2} = $a1ina2;
> -
> - if (exists $overlaps_ham1{$a1ina2})
> - { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> - else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }
> -
> - if (exists $overlaps_ham2{$a2ina1})
> - { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }
> - else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }
> + if ($freq_ham{$r1}) {
> + foreach my $r2 (keys %hmap_ham) {
> + next if $r1 eq $r2;
> +
> + my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,
> + $hmap_ham{$r2}, $hmap_ham{$r1});
> +
> + if ($a1ina2 > 0)
> + {
> + $overlaps_ham1r{$r2} = $a1ina2;
> +
> + if (exists $overlaps_ham1{$a1ina2})
> + { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> + else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }
> +
> + if (exists $overlaps_ham2{$a2ina1})
> + { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }
> + else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }
> + }
> }
> -
> }
>
> _print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, "spam",
> \%overlaps_ham1r, "ham");
> @@ -934,25 +930,23 @@ sub _prettify_overlap_rules {
> sub _hmap_to_overlap_ratio {
> my ($r1, $r2, $hmap1, $hmap2) = @_;
>
> - $hmap1 ||= '';
> - $hmap2 ||= '';
> - if ($hmap1 !~ /[^\000]/ || $hmap2 !~ /[^\000]/) {
> - # no hits on either! this would normally give a 100% hitrate match,
> - # but that's misleading -- so hide it by giving it a 0% overlap.
> - #
> - # also, ignore cases where there are no hits on *one* of the rules,
> - # while there are hits on the other -- after all, if one rule doesn't
> - # have a single hit, it cannot overlap.
> - #
> - return (0,0);
> - }
> -
> # my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print vec($hmap2,$i,1); } print "\n";
>
> # count bits in each, so we can show when one is fully subsumed by another
> # with perl's support for bitstring ops, we get C speed here, nice!
> +
> + # no hits on either? this would normally give a 100% hitrate match,
> + # but that's misleading -- so hide it by giving it a 0% overlap.
> + #
> + # also, ignore cases where there are no hits on *one* of the rules,
> + # while there are hits on the other -- after all, if one rule doesn't
> + # have a single hit, it cannot overlap.
> +
> my $a1 = unpack("%32b*", $hmap1);
> + return (0,0) unless $a1;
> my $a2 = unpack("%32b*", $hmap2);
> + return (0,0) unless $a2;
> +
> my $a1_and_a2 = unpack("%32b*", ($hmap1 & $hmap2));
>
> # round rather than truncate
>
>
>