Mailing List Archive

svn commit: r169334 - in /spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/Conf.pm lib/Mail/SpamAssassin/HTML.pm lib/Mail/SpamAssassin/PerMsgStatus.pm lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm lib/Mail/SpamAssassin/Util.pm rules/20_uri_tests.cf t/uri.t t
Author: sidney
Date: Mon May 9 08:55:06 2005
New Revision: 169334

URL: http://svn.apache.org/viewcvs?rev=169334&view=rev
Log:
bug 4176 see comments in bug report for details

Added:
spamassassin/trunk/t/uri_html.t (with props)
Modified:
spamassassin/trunk/MANIFEST
spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
spamassassin/trunk/rules/20_uri_tests.cf
spamassassin/trunk/t/uri.t

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/MANIFEST?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Mon May 9 08:55:06 2005
@@ -416,6 +416,7 @@
t/stripmarkup.t
t/test_dir
t/uri.t
+t/uri_html.t
t/uri_text.t
t/utf8.t
t/whitelist_addrs.t

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Mon May 9 08:55:06 2005
@@ -1579,6 +1579,40 @@
}
});

+=item redirector_pattern /pattern/
+
+A regex pattern that matches both the redirector site portion, and
+the target site portion of a URI.
+
+Note: The target URI portion must be surrounded in parentheses and
+ no other part of the pattern may create a backreference.
+
+Example: http://chkpt.zdnet.com/chkpt/whatever/spammer.domain/yo/dude
+
+ redirector_pattern /^(?:http://)?chkpt\.zdnet\.com/chkpt/\w+/(.*)$/
+
+=cut
+
+ push (@cmds, {
+ setting => 'redirector_pattern',
+ is_priv => 1,
+ code => sub {
+ my ($self, $key, $value, $line) = @_;
+
+ # strip off the leading and trailing slashes
+ # we only ask for them to be like normal rules
+ $value =~ s/^\/(.*)\/$/$1/;
+
+ if (Mail::SpamAssassin::Conf::Parser->is_regexp_valid("redirector_pattern", $value)) {
+ # since the regexp will never change we might as well qr it
+ $value = qr/$value/i;
+
+ push @{$self->{main}->{conf}->{redirector_patterns}}, $value;
+ dbg("config: adding redirector regex: " . $value);
+ }
+ }
+ });
+
=item header SYMBOLIC_TEST_NAME header op /pattern/modifiers [if-unset: STRING]

Define a test. C<SYMBOLIC_TEST_NAME> is a symbolic test name, such as

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Mon May 9 08:55:06 2005
@@ -134,18 +134,6 @@

# add the canonified version of each uri to the detail list
if (defined $self->{uri}) {
- while(my($uri, $info) = each %{ $self->{uri} }) {
- my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($uri);
- $info->{cleaned} = \@tmp;
- # list out the URLs for debugging ...
- if (would_log('dbg', 'uri')) {
- dbg("uri: html uri found, $uri");
- foreach my $nuri (@tmp) {
- dbg("uri: cleaned html uri, $nuri");
- }
- }
- }
-
@uri = keys %{$self->{uri}};
}


Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Mon May 9 08:55:06 2005
@@ -1864,12 +1864,29 @@
# get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
# which does the metadata stuff ... DO THIS BEFORE LOOKING FOR METADATA!!!
my @uris = $self->get_parsed_uri_list();
+ my $redirector_patterns = $self->{conf}->{redirector_patterns};
+ @uris = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, @uris);

# get URIs from HTML parsing
# use the metadata version as $self->{html} is probably not set yet
if (defined $self->{msg}->{metadata}->{html}->{uri_detail}) {
while(my($uri, $info) = each %{ $self->{msg}->{metadata}->{html}->{uri_detail} }) {
- push(@uris, @{$info->{cleaned}});
+ my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
+ $info->{cleaned} = \@tmp;
+ push(@uris, @tmp);
+ if (would_log('dbg', 'uri')) {
+ dbg("uri: html uri found, $uri");
+ foreach my $nuri (@tmp) {
+ dbg("uri: cleaned html uri, $nuri");
+ }
+ }
+ }
+ }
+
+ # list out the URLs for debugging ...
+ if (would_log('dbg', 'uri')) {
+ foreach my $nuri (@uris) {
+ dbg("uri: parsed uri found: $nuri");
}
}

@@ -1953,20 +1970,10 @@
push @uris, $uri;
}
}
-
- @uris = Mail::SpamAssassin::Util::uri_list_canonify(@uris);
-
# setup the cache and return
$self->{parsed_uri_list} = \@uris;

- # list out the URLs for debugging ...
- if (would_log('dbg', 'uri')) {
- foreach my $nuri (@uris) {
- dbg("uri: parsed uri found: $nuri");
- }
- }
}
-
return @{$self->{parsed_uri_list}};
}


Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Mon May 9 08:55:06 2005
@@ -196,7 +196,7 @@
# IMPORTANT: to get the html parsed into metadata, we need to call
# get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
# which does the metadata stuff ... DO THIS BEFORE SETTING $html !!!
- my @parsed = $scanner->get_parsed_uri_list();
+ my @parsed = $scanner->get_uri_list();

# Generate the full list of html-parsed domains.
my $html = $scanner->{msg}->{metadata}->{html}->{uri_detail} || { };

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Mon May 9 08:55:06 2005
@@ -921,7 +921,7 @@
}

sub uri_list_canonify {
- my (@uris) = @_;
+ my($redirector_patterns, @uris) = @_;

# make sure we catch bad encoding tricks
my @nuris = ();
@@ -993,6 +993,20 @@
# bug 3308: redirectors like yahoo only need one '/' ... <grrr>
if ($rest =~ m{(https?:/{0,2}.+)$}i) {
push(@uris, $1);
+ }
+
+ # resort to redirector pattern matching if the generic https? check
+ # doesn't result in a match -- bug 4176
+ else {
+ foreach (@{$redirector_patterns}) {
+ if ("$proto$host$rest" =~ $_) {
+ next unless defined $1;
+ dbg("uri: parsed uri pattern: $_");
+ dbg("uri: parsed uri found: $1 in redirector: $proto$host$rest");
+ push (@uris, $1);
+ last;
+ }
+ }
}

########################

Modified: spamassassin/trunk/rules/20_uri_tests.cf
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/rules/20_uri_tests.cf?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/rules/20_uri_tests.cf (original)
+++ spamassassin/trunk/rules/20_uri_tests.cf Mon May 9 08:55:06 2005
@@ -24,6 +24,10 @@

require_version @@VERSION@@

+# Redirector URI patterns
+redirector_pattern /^(?:http://)?chkpt\.zdnet\.com/chkpt/\w+/(.*)$/
+redirector_pattern /^(?:http://)?www\.nate\.com/r/\w+/(.*)$/
+
uri NUMERIC_HTTP_ADDR /^https?\:\/\/\d{7}/is
describe NUMERIC_HTTP_ADDR Uses a numeric IP address in URL


Modified: spamassassin/trunk/t/uri.t
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/t/uri.t?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/t/uri.t (original)
+++ spamassassin/trunk/t/uri.t Mon May 9 08:55:06 2005
@@ -22,7 +22,7 @@
use Mail::SpamAssassin::HTML;
use Mail::SpamAssassin::Util;

-plan tests => 77;
+plan tests => 78;

##############################################

@@ -106,7 +106,10 @@

sub try_canon {
my($input, $expect) = @_;
- my @input = sort { $a cmp $b } Mail::SpamAssassin::Util::uri_list_canonify(@{$input});
+ my $redirs = ['/^(?:http://)?chkpt\.zdnet\.com/chkpt/\w+/(.*)$/',
+ '/^(?:http://)?www\.nate\.com/r/\w+/(.*)$/',
+ ];
+ my @input = sort { $a cmp $b } Mail::SpamAssassin::Util::uri_list_canonify($redirs, @{$input});
my @expect = sort { $a cmp $b } @{$expect};

# output what we want/get for debugging
@@ -155,6 +158,13 @@
], [
'http%3A//ebg&vosxfov.com%2Emunged-%72xspecials%2Enet/b/Tr3f0amG',
'http://ebg&vosxfov.com.munged-rxspecials.net/b/Tr3f0amG'
+ ]));
+
+ok(try_canon([
+ 'http://www.nate.com/r/DM03/n%65verp4%79re%74%61%69%6c%2eco%6d/%62%61m/?m%61%6e=%6Di%634%39'
+ ], [
+ 'http://www.nate.com/r/DM03/n%65verp4%79re%74%61%69%6c%2eco%6d/%62%61m/?m%61%6e=%6Di%634%39',
+ 'http://www.nate.com/r/DM03/neverp4yretail.com/bam/?man=mic49',
]));

##############################################

Added: spamassassin/trunk/t/uri_html.t
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/t/uri_html.t?rev=169334&view=auto
==============================================================================
--- spamassassin/trunk/t/uri_html.t (added)
+++ spamassassin/trunk/t/uri_html.t Mon May 9 08:55:06 2005
@@ -0,0 +1,241 @@
+#!/usr/bin/perl -w
+
+# test URI redirecton patterns
+
+BEGIN {
+ if (-e 't/test_dir') { # if we are running "t/rule_names.t", kluge around ...
+ chdir 't';
+ }
+
+ if (-e 'test_dir') { # running from test directory, not ..
+ unshift(@INC, '../blib/lib');
+ }
+}
+
+my $prefix = '.';
+if (-e 'test_dir') { # running from test directory, not ..
+ $prefix = '..';
+}
+
+use strict;
+use SATest; sa_t_init("uri_html");
+use Test;
+use Mail::SpamAssassin;
+use IO::File;
+use vars qw(%patterns %anti_patterns);
+
+# settings
+plan tests => 2;
+
+# initialize SpamAssassin
+my $sa = create_saobj({'dont_copy_prefs' => 1});
+
+$sa->init(0); # parse rules
+
+# load tests and write mail
+my $mail = 'log/uri_html.eml';
+%patterns = ();
+%anti_patterns = ();
+write_mail();
+
+# test message
+my $fh = IO::File->new_tmpfile();
+open(STDERR, ">&=".fileno($fh)) || die "Cannot reopen STDERR";
+ok(sarun("-t --debug=uri < log/uri_html.eml"));
+seek($fh, 0, 0);
+my $error = do {
+ local $/;
+ <$fh>;
+};
+$error =~ s/^.*dbg: uri: parsed uri found: //mg;
+
+# run patterns and anti-patterns
+my $failures = 0;
+for my $pattern (keys %patterns) {
+ if ($error !~ /${pattern}/m) {
+ print "did not find $pattern\n";
+# print "found $error\n";
+ $failures++;
+ } else {
+# print "success $pattern in $error\n";
+ }
+}
+for my $anti_pattern (keys %anti_patterns) {
+ if ($error =~ /${anti_pattern}/m) {
+ print "did find $anti_pattern\n";
+ $failures++;
+ }
+}
+ok(!$failures);
+
+# function to write test email
+sub write_mail {
+ if (open(MAIL, ">$mail")) {
+ print MAIL <<'EOF';
+Message-ID: <clean.1010101@example.com>
+Date: Mon, 07 Oct 2002 09:00:00 +0000
+From: Sender <sender@example.com>
+MIME-Version: 1.0
+To: Recipient <recipient@example.com>
+Subject: this is a trivial message
+Content-Type: multipart/related;
+ boundary="--IDYGGVGT_LIYGR"
+
+----IDYGGVGT_LIYGR
+Content-Type: text/plain
+Content-Transfer-Encoding: 7bit
+
+This text part is ignored
+http://www.dontputthisinthetestdata.com
+
+----IDYGGVGT_LIYGR
+Content-Type: text/html; charset=us-ascii
+Content-Transfer-Encoding: 8bit
+
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text; charset=iso-8859-1">
+</head>
+<body>
+EOF
+ while (<DATA>) {
+ chomp;
+ next if /^#/;
+ if (/^(.*?)\t+(.*?)\s*$/) {
+ my $string = $1;
+ my @patterns = split(' ', $2);
+ if ($string && @patterns) {
+ print MAIL "<a href=$string>click here</a>\n";
+ for my $pattern (@patterns) {
+ if ($pattern =~ /^\!(.*)/) {
+ $anti_patterns{$1} = 1;
+ }
+ else {
+ $patterns{$pattern} = 1;
+ }
+ }
+ }
+ }
+ }
+ print MAIL "</body>\n</html>\n\n----IDYGGVGT_LIYGR--\n";
+ close(MAIL);
+ }
+ else {
+ die "can't open output file: $!";
+ }
+}
+
+# <line> : <string><tabs><matches>
+# <string> : string in the body
+# <tabs> : one or more tabs
+# <matches> : patterns expected to be found in URI output, if preceded by ! if
+# it is an antipattern, each pattern is separated by whitespace
+__DATA__
+www5.poh6feib.com poh6feib
+vau6yaer.com vau6yaer
+www5.poh6feib.info poh6feib
+Haegh3de.co.uk Haegh3de
+
+ftp.yeinaix3.co.uk ftp://ftp.yeinaix3.co.uk !http://ftp.yeinaix3.co.uk
+ftp5.riexai5r.co.uk http://ftp5.riexai5r.co.uk !ftp://ftp5.riexai5r.co.uk
+
+http://10.1.3.1/ 10.1.3.1
+
+=www.deiJ1pha.com www.deiJ1pha.com
+@www.Te0xohxu.com www.Te0xohxu.com
+.www.kuiH5sai.com www.kuiH5sai.com
+
+a=www.zaiNgoo7.com www.zaiNgoo7.com
+c.www.moSaoga8.com www.moSaoga8.com
+
+http://www.example.com/about/wahfah7d.html wahfah7d
+http://www.example.com?xa1kaLuo \?xa1kaLuo
+http://www.lap7thob.com/ ^http://www.lap7thob.com/$
+
+www.phoh1Koh.com/ ^www.phoh1Koh.com/$
+www.Tar4caeg.com:80 http://www.Tar4caeg.com:80
+www.Coo4mowe.com:80/foo/foo.html ^www.Coo4mowe.com:80/foo/foo.html
+www.Nee2quae.com:80/ ^www.Nee2quae.com:80/$
+
+HAETEI3D.com HAETEI3D
+CUK3VEIZ.us CUK3VEIZ
+CHAI7SAI.biz CHAI7SAI
+VU4YAPHU.info VU4YAPHU
+NAUVE1PH.net NAUVE1PH
+LEIX6QUU.org LEIX6QUU
+LOT1GOHV.ws LOT1GOHV
+LI4JAIZI.name LI4JAIZI
+BA1LOOXU.tv BA1LOOXU
+yiez7too.CC yiez7too
+huwaroo1.DE huwaroo1
+chohza7t.JP chohza7t
+the7zuum.BE the7zuum
+sai6bahg.AT sai6bahg
+leow3del.UK leow3del
+ba5keinu.NZ ba5keinu
+chae2shi.CN chae2shi
+roo7kiey.TW roo7kiey
+
+www.Chiew0ch.COM www.Chiew0ch.COM
+www.thohY2qu.US www.thohY2qu.US
+www.teiP7gei.BIZ www.teiP7gei.BIZ
+www.xohThai8.INFO www.xohThai8.INFO
+www.haik7Ram.NET www.haik7Ram.NET
+www.Quaes3se.ORG www.Quaes3se.ORG
+www.Chai6tah.WS www.Chai6tah.WS
+www.Thuoth1y.NAME www.Thuoth1y.NAME
+www.Chieb8ge.TV www.Chieb8ge.TV
+WWW.quus4Rok.cc WWW.quus4Rok.cc
+WWW.maic6Hei.de WWW.maic6Hei.de
+WWW.he4Hiize.jp WWW.he4Hiize.jp
+WWW.Soh1toob.be WWW.Soh1toob.be
+WWW.chahMee5.at WWW.chahMee5.at
+WWW.peepooN0.uk WWW.peepooN0.uk
+WWW.Kiox3phi.nz WWW.Kiox3phi.nz
+WWW.jong3Xou.cn WWW.jong3Xou.cn
+WWW.waeShoe0.tw WWW.waeShoe0.tw
+
+invalid_ltd.foo !invalid_tld
+invalid_ltd.bar !invalid_tld
+invalid_ltd.xyzzy !invalid_tld
+invalid_ltd.co.zz !invalid_tld
+
+www.invalid_ltd.foo !invalid_tld
+www.invalid_ltd.bar !invalid_tld
+www.invalid_ltd.xyzzy !invalid_tld
+www.invalid_ltd.co.zz !invalid_tld
+
+command.com command.com
+
+# IPs for www.yahoo.com
+http://66.94.230.33 http://66.94.230.33
+http://1113515555 http://66.94.230.35
+
+http://www.luzoop5k.com http://www.luzoop5k.com
+https://www.luzoop5k.com https://www.luzoop5k.com
+ftp://www.luzoop5k.com ftp://www.luzoop5k.com
+mailto:www.luzoop5k.com mailto:www.luzoop5k.com
+file://www.luzoop5k.com file://www.luzoop5k.com
+
+# //<user>:<password>@<host>:<port>/<url-path>
+http://user:pass@jiefeet4.com:80/x/y http://user:pass@jiefeet4.com:80/x/y
+
+puahi8si.com:80 puahi8si.com:80
+chop8tan.com:443 chop8tan.com:443
+
+ftp://name@su5queib.ca//etc/motd ftp://name@su5queib.ca//etc/motd
+ftp://name@faikaj4t.dom/%2Fetc/motd ftp://name@faikaj4t.dom//etc/motd
+
+#keyword:sportscar !sportscar
+
+# test redirector pattern
+http://www.nate.com/r/DM03/n%65verp4%79re%74%61%69%6c%2eco%6d/%62%61m/?m%61%6e=%6Di%634%39 http://neverp4yretail.com/bam/[?]man=mic49
+
+# test ignoring text portion of multipart with an html part
+http://www.nowhereinthetestdata.com !http://www.dontputhisinthetestdata.com
+
+# questionable tests
+
+mailto://cah3neun@thaihe4d.com mailto://cah3neun@thaihe4d.com
+mailto://jicu8vah@another@jicu8vah jicu8vah@another@jicu8vah
+

Propchange: spamassassin/trunk/t/uri_html.t
------------------------------------------------------------------------------
svn:executable = *