Mailing List Archive

svn commit: r178801 - in /spamassassin/branches/3.0: lib/Mail/SpamAssassin/Util.pm t/uri.t
Author: felicity
Date: Fri May 27 10:25:01 2005
New Revision: 178801

URL: http://svn.apache.org/viewcvs?rev=178801&view=rev
Log:
bug 4213: backport uri canonification/anti-obfu code from 3.1

Modified:
spamassassin/branches/3.0/lib/Mail/SpamAssassin/Util.pm
spamassassin/branches/3.0/t/uri.t

Modified: spamassassin/branches/3.0/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/3.0/lib/Mail/SpamAssassin/Util.pm?rev=178801&r1=178800&r2=178801&view=diff
==============================================================================
--- spamassassin/branches/3.0/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/branches/3.0/lib/Mail/SpamAssassin/Util.pm Fri May 27 10:25:01 2005
@@ -801,7 +801,7 @@
$uri =~ s,#.*$,,gs; # drop fragment
$uri =~ s#^[a-z]+:/{0,2}##gsi; # drop the protocol
$uri =~ s,^[^/]*\@,,gs; # username/passwd
- $uri =~ s,[/\?\&].*$,,gs; # path/cgi params
+ $uri =~ s,[/\?].*$,,gs; # path/cgi params
$uri =~ s,:\d*$,,gs; # port, bug 4191: sometimes the # is missing

return if $uri =~ /\%/; # skip undecoded URIs.
@@ -827,6 +827,7 @@
# make sure we catch bad encoding tricks
my @nuris = ();
for my $uri (@uris) {
+ # we're interested in http:// and so on, skip mailto:
next if $uri =~ /^mailto:/i;

# sometimes we catch URLs on multiple lines
@@ -836,65 +837,110 @@
$uri =~ s/^\s+//;
$uri =~ s/\s+$//;

- # Make a copy so we don't trash the original
+ # CRs just confuse things down below, so trash them now
+ $uri =~ s/\r//g;
+
+ # Make a copy so we don't trash the original in the array
my $nuri = $uri;

# http:www.foo.biz -> http://www.foo.biz
$nuri =~ s#^(https?:)/{0,2}#$1//#i;

+ # *always* make a dup with all %-encoding decoded, since
+ # important parts of the URL may be encoded (such as the
+ # scheme). (bug 4213)
+ if ($nuri =~ /\%[0-9a-fA-F]{2}/) {
+ $nuri = Mail::SpamAssassin::Util::url_encode($nuri);
+ }
+
+ # www.foo.biz -> http://www.foo.biz
+ # unschemed URIs: assume default of "http://" as most MUAs do
+ if ($nuri !~ /^[-_a-z0-9]+:/i) {
+ if ($nuri =~ /^ftp\./) {
+ $nuri =~ s@^@ftp://@g;
+ }
+ else {
+ $nuri =~ s@^@http://@g;
+ }
+ }
+
# http://www.foo.biz?id=3 -> http://www.foo.biz/?id=3
- $nuri =~ s/^(https?:\/\/[^\/\?]+)\?/$1\/?/;
+ $nuri =~ s@^(https?://[^/?]+)\?@$1/?@i;

# deal with encoding of chars, this is just the set of printable
# chars minus ' ' (that is, dec 33-126, hex 21-7e)
$nuri =~ s/\&\#0*(3[3-9]|[4-9]\d|1[01]\d|12[0-6]);/sprintf "%c",$1/ge;
- $nuri =~ s/\&\#x0*(2[1-9]|[3-6][a-f0-9]|7[0-9a-e]);/sprintf "%c",hex($1)/gei;
+ $nuri =~ s/\&\#x0*(2[1-9]|[3-6][a-fA-F0-9]|7[0-9a-eA-E]);/sprintf "%c",hex($1)/ge;

- # deal with wierd hostname parts
- if ($nuri =~ /^(https?:\/\/)([^\/]+)(\.?\/.*)$/i) {
- my ($proto, $host, $rest) = ($1,$2,$3);
+ # put the new URI on the new list if it's different
+ if ($nuri ne $uri) {
+ push(@nuris, $nuri);
+ }
+
+ # deal with wierd hostname parts, remove user/pass, etc.
+ if ($nuri =~ m{^(https?://)([^/]+)(\/.*)?$}i) {
+ my($proto, $host, $rest) = ($1,$2,$3);
+
+ # not required
+ $rest ||= '';
+
+ # bug 4146: deal with non-US ASCII 7-bit chars in the host portion
+ # of the URI according to RFC 1738 that's invalid, and the tested
+ # browsers (Firefox, IE) remove them before usage...
+ if ($host =~ tr/\000-\040\200-\377//d) {
+ push(@nuris, join ('', $proto, $host, $rest));
+ }
+
+ # deal with http redirectors. strip off one level of redirector
+ # and add back to the array. the foreach loop will go over those
+ # and deal appropriately.
+ # bug 3308: redirectors like yahoo only need one '/' ... <grrr>
+ if ($rest =~ m{(https?:/{0,2}.+)$}i) {
+ push(@uris, $1);
+ }
+
+ ########################
+ ## TVD: known issue, if host has multiple combinations of the following,
+ ## all permutations will be put onto @nuris. shouldn't be an issue.
+
+ # Get rid of cruft that could cause confusion for rules...

# remove "www.fakehostname.com@" username part
- $host =~ s/^[^\@]+\@//gs;
+ if ($host =~ s/^[^\@]+\@//gs) {
+ push(@nuris, join ('', $proto, $host, $rest));
+ }
+
+ # bug 3186: If in a sentence, we might pick up odd characters ...
+ # ie: "visit http://example.biz." or "visit http://example.biz!!!"
+ # the host portion should end in some form of alpha-numeric, strip off
+ # the rest.
+ if ($host =~ s/[^0-9A-Za-z]+$//) {
+ push(@nuris, join ('', $proto, $host, $rest));
+ }

- # deal with 'http://213.172.0x1f.13/'; decode encoded octets
- if ($host =~ /^([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*)$/ix)
- {
+ ########################
+
+ # deal with 'http://213.172.0x1f.13/', decode encoded octets
+ if ($host =~ /^([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*)$/ix) {
my (@chunk) = ($1,$2,$3,$4);
for my $octet (0 .. 3) {
$chunk[$octet] =~ s/^0x([0-9a-f][0-9a-f])/sprintf "%d",hex($1)/gei;
}
- my $parsed = join ('', $proto, @chunk, $rest);
- if ($parsed ne $nuri) { push(@nuris, $parsed); }
+ push(@nuris, join ('', $proto, @chunk, $rest));
}

# "http://0x7f000001/"
- if ($host =~ /^0x[0-9a-f]+$/i) {
- $host =~ s/^0x([0-9a-f]+)/sprintf "%d",hex($1)/gei;
- $host = decode_ulong_to_ip ($host);
- my $parsed = join ('', $proto, $host, $rest);
- push(@nuris, $parsed);
+ elsif ($host =~ /^0x[0-9a-f]+$/i) {
+ # only take last 4 octets
+ $host =~ s/^0x[0-9a-f]*?([0-9a-f]{1,8})$/sprintf "%d",hex($1)/gei;
+ push(@nuris, join ('', $proto, decode_ulong_to_ip($host), $rest));
}

# "http://1113343453/"
- if ($host =~ /^[0-9]+$/) {
- $host = decode_ulong_to_ip ($host);
- my $parsed = join ('', $proto, $host, $rest);
- push(@nuris, $parsed);
+ elsif ($host =~ /^[0-9]+$/) {
+ push(@nuris, join ('', $proto, decode_ulong_to_ip($host), $rest));
}
- }
-
- ($nuri) = Mail::SpamAssassin::Util::url_encode($nuri);
- if ($nuri ne $uri) {
- push(@nuris, $nuri);
- }

- # deal with http redirectors. strip off one level of redirector
- # and add back to the array. the foreach loop will go over those
- # and deal appropriately.
- # bug 3308: redirectors like yahoo only need one '/' ... <grrr>
- if ($nuri =~ m{^https?://.+?(https?:/{0,2}.+)$}i) {
- push(@uris, $1);
}
}


Modified: spamassassin/branches/3.0/t/uri.t
URL: http://svn.apache.org/viewcvs/spamassassin/branches/3.0/t/uri.t?rev=178801&r1=178800&r2=178801&view=diff
==============================================================================
--- spamassassin/branches/3.0/t/uri.t (original)
+++ spamassassin/branches/3.0/t/uri.t Fri May 27 10:25:01 2005
@@ -21,7 +21,7 @@
use Mail::SpamAssassin::HTML;
use Mail::SpamAssassin::Util;

-plan tests => 63;
+plan tests => 64;

##############################################

@@ -81,6 +81,7 @@
ok(try_domains('http:www.spamassassin.org/lists.html', 'spamassassin.org'));
ok(try_domains('http://kung.pao.com.cn', 'pao.com.cn'));
ok(try_domains('http://blah.blah.com:/', 'blah.com'));
+ok(try_domains('http://ebg&vosxfov.com.munged-rxspecials.net/b/Tr3f0amG','munged-rxspecials.net'));

##############################################