Mailing List Archive

svn commit: r169506 - in /spamassassin/trunk: lib/Mail/SpamAssassin/PerMsgStatus.pm lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm t/uri_html.t
Author: felicity
Date: Tue May 10 11:36:02 2005
New Revision: 169506

URL: http://svn.apache.org/viewcvs?rev=169506&view=rev
Log:
Move the URI canonification around some more. get_uri_list() now just puts together the canonified parsed and HTML URI lists; HTML canonification happens in extract_message_metadata(), which now does its work only once even though it is called multiple times, wherever canonified HTML URIs are needed. Modified t/uri_html.t to not need a temp file: it just runs through get_uri_list() internally instead of calling a full SA, which avoids the whole debug-output-reading bit.

Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
spamassassin/trunk/t/uri_html.t

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=169506&r1=169505&r2=169506&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Tue May 10 11:36:02 2005
@@ -145,6 +145,10 @@
$self->{conf}->set_score_set ($set|2);
}

+ # Do this here so that {metadata}->{html} gets set, which we then reference
+ # in extract_message_metadata()
+ my $decoded = $self->get_decoded_stripped_body_text_array();
+
$self->extract_message_metadata();

{
@@ -153,15 +157,6 @@
$self->run_rbl_eval_tests ($self->{conf}->{rbl_evals});
my $needs_dnsbl_harvest_p = 1; # harvest needs to be run

- my $decoded = $self->get_decoded_stripped_body_text_array();
-
- # this has been put on the metadata object. we could use it
- # directly, but $self->{msg}->{metadata}->{html} goes through a lot
- # of referencing ...
- # NOTE: this has to come after get_decoded_stripped_body_text_array() as it's
- # the one that sets {metadata}->{html} ...
- $self->{html} = $self->{msg}->{metadata}->{html};
-
my $bodytext = $self->get_decoded_body_text_array();
my $fulltext = $self->{msg}->get_pristine();
my @uris = $self->get_uri_list();
@@ -1297,6 +1292,10 @@

sub extract_message_metadata {
my ($self) = @_;
+
+ # Use $self->{html} as a flag indicating whether or not we've already
+ # extracted the metadata.
+ return if ($self->{html});

$self->{msg}->extract_message_metadata($self->{main});

@@ -1313,6 +1312,26 @@
$self->{tag_data}->{RELAYSUNTRUSTED} = $self->{relays_untrusted_str};
$self->{tag_data}->{LANGUAGES} = $self->{msg}->get_metadata("X-Languages");

+ # NOTE: this has to come after get_decoded_stripped_body_text_array() as it's
+ # the one that sets {metadata}->{html} ... it should be called before
+ # extract_message_metadata() ...
+ $self->{html} = $self->{msg}->{metadata}->{html};
+
+ # canonify the HTML parsed URIs
+ my $redirector_patterns = $self->{conf}->{redirector_patterns};
+ if (defined $self->{html}->{uri_detail}) {
+ while(my($uri, $info) = each %{ $self->{html}->{uri_detail} }) {
+ my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
+ $info->{cleaned} = \@tmp;
+ if (would_log('dbg', 'uri')) {
+ dbg("uri: html uri found, $uri");
+ foreach my $nuri (@tmp) {
+ dbg("uri: cleaned html uri, $nuri");
+ }
+ }
+ }
+ }
+
# allow plugins to add more metadata, read the stuff that's there, etc.
$self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
}
@@ -1864,32 +1883,19 @@
# get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
# which does the metadata stuff ... DO THIS BEFORE LOOKING FOR METADATA!!!
my @uris = $self->get_parsed_uri_list();
- my $redirector_patterns = $self->{conf}->{redirector_patterns};
- @uris = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, @uris);
+
+ # We need the Metadata extracted to get the canonified HTML parsed URIs
+ $self->extract_message_metadata();

# get URIs from HTML parsing
- # use the metadata version as $self->{html} is probably not set yet
- if (defined $self->{msg}->{metadata}->{html}->{uri_detail}) {
- while(my($uri, $info) = each %{ $self->{msg}->{metadata}->{html}->{uri_detail} }) {
- my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
- $info->{cleaned} = \@tmp;
- push(@uris, @tmp);
- if (would_log('dbg', 'uri')) {
- dbg("uri: html uri found, $uri");
- foreach my $nuri (@tmp) {
- dbg("uri: cleaned html uri, $nuri");
- }
+ if (defined $self->{html}->{uri_detail}) {
+ while(my($uri, $info) = each %{ $self->{html}->{uri_detail} }) {
+ if ($info->{cleaned}) {
+ push(@uris, @{$info->{cleaned}});
}
}
}

- # list out the URLs for debugging ...
- if (would_log('dbg', 'uri')) {
- foreach my $nuri (@uris) {
- dbg("uri: parsed uri found: $nuri");
- }
- }
-
# get domain list
$self->{redirect_num} = 0;
my %domains;
@@ -1970,6 +1976,16 @@
push @uris, $uri;
}
}
+
+ @uris = Mail::SpamAssassin::Util::uri_list_canonify($self->{conf}->{redirector_patterns}, @uris);
+
+ # list out the URLs for debugging ...
+ if (would_log('dbg', 'uri')) {
+ foreach my $nuri (@uris) {
+ dbg("uri: parsed uri found: $nuri");
+ }
+ }
+
# setup the cache and return
$self->{parsed_uri_list} = \@uris;


Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?rev=169506&r1=169505&r2=169506&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Tue May 10 11:36:02 2005
@@ -186,20 +186,18 @@
$self->setup ($scanstate);

# get all domains in message
- # TODO! we need a method that provides more metadata about where
- # the URI was found so we can ignore hammy decoys.

# list of arrays to use in order
my @uri_ordered = ();

# use the parsed uris from the rendered message text
- # IMPORTANT: to get the html parsed into metadata, we need to call
- # get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
- # which does the metadata stuff ... DO THIS BEFORE SETTING $html !!!
- my @parsed = $scanner->get_uri_list();
+ my @parsed = $scanner->get_parsed_uri_list();
+
+ # We need the Metadata extracted to get the canonified HTML parsed URIs
+ $scanner->extract_message_metadata();

# Generate the full list of html-parsed domains.
- my $html = $scanner->{msg}->{metadata}->{html}->{uri_detail} || { };
+ my $html = $scanner->{html}->{uri_detail} || { };

# go from uri => info to uri_ordered
# 0: a

Modified: spamassassin/trunk/t/uri_html.t
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/t/uri_html.t?rev=169506&r1=169505&r2=169506&view=diff
==============================================================================
--- spamassassin/trunk/t/uri_html.t (original)
+++ spamassassin/trunk/t/uri_html.t Tue May 10 11:36:02 2005
@@ -21,7 +21,6 @@
use SATest; sa_t_init("uri_html");
use Test;
use Mail::SpamAssassin;
-use IO::File;
use vars qw(%patterns %anti_patterns);

# settings
@@ -33,35 +32,28 @@
$sa->init(0); # parse rules

# load tests and write mail
-my $mail = 'log/uri_html.eml';
%patterns = ();
%anti_patterns = ();
-write_mail();
+my $message = write_mail();

-# test message
-my $fh = IO::File->new_tmpfile();
-open(STDERR, ">&=".fileno($fh)) || die "Cannot reopen STDERR";
-ok(sarun("-t --debug=uri < log/uri_html.eml"));
-seek($fh, 0, 0);
-my $error = do {
- local $/;
- <$fh>;
-};
-$error =~ s/^.*dbg: uri: parsed uri found: //mg;
+my $mail = $sa->parse($message);
+my $msg = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+my $uris = join("\n", $msg->get_uri_list(), "");

# run patterns and anti-patterns
my $failures = 0;
for my $pattern (keys %patterns) {
- if ($error !~ /${pattern}/m) {
+ if ($uris !~ /${pattern}/m) {
print "did not find $pattern\n";
-# print "found $error\n";
$failures++;
- } else {
-# print "success $pattern in $error\n";
}
}
+ok(!$failures);
+$failures = 0;
+
for my $anti_pattern (keys %anti_patterns) {
- if ($error =~ /${anti_pattern}/m) {
+ if ($uris =~ /${anti_pattern}/m) {
print "did find $anti_pattern\n";
$failures++;
}
@@ -70,8 +62,7 @@

# function to write test email
sub write_mail {
- if (open(MAIL, ">$mail")) {
- print MAIL <<'EOF';
+ my $msg = <<'EOF';
Message-ID: <clean.1010101@example.com>
Date: Mon, 07 Oct 2002 09:00:00 +0000
From: Sender <sender@example.com>
@@ -98,31 +89,29 @@
</head>
<body>
EOF
- while (<DATA>) {
- chomp;
- next if /^#/;
- if (/^(.*?)\t+(.*?)\s*$/) {
- my $string = $1;
- my @patterns = split(' ', $2);
- if ($string && @patterns) {
- print MAIL "<a href=$string>click here</a>\n";
- for my $pattern (@patterns) {
- if ($pattern =~ /^\!(.*)/) {
- $anti_patterns{$1} = 1;
- }
- else {
- $patterns{$pattern} = 1;
- }
+
+ while (<DATA>) {
+ chomp;
+ next if /^#/;
+ if (/^(.*?)\t+(.*?)\s*$/) {
+ my $string = $1;
+ my @patterns = split(' ', $2);
+ if ($string && @patterns) {
+ $msg .= qq@<a href="$string">click here</a>\n@;
+ for my $pattern (@patterns) {
+ if ($pattern =~ /^\!(.*)/) {
+ $anti_patterns{$1} = 1;
+ }
+ else {
+ $patterns{$pattern} = 1;
}
}
}
}
- print MAIL "</body>\n</html>\n\n----IDYGGVGT_LIYGR--\n";
- close(MAIL);
- }
- else {
- die "can't open output file: $!";
}
+ $msg .= "</body>\n</html>\n\n----IDYGGVGT_LIYGR--\n";
+
+ return $msg;
}

# <line> : <string><tabs><matches>
@@ -238,4 +227,3 @@

mailto://cah3neun@thaihe4d.com mailto://cah3neun@thaihe4d.com
mailto://jicu8vah@another@jicu8vah jicu8vah@another@jicu8vah
-