Mailing List Archive

svn commit: r169564 - /spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
Author: felicity
Date: Tue May 10 19:42:49 2005
New Revision: 169564

URL: http://svn.apache.org/viewcvs?rev=169564&view=rev
Log:
Even more URI parsing work. Add a new get_uri_detail_list() which includes the parsed URIs in the uri_detail hash. Base get_uri_list() off of that, etc.

Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=169564&r1=169563&r2=169564&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Tue May 10 19:42:49 2005
@@ -1881,20 +1881,12 @@
return @{$self->{uri_list}};
}

- # IMPORTANT: to get the html parsed into metadata, we need to call
- # get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
- # which does the metadata stuff ... DO THIS BEFORE LOOKING FOR METADATA!!!
- my @uris = $self->get_parsed_uri_list();
-
- # We need the Metadata extracted to get the canonified HTML parsed URIs
- $self->extract_message_metadata();
+ my @uris = ();

# get URIs from HTML parsing
- if (defined $self->{html}->{uri_detail}) {
- while(my($uri, $info) = each %{ $self->{html}->{uri_detail} }) {
- if ($info->{cleaned}) {
- push(@uris, @{$info->{cleaned}});
- }
+ while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
+ if ($info->{cleaned}) {
+ push(@uris, @{$info->{cleaned}});
}
}

@@ -1916,7 +1908,46 @@
return @uris;
}

-sub get_parsed_uri_list {
+sub get_uri_detail_list {
+ my ($self) = @_;
+
+ # use cached answer if available
+ if (defined $self->{uri_detail_list}) {
+ return $self->{uri_detail_list};
+ }
+
+ # IMPORTANT: to get the html parsed into metadata, we need to
+ # call extract_message_metadata(). It will call
+ # get_decoded_stripped_body_text_array() if necessary, which does
+ # the metadata->html stuff ...
+
+ # We need the Metadata extracted to get the canonified HTML parsed URIs
+ $self->extract_message_metadata();
+
+ # get URIs from HTML parsing
+ my $detail = $self->{html}->{uri_detail} || { };
+
+ foreach my $uri ( $self->_get_parsed_uri_list() ) {
+ my @uris = Mail::SpamAssassin::Util::uri_list_canonify($self->{conf}->{redirector_patterns}, $uri);
+
+ # list out the URLs for debugging ...
+ if (would_log('dbg', 'uri')) {
+ foreach my $nuri (@uris) {
+ dbg("uri: parsed uri found: $nuri");
+ }
+ }
+
+ $detail->{$uri}->{types}->{parsed} = 1;
+ push(@{$detail->{$uri}->{cleaned}}, @uris);
+ }
+
+ # setup the cache
+ $self->{uri_detail_list} = $detail;
+
+ return $detail;
+}
+
+sub _get_parsed_uri_list {
my ($self) = @_;

# use cached answer if available
@@ -1979,19 +2010,10 @@
}
}

- @uris = Mail::SpamAssassin::Util::uri_list_canonify($self->{conf}->{redirector_patterns}, @uris);
-
- # list out the URLs for debugging ...
- if (would_log('dbg', 'uri')) {
- foreach my $nuri (@uris) {
- dbg("uri: parsed uri found: $nuri");
- }
- }
-
# setup the cache and return
$self->{parsed_uri_list} = \@uris;
-
}
+
return @{$self->{parsed_uri_list}};
}


Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?rev=169564&r1=169563&r2=169564&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Tue May 10 19:42:49 2005
@@ -185,16 +185,14 @@

$self->setup ($scanstate);

+
# get all domains in message

# list of arrays to use in order
my @uri_ordered = ();

- # use the parsed uris from the rendered message text
- my @parsed = $scanner->get_parsed_uri_list();
-
# Generate the full list of html-parsed domains.
- my $html = $scanner->{html}->{uri_detail} || { };
+ my $uris = $scanner->get_uri_detail_list();

# go from uri => info to uri_ordered
# 0: a
@@ -203,11 +201,7 @@
# 3: !a_empty
# 4: parsed
# 5: a_empty
- if (@parsed) {
- $uri_ordered[4] = \@parsed;
- }
-
- while (my($uri, $info) = each %{$html}) {
+ while (my($uri, $info) = each %{$uris}) {
my $entry = 3;

if ($info->{types}->{a}) {
@@ -226,6 +220,9 @@
}
elsif ($info->{types}->{img}) {
$entry = 2;
+ }
+ elsif (@{$info->{types}} == 1 && $info->{types}->{parsed}) {
+ $entry = 4;
}

push(@{$uri_ordered[$entry]}, @{$info->{cleaned}});