Mailing List Archive

svn commit: rev 6502 - incubator/spamassassin/trunk/lib/Mail/SpamAssassin
Author: felicity
Date: Wed Feb 4 14:28:34 2004
New Revision: 6502

Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
Log:
bug 3000: make check_attachments() use the mime tree instead of trying
to figure out what is going on by itself.


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Wed Feb 4 14:28:34 2004
@@ -2356,7 +2356,9 @@
sub _check_mime_header {
my ($self, $ctype, $cte, $cd, $charset, $name) = @_;

- if ($ctype =~ m@^text/html@i) {
+ $charset ||= '';
+
+ if ($ctype eq 'text/html') {
$self->{mime_body_html_count}++;
}
elsif ($ctype =~ m@^(?:text|message)@i) {
@@ -2393,7 +2395,7 @@
$self->{mime_qp_inline_no_charset} = 1;
}

- if ($ctype =~ /^text\/html/ &&
+ if ($ctype eq 'text/html' &&
!(defined($charset) && $charset) &&
!($cd && $cd =~ /^(?:attachment|inline)/))
{
@@ -2448,11 +2450,8 @@
sub _check_attachments {
my ($self) = @_;

- my $previous = 'undef'; # the previous line
-
# MIME status
my $where = -1; # -1 = start, 0 = nowhere, 1 = header, 2 = body
- my @boundary; # list of MIME boundaries
my %state; # state of each MIME part
my $qp_bytes = 0; # total bytes in QP regions
my $qp_count = 0; # QP-encoded bytes in QP regions
@@ -2460,18 +2459,9 @@
my @part_type; # MIME part types

# MIME header information
- my $ctype = 0; # Content-Type
- my $cte = 0; # Content-Transfer-Encoding
- my $cd = 0; # Content-Disposition
- my $charset = 0; # charset
- my $name = 0; # name or filename
my $part = -1; # MIME part index

# regular expressions
- my $re_boundary = qr/\bboundary\s*=\s*["']?(.*?)["']?(?:;|$)/i;
- my $re_charset = qr/\bcharset\s*=\s*["']?(.*?)["']?(?:;|$)/i;
- my $re_name = qr/name\s*=\s*["']?(.*?)["']?(?:;|$)/i;
- my $re_ctype = qr/^Content-Type:\s*(.+?)(?:;|\s|$)/i;
my $re_cte = qr/^Content-Transfer-Encoding:\s*(.+)/i;
my $re_cd = qr/^Content-Disposition:\s*(.+)/i;

@@ -2500,115 +2490,89 @@
$self->{mime_qp_ratio} = 0;
$self->{mime_suspect_name} = 0;

- # message headers
- $ctype = $self->get('Content-Type');
- $cte = $self->get('Content-Transfer-Encoding');
- $cd = $self->get('Content-Disposition');
- chomp($cte = defined($cte) ? lc($cte) : "");
- if ($ctype =~ /$re_boundary/m && $1 ne '') {
- push (@boundary, "\Q$1\E");
- }
- if ($ctype =~ /^multipart\/alternative/i) {
- $self->{mime_multipart_alternative} = 1;
- }
-
- # check MIME headers in message header
- if ($ctype =~ /$re_charset/) { $charset = lc($1); }
- if ($ctype =~ /$re_name/) { $name = lc($1); }
- if ($ctype =~ /$re_ctype/) { $ctype = lc($1); }
- if ($cte =~ /$re_cte/) { $cte = lc($1); }
- if ($cd =~ /$re_cd/) { $cd = lc($1); }
- $self->_check_mime_header($ctype, $cte, $cd, $charset, $name);
-
- # Note: We don't use rawbody because it removes MIME parts. Instead,
- # we get the raw unfiltered body. We must not change any lines and
- # we might see some SpamAssassin mark-up.
- foreach my $line (@{$self->{msg}->get_body()}) {
- $_ = $line; # copy to preserve originals
- s/\r$//; # trim CRs, we don't want them
-
- if (/^--/) {
- foreach my $boundary (@boundary) {
- if (/^--$boundary$/) {
- $state{$boundary} = 1;
- $ctype = $cte = $cd = $charset = $name = 0;
- $where = 1;
- }
- if (/^--$boundary--$/) {
- $state{$boundary}--;
- $where = 0;
- }
- }
+ # Get all parts ...
+ foreach my $p ( $self->{msg}->find_parts(qr/./) ) {
+ # message headers
+ my($ctype, $boundary, $charset, $name) = Mail::SpamAssassin::Util::parse_content_type($p->get_header("content-type"));
+
+ if ($ctype eq 'multipart/alternative') {
+ $self->{mime_multipart_alternative} = 1;
}
- if ($where == 2) {
- if ($previous =~ /^$/ && /^TV[pq]QAA[MI]AAAAEAA[8A]A/) {
- $self->{microsoft_executable} = 1;
- }
- if ($cte =~ /base64/ && $previous =~ /^\s*$/ && /^\s*$/) {
- $self->{mime_base64_blanks} = 1;
- }
- if ($cte =~ /base64/ && (m@[^A-Za-z0-9+/=\n]@ || m/=[^=\s]/)) {
- $self->{mime_base64_illegal} = 1;
+
+ my $cte = $self->get('Content-Transfer-Encoding');
+ if ($cte =~ /$re_cte/) { $cte = lc($1); }
+ chomp($cte = defined($cte) ? $cte : "");
+
+ my $cd = $self->get('Content-Disposition');
+ if ($cd =~ /$re_cd/) { $cd = lc($1); }
+ chomp($cd = defined($cd) ? $cd : "");
+
+ $self->_check_mime_header($ctype, $cte, $cd, $charset, $name);
+
+ # If we're in the root node of the MIME tree, let's skip the rest of the tests ...
+ if ( $p->is_root() ) {
+ next;
+ }
+
+ $part++;
+ $part_type[$part] = $ctype;
+ $part_bytes[$part] = 0 if $cd !~ /attachment/;
+
+ my $previous = '';
+ foreach ( @{$p->raw()} ) {
+ if ( $cte =~ /base64/i ) {
+ if ($previous =~ /^$/ && /^TV[pq]QAA[MI]AAAAEAA[8A]A/) {
+ $self->{microsoft_executable} = 1;
+ }
+ if ($previous =~ /^\s*$/ && /^\s*$/) {
+ $self->{mime_base64_blanks} = 1;
+ }
+ if (m@[^A-Za-z0-9+/=\n]@ || /=[^=\s]/) {
+ $self->{mime_base64_illegal} = 1;
+ }
}
- if ($self->{mime_html_no_charset} &&
- $ctype =~ /^text\/html/ &&
- /charset=/i)
- {
+
+ if ($self->{mime_html_no_charset} && $ctype eq 'text/html' && defined $charset) {
$self->{mime_html_no_charset} = 0;
}
- if ($self->{mime_multipart_alternative} &&
- $ctype =~ /^text\/(?:plain|html)/i &&
- $cd !~ /attachment/)
- {
+ if ($self->{mime_multipart_alternative} && $cd !~ /attachment/ &&
+ ( $ctype eq 'text/plain' || $ctype eq 'text/html' ) ) {
$part_bytes[$part] += length;
}
- }
- if ($where == 1) {
- if (/^$/) {
- $where = 2;
- $part++;
- $part_bytes[$part] = 0 if $cd !~ /attachment/;
- $part_type[$part] = $ctype;
- $self->_check_mime_header($ctype, $cte, $cd, $charset, $name);
- }
- if (/$re_boundary/) { push(@boundary, "\Q$1\E"); }
- if (/$re_charset/) { $charset = lc($1); }
- if (/$re_name/) { $name = lc($1); }
- if (/$re_ctype/) { $ctype = lc($1); }
- elsif (/$re_cte/) { $cte = lc($1); }
- elsif (/$re_cd/) { $cd = lc($1); }
- }
- if ($previous =~ /^begin [0-7]{3} ./ && /^M35J0``,````\$````/) {
- $self->{microsoft_executable} = 1;
- }
- if ($where != 1 && $cte eq "quoted-printable" && ! /^SPAM: /) {
- if (length > 77) {
- $self->{mime_qp_long_line} = 1;
- }
- $qp_bytes += length;
- # check for illegal substrings (RFC 2045), hexadecimal values 7F-FF and
- # control characters other than TAB, or CR and LF as parts of CRLF pairs
- if (!$self->{mime_qp_illegal} && /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]/)
- {
- $self->{mime_qp_illegal} = 1;
+
+ if ($previous =~ /^begin [0-7]{3} ./ && /^M35J0``,````\$````/) {
+ $self->{microsoft_executable} = 1;
}
- # count excessive QP bytes
- if (index($_, '=') != -1) {
- # whoever wrote this next line is an evil hacker -- jm
- my $qp = () = m/=(?:09|3[0-9ABCEF]|[2456][0-9A-F]|7[0-9A-E])/g;
- if ($qp) {
- $qp_count += $qp;
- # tabs and spaces at end of encoded line are okay. Also, multiple
- # whitespace at the end of a line are OK, like ">=20=20=20=20=20=20".
- my ($trailing) = m/((?:=09|=20)+)\s*$/g;
- if ($trailing) {
- $qp_count -= (length($trailing) / 3);
+ if ($where != 1 && $cte eq "quoted-printable" && ! /^SPAM: /) {
+ if (length > 77) {
+ $self->{mime_qp_long_line} = 1;
+ }
+ $qp_bytes += length;
+ # check for illegal substrings (RFC 2045), hexadecimal values 7F-FF and
+ # control characters other than TAB, or CR and LF as parts of CRLF pairs
+ if (!$self->{mime_qp_illegal} && /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]/)
+ {
+ $self->{mime_qp_illegal} = 1;
+ }
+ # count excessive QP bytes
+ if (index($_, '=') != -1) {
+ # whoever wrote this next line is an evil hacker -- jm
+ my $qp = () = m/=(?:09|3[0-9ABCEF]|[2456][0-9A-F]|7[0-9A-E])/g;
+ if ($qp) {
+ $qp_count += $qp;
+ # tabs and spaces at end of encoded line are okay. Also, multiple
+ # whitespace at the end of a line are OK, like ">=20=20=20=20=20=20".
+ my ($trailing) = m/((?:=09|=20)+)\s*$/g;
+ if ($trailing) {
+ $qp_count -= (length($trailing) / 3);
+ }
}
- }
+ }
}
+ $previous = $_;
}
- $previous = $_;
}
+
if ($qp_bytes) {
$self->{mime_qp_ratio} = $qp_count / $qp_bytes;
}
@@ -2617,10 +2581,10 @@
my $html;
for (my $i = 0; $i <= $part; $i++) {
next if !defined $part_bytes[$i];
- if (!defined($html) && $part_type[$i] =~ /^text\/html/i) {
+ if (!defined($html) && $part_type[$i] eq 'text/html') {
$html = $part_bytes[$i];
}
- if (!defined($text) && $part_type[$i] =~ /^text\/plain/i) {
+ if (!defined($text) && $part_type[$i] eq 'text/plain') {
$text = $part_bytes[$i];
}
}

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm Wed Feb 4 14:28:34 2004
@@ -180,6 +180,14 @@
push @{ $self->{'body_parts'} }, $part;
}

+=item is_root()
+
+=cut
+
+sub is_root {
+ return ! exists $_[0]->{'raw'};
+}
+
=item raw()

Return a reference to the raw array.