Mailing List Archive

rt branch, 5.0/fix-html-to-plaintext-conversion-to-preserve-quoting, created. rt-5.0.0-2-g6ac1190929
The branch, 5.0/fix-html-to-plaintext-conversion-to-preserve-quoting has been created
at 6ac1190929ae21ce5bdda3e1b0cc2d6bf4ccbc06 (commit)

- Log -----------------------------------------------------------------
commit 6ac1190929ae21ce5bdda3e1b0cc2d6bf4ccbc06
Author: Dianne Skoll <dianne@bestpractical.com>
Date: Mon Aug 3 09:29:48 2020 -0400

Ticket #224369: Preserve quoting levels when converting HTML to plain text

When you reply to a message in RT using the rich text editor, RT creates
a text/plain version corresponding to the text/html version. However,
quoted text is enclosed in a <blockquote></blockquote> pair and this
makes quoted text in the text/plain part difficult to discern.

This commit uses heuristics to modify the text/plain output for each
of the supported converters to detect blockquoted material and replace
it with the standard text/plain quote prefix ">".

Note that lynx does not give us any way to detect blockquoted material,
so lynx should be marked as a deprecated converter. All of the other
supported converters do give us a way to heuristically detect blockquoted
material.

diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index f1b97cfe0c..88241e0718 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -1452,6 +1452,39 @@ sub _RecordSendEmailFailure {
}
}

+# Hash describing how various formatters format <blockquote>...</blockquote>
+# regions.  Each key is a converter name; "indent" is the number of leading
+# spaces that converter is taken to emit per blockquote nesting level.
+my $BlockquoteDescriptor = {
+    w3m       => { indent => 4 },
+    elinks    => { indent => 2 },
+    links     => { indent => 2 },
+    html2text => { indent => 5 },
+    lynx      => { indent => 2 },
+    core      => { indent => 2 },
+};
+
+=head3 ConvertBlockquoteIndentsToQuotemarks TEXT, CONVERTER
+
+Given plain text that has been converted from HTML to text, adjust
+it to quote blockquote regions with ">".
+
+TEXT is the converted plain text and CONVERTER is the name of the
+converter that produced it (one of the keys of C<$BlockquoteDescriptor>).
+Every run of leading spaces that is a whole multiple of the converter's
+indent width is replaced by one ">" per level, followed by a single space.
+
+Returns the adjusted text.  TEXT is returned unchanged if it is undefined,
+if CONVERTER is undefined or unknown, or if the converter has no usable
+indent width.
+
+=cut
+
+sub ConvertBlockquoteIndentsToQuotemarks {
+    my ($text, $converter) = @_;
+
+    # A failed conversion can hand us undef; pass it through instead of
+    # running a substitution on it (which would warn).
+    return $text unless defined $text && defined $converter;
+
+    my $desc = $BlockquoteDescriptor->{$converter}
+        or return $text;
+
+    # Guard against a missing or zero indent, which would otherwise lead
+    # to a division by zero in the replacement below.
+    my $n = $desc->{indent};
+    return $text unless $n && $n > 0;
+
+    my $spaces = ' ' x $n;
+
+    # Convert each level of indentation to a ">"; add a space afterwards
+    # for readability
+    $text =~ s|^((?:$spaces)+)|">" x (length($1)/$n) . " "|gem;
+    return $text;
+}
+
=head3 ConvertHTMLToText HTML

Takes HTML characters and converts it to plain text characters.
@@ -1466,7 +1499,10 @@ sub ConvertHTMLToText {

sub _HTMLFormatter {
state $formatter;
- return $formatter if defined $formatter;
+
+ # If we are running under the test harness, we want to create
+ # a new $formatter each time rather than once and caching.
+ return $formatter if defined $formatter && !$ENV{HARNESS_ACTIVE};

my $wanted = RT->Config->Get("HTMLFormatter");
my @options = ("w3m", "elinks", "links", "html2text", "lynx", "core");
@@ -1529,7 +1565,7 @@ sub _HTMLFormatter {
);
};
$text = Encode::decode( "UTF-8", $text );
- return $text;
+ return ConvertBlockquoteIndentsToQuotemarks($text, $prog);
};
}
RT->Config->Set( HTMLFormatter => $prog );
@@ -1558,7 +1594,7 @@ sub _HTMLFormatText {
$text //= '';
};
$RT::Logger->error("Failed to downgrade HTML to plain text: $@") if $@;
- return $text;
+ return ConvertBlockquoteIndentsToQuotemarks($text, 'core');
}


diff --git a/t/mail/html-to-text.t b/t/mail/html-to-text.t
new file mode 100644
index 0000000000..ca71b27f10
--- /dev/null
+++ b/t/mail/html-to-text.t
@@ -0,0 +1,114 @@
+use strict;
+use warnings;
+
+use RT::Test nodb => 1, tests => undef;
+
+my $html = <<'EOF'; # Input fixture: paragraphs at top level and inside two nested <blockquote> levels
+<html>
+ <head>
+ <title>Test HTML</title>
+ </head>
+ <body>
+ <p>This is a top-level paragraph.</p>
+ <blockquote>
+ <p>This is a first-level quoted paragraph</p>
+ <blockquote>
+ <p>This is a second-level quoted paragraph</p>
+ <p>So is this</p>
+ </blockquote>
+ <p>Back to first-level</p>
+ </blockquote>
+ <p>Back to top-level</p>
+ </body>
+</html>
+EOF
+
+my $expected = <<'EOF'; # Expected text when both nesting levels survive: one ">" per blockquote level
+This is a top-level paragraph.
+
+> This is a first-level quoted paragraph
+
+>> This is a second-level quoted paragraph
+
+>> So is this
+
+> Back to first-level
+
+Back to top-level
+EOF
+
+my $expected_links = <<'EOF'; # NOTE(review): no test_conversion() call in this file uses this fixture ('links' is compared against $expected) - confirm which is intended
+This is a top-level paragraph.
+
+> This is a first-level quoted paragraph
+
+> This is a second-level quoted paragraph
+
+> So is this
+
+> Back to first-level
+
+Back to top-level
+
+EOF
+
+my $expected_html2text = <<'EOF'; # Expected html2text result: same quote levels, but no blank lines between paragraphs
+This is a top-level paragraph.
+> This is a first-level quoted paragraph
+>> This is a second-level quoted paragraph
+>> So is this
+> Back to first-level
+Back to top-level
+EOF
+
+# Lynx messes up; there is no way to preserve quoting, so the expected text below carries no ">" prefixes at all. :(
+my $expected_lynx = <<'EOF';
+This is a top-level paragraph.
+
+This is a first-level quoted paragraph
+
+This is a second-level quoted paragraph
+
+So is this
+
+Back to first-level
+
+Back to top-level
+EOF
+
+# Return 1 if the converter $prog can be used on this machine, 0 otherwise.
+# The built-in 'core' converter needs no external program and is always
+# reported as available; anything else must be an executable regular file
+# in one of the directories listed in $ENV{PATH}.
+sub prog_on_path
+{
+    my ($prog) = @_;
+    return 0 unless defined $prog && length $prog;
+    return 1 if $prog eq 'core';
+
+    # File::Spec knows the platform's PATH separator, copes with an unset
+    # PATH and maps empty entries to the current directory.
+    require File::Spec;
+    foreach my $dir (File::Spec->path) {
+        my $candidate = File::Spec->catfile($dir, $prog);
+
+        # -x alone is also true for a searchable directory of that name,
+        # so insist on a plain file as well.
+        return 1 if -f $candidate && -x _;
+    }
+    return 0;
+}
+
+# Convert the shared $html fixture with the named converter and compare the
+# result with $expected.  The single test is skipped when the converter's
+# program cannot be found by prog_on_path().
+sub test_conversion
+{
+    my ($converter, $expected) = @_;
+    SKIP: {
+        # skip() leaves the SKIP block by itself, so nothing after it runs
+        # and no explicit return is needed.
+        skip "Skipping $converter: Not installed", 1
+            unless prog_on_path($converter);
+
+        # Select the converter under test before asking RT to convert.
+        RT->Config->Set(HTMLFormatter => $converter);
+        my $text = RT::Interface::Email::ConvertHTMLToText($html);
+        is($text, $expected, "Got expected HTML->text conversion using $converter");
+    }
+}
+
+# With HARNESS_ACTIVE set, RT builds a new formatter on every call instead
+# of reusing a cached one, so each converter below is really exercised.
+$ENV{HARNESS_ACTIVE} = 1;
+
+# One [ converter, expected text ] pair per supported converter, run in order.
+my @conversion_cases = (
+    [ 'w3m',       "$expected\n" ],               # w3m adds a blank line at the end
+    [ 'elinks',    $expected ],
+    [ 'links',     $expected ],
+    [ 'html2text', $expected_html2text ],
+    [ 'lynx',      $expected_lynx ],
+    [ 'core',      "Test HTML\n\n$expected\n" ],  # core adds title and blank line
+);
+
+foreach my $case (@conversion_cases) {
+    my ($name, $want) = @$case;
+    test_conversion($name, $want);
+}
+
+done_testing();
+1;

-----------------------------------------------------------------------
_______________________________________________
rt-commit mailing list
rt-commit@lists.bestpractical.com
http://lists.bestpractical.com/cgi-bin/mailman/listinfo/rt-commit