Mailing List Archive

r3685 - in trunk: c_src/KinoSearch/Highlight perl perl/lib perl/t
Author: creamyg
Date: 2008-07-30 19:24:12 -0700 (Wed, 30 Jul 2008)
New Revision: 3685

Modified:
trunk/c_src/KinoSearch/Highlight/Highlighter.bp
trunk/c_src/KinoSearch/Highlight/Highlighter.c
trunk/perl/Build.PL
trunk/perl/lib/KinoSearch.pm
trunk/perl/t/303-highlighter.t
Log:
Hand-roll an encode_entities function for Highlighter and eliminate the
dependency on HTML::Entities. Fix Highlighter so that Encode() and
Highlight() can both be overridden as advertised. Speed up highlighting when
Encode() hasn't been overridden by reusing a CharBuf.


Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.bp
===================================================================
--- trunk/c_src/KinoSearch/Highlight/Highlighter.bp 2008-07-31 02:20:52 UTC (rev 3684)
+++ trunk/c_src/KinoSearch/Highlight/Highlighter.bp 2008-07-31 02:24:12 UTC (rev 3685)
@@ -23,7 +23,7 @@
public incremented CharBuf*
Create_Excerpt(Highlighter *self, HitDoc *hit_doc);

- incremented CharBuf*
+ public incremented CharBuf*
Encode(Highlighter *self, CharBuf *text);

/** Find the sentence boundaries within the specified range, returning
@@ -41,7 +41,7 @@
Find_Sentence_Boundaries(Highlighter *self, CharBuf *text, u32_t offset = 0,
u32_t length = 0);

- incremented CharBuf*
+ public incremented CharBuf*
Highlight(Highlighter *self, const CharBuf *text);

void

Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.c
===================================================================
--- trunk/c_src/KinoSearch/Highlight/Highlighter.c 2008-07-31 02:20:52 UTC (rev 3684)
+++ trunk/c_src/KinoSearch/Highlight/Highlighter.c 2008-07-31 02:24:12 UTC (rev 3685)
@@ -1,3 +1,4 @@
+#include <ctype.h>
#include "KinoSearch/Util/ToolSet.h"

#include "KinoSearch/Highlight/Highlighter.h"
@@ -12,6 +13,20 @@
#include "KinoSearch/Util/IntMap.h"
#include "KinoSearch/Util/Native.h"

+/* If Highlighter_Encode has been overridden, return its output. If not,
+ * increment the refcount of the supplied encode_buf and call encode_entities.
+ * Either way, the caller takes responsibility for one refcount.
+ *
+ * The point of this routine is to minimize CharBuf object creation when
+ * possible.
+ */
+static CharBuf*
+do_encode(Highlighter *self, CharBuf *text, CharBuf **encode_buf);
+
+/* Place HTML entity encoded version of [text] into [encoded]. */
+static CharBuf*
+encode_entities(CharBuf *text, CharBuf *encoded);
+
Highlighter*
Highlighter_new(Searchable *searchable, Obj *query, const CharBuf *field,
u32_t excerpt_length)
@@ -329,6 +344,7 @@
u32_t i;
i32_t last_end = 0;
ZombieCharBuf temp = ZCB_make(raw_excerpt);
+ CharBuf *encode_buf = NULL;

for (i = 0; i < spans->size; i++) {
Span *span = (Span*)VA_Fetch(spans, i);
@@ -344,7 +360,7 @@
i32_t non_highlighted_len = relative_start - last_end;
ZombieCharBuf to_cat = ZCB_make((CharBuf*)&temp);
ZCB_Truncate(&to_cat, non_highlighted_len);
- encoded = Highlighter_Encode(self, (CharBuf*)&to_cat);
+ encoded = do_encode(self, (CharBuf*)&to_cat, &encode_buf);
CB_Cat(highlighted, (CharBuf*)encoded);
ZCB_Nip(&temp, non_highlighted_len);
REFCOUNT_DEC(encoded);
@@ -355,7 +371,7 @@
i32_t highlighted_len = relative_end - relative_start;
ZombieCharBuf to_cat = ZCB_make((CharBuf*)&temp);
ZCB_Truncate(&to_cat, highlighted_len);
- encoded = Highlighter_Encode(self, (CharBuf*)&to_cat);
+ encoded = do_encode(self, (CharBuf*)&to_cat, &encode_buf);
hl_frag = Highlighter_Highlight(self, encoded);
CB_Cat(highlighted, hl_frag);
ZCB_Nip(&temp, highlighted_len);
@@ -368,11 +384,13 @@

/* Last text, beyond last highlight span. */
{
- CharBuf *encoded = Highlighter_Encode(self, (CharBuf*)&temp);
+ CharBuf *encoded = do_encode(self, (CharBuf*)&temp, &encode_buf);
CB_Cat(highlighted, encoded);
REFCOUNT_DEC(encoded);
}
CB_Trim_Tail(highlighted);
+
+ REFCOUNT_DEC(encode_buf);
}

static INLINE void
@@ -452,10 +470,75 @@
CharBuf*
Highlighter_encode(Highlighter *self, CharBuf *text)
{
- return (CharBuf*)Native_callback_str(self, "do_encode", 1,
- ARG_STR("text", text));
+ CharBuf *encoded = CB_new(0);
+ UNUSED_VAR(self);
+ return encode_entities(text, encoded);
}

+static CharBuf*
+do_encode(Highlighter *self, CharBuf *text, CharBuf **encode_buf)
+{
+ if (METHOD_OVERRIDDEN(self, Highlighter, Encode, encode)) {
+ return Highlighter_Encode(self, text);
+ }
+ else {
+ if (*encode_buf == NULL) *encode_buf = CB_new(0);
+ (void)encode_entities(text, *encode_buf);
+ return REFCOUNT_INC(*encode_buf);
+ }
+}
+
+static CharBuf*
+encode_entities(CharBuf *text, CharBuf *encoded)
+{
+ ZombieCharBuf temp = ZCB_make(text);
+ size_t space = 0;
+ u32_t code_point;
+ const int MAX_ENTITY_BYTES = 9; /* &#dddddd; */
+
+ /* Scan first so that we only allocate once. */
+ while (0 != (code_point = ZCB_Nip_One(&temp))) {
+ if ( code_point > 127
+ || (!isgraph(code_point) && !isspace(code_point))
+ || code_point == '<'
+ || code_point == '>'
+ || code_point == '&'
+ || code_point == '"'
+ ) {
+ space += MAX_ENTITY_BYTES;
+ }
+ else {
+ space += 1;
+ }
+ }
+
+ CB_Grow(encoded, space);
+ CB_Set_Size(encoded, 0);
+ ZCB_Assign(&temp, text);
+ while (0 != (code_point = ZCB_Nip_One(&temp))) {
+ if (code_point > 127 || (!isgraph(code_point) && !isspace(code_point))) {
+ CB_catf(encoded, "&#%u32;", code_point);
+ }
+ else if (code_point == '<') {
+ CB_Cat_Trusted_Str(encoded, "&lt;", 4);
+ }
+ else if (code_point == '>') {
+ CB_Cat_Trusted_Str(encoded, "&gt;", 4);
+ }
+ else if (code_point == '&') {
+ CB_Cat_Trusted_Str(encoded, "&amp;", 5);
+ }
+ else if (code_point == '"') {
+ CB_Cat_Trusted_Str(encoded, "&quot;", 6);
+ }
+ else {
+ CB_Cat_Char(encoded, code_point);
+ }
+ }
+
+ return encoded;
+}
+
void
Highlighter_destroy(Highlighter *self)
{

Modified: trunk/perl/Build.PL
===================================================================
--- trunk/perl/Build.PL 2008-07-31 02:20:52 UTC (rev 3684)
+++ trunk/perl/Build.PL 2008-07-31 02:24:12 UTC (rev 3685)
@@ -42,7 +42,6 @@
'Compress::Zlib' => 0,
'Lingua::Stem::Snowball' => 0.94,
'Lingua::StopWords' => 0.06,
- 'HTML::Entities' => 1,
'JSON::XS' => 2.01,
},
build_requires => {

Modified: trunk/perl/lib/KinoSearch.pm
===================================================================
--- trunk/perl/lib/KinoSearch.pm 2008-07-31 02:20:52 UTC (rev 3684)
+++ trunk/perl/lib/KinoSearch.pm 2008-07-31 02:24:12 UTC (rev 3685)
@@ -391,19 +391,7 @@

{
package KinoSearch::Highlight::Highlighter;
- use KinoSearch::Util::ToolSet qw( confess a_isa_b );

- my $html_entities_loaded = 0;
-
- sub do_encode {
- if ( !$html_entities_loaded ) {
- require HTML::Entities;
- HTML::Entities->import('encode_entities');
- $html_entities_loaded = 1;
- }
- return encode_entities( $_[1] );
- }
-
sub find_sentence_boundaries {
my $self = shift;
return $self->_fsb(@_)->to_arrayref;

Modified: trunk/perl/t/303-highlighter.t
===================================================================
--- trunk/perl/t/303-highlighter.t 2008-07-31 02:20:52 UTC (rev 3684)
+++ trunk/perl/t/303-highlighter.t 2008-07-31 02:24:12 UTC (rev 3685)
@@ -17,9 +17,23 @@

sub analyzer { KinoSearch::Analysis::Tokenizer->new }

+package MyHighlighter;
+use base qw( KinoSearch::Highlight::Highlighter );
+
+sub encode {
+ my ( $self, $text ) = @_;
+ $text =~ s/blind/wise/;
+ return $text;
+}
+
+sub highlight {
+ my ( $self, $text ) = @_;
+ return "*$text*";
+}
+
package main;

-use Test::More tests => 33;
+use Test::More tests => 34;

binmode( STDOUT, ":utf8" );

@@ -30,7 +44,7 @@
use KinoSearch::Store::RAMFolder;

my $phi = "\x{03a6}";
-my $encoded_phi = "&Phi;";
+my $encoded_phi = "&#934;";

my $string = '1 2 3 4 5 ' x 20; # 200 characters
$string .= "$phi a b c d x y z h i j k ";
@@ -321,6 +335,16 @@
"Skip leading whitespace but get first sentence"
);

+$hl = MyHighlighter->new(
+ searchable => $searcher,
+ query => "blind",
+ field => 'content',
+);
+$hits = $searcher->search( query => 'blind' );
+$hit = $hits->fetch_hit;
+like( $hl->create_excerpt($hit), qr/\*wise\*/,
+ "override both Encode() and Highlight()" );
+
sub make_cb {
return KinoSearch::Util::CharBuf->new(shift);
}


_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits