Mailing List Archive

r3687 - in trunk: c_src/KinoSearch/Highlight perl/lib perl/lib/KinoSearch/Highlight perl/t
Author: creamyg
Date: 2008-07-31 08:57:38 -0700 (Thu, 31 Jul 2008)
New Revision: 3687

Modified:
trunk/c_src/KinoSearch/Highlight/Highlighter.bp
trunk/c_src/KinoSearch/Highlight/Highlighter.c
trunk/perl/lib/KinoSearch.pm
trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm
trunk/perl/t/303-highlighter.t
Log:
Represent sentence boundaries in the highlighter using an array of Spans,
communicating both offset and length rather than an array of integers
communicating offset only. Replace Find_Sentence_Boundaries() with
Find_Sentences().


Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.bp
===================================================================
--- trunk/c_src/KinoSearch/Highlight/Highlighter.bp 2008-07-31 03:01:32 UTC (rev 3686)
+++ trunk/c_src/KinoSearch/Highlight/Highlighter.bp 2008-07-31 15:57:38 UTC (rev 3687)
@@ -26,9 +26,11 @@
public incremented CharBuf*
Encode(Highlighter *self, CharBuf *text);

- /** Find the sentence boundaries within the specified range, returning
- * them as an array of offsets (offset from 0, not from
- * <code>start</code>).
+ /** Find sentence boundaries within the specified range, returning them as
+ * an array of Spans. The "offset" of each Span indicates the start of
+ * the sentence, and is measured from 0, not from <code>offset</code>.
+ * The Span's "length" member indicates the sentence length in code
+ * points.
*
* @param text The string to scan.
* @param offset The place to start looking for offsets, measured in
@@ -37,9 +39,9 @@
* scan. The default value of 0 is a sentinel which indicates to scan
* until the end of the string.
*/
- incremented IntMap*
- Find_Sentence_Boundaries(Highlighter *self, CharBuf *text, i32_t offset = 0,
- i32_t length = 0);
+ incremented VArray*
+ Find_Sentences(Highlighter *self, CharBuf *text, i32_t offset = 0,
+ i32_t length = 0);

public incremented CharBuf*
Highlight(Highlighter *self, const CharBuf *text);
@@ -75,7 +77,7 @@
i32_t
Raw_Excerpt(Highlighter *self, const CharBuf *field_val,
const CharBuf *fragment, CharBuf *raw_excerpt, i32_t top,
- IntMap *edges);
+ VArray *sentences);

/* Take the text in raw_excerpt, add highlight tags, encode, and place the
* result into [highlighted].

Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.c
===================================================================
--- trunk/c_src/KinoSearch/Highlight/Highlighter.c 2008-07-31 03:01:32 UTC (rev 3686)
+++ trunk/c_src/KinoSearch/Highlight/Highlighter.c 2008-07-31 15:57:38 UTC (rev 3687)
@@ -10,7 +10,6 @@
#include "KinoSearch/Search/Span.h"
#include "KinoSearch/Index/DocVector.h"
#include "KinoSearch/Util/ByteBuf.h"
-#include "KinoSearch/Util/IntMap.h"
#include "KinoSearch/Util/Native.h"

/* If Highlighter_Encode has been overridden, return its output. If not,
@@ -102,11 +101,11 @@
self->excerpt_length * 0.6666);
i32_t top = Highlighter_Find_Best_Fragment(self, (CharBuf*)field_val,
(ViewCharBuf*)&fragment, heat_map);
- IntMap *edges = Highlighter_Find_Sentence_Boundaries(self,
+ VArray *sentences = Highlighter_Find_Sentences(self,
(CharBuf*)field_val, top, self->window_width);

top = Highlighter_Raw_Excerpt(self, (CharBuf*)field_val,
- (CharBuf*)&fragment, raw_excerpt, top, edges);
+ (CharBuf*)&fragment, raw_excerpt, top, sentences);
VA_Sort(score_spans, Span_compare);
Highlighter_highlight_excerpt(self, score_spans, raw_excerpt,
highlighted, top);
@@ -115,7 +114,7 @@
REFCOUNT_DEC(score_spans);
REFCOUNT_DEC(heat_map);
REFCOUNT_DEC(raw_excerpt);
- REFCOUNT_DEC(edges);
+ REFCOUNT_DEC(sentences);

return highlighted;
}
@@ -194,7 +193,7 @@
i32_t
Highlighter_raw_excerpt(Highlighter *self, const CharBuf *field_val,
const CharBuf *fragment, CharBuf *raw_excerpt,
- i32_t top, IntMap *edges)
+ i32_t top, VArray *sentences)
{
bool_t found_starting_edge = false;
bool_t found_ending_edge = false;
@@ -203,11 +202,12 @@
i32_t this_excerpt_len;

/* Try to find a starting sentence boundary. */
- if (edges->size) {
+ if (sentences->size) {
u32_t i;

- for (i = 0; i < edges->size; i++) {
- i32_t candidate = IntMap_Get(edges, i);
+ for (i = 0; i < sentences->size; i++) {
+ Span *sentence = (Span*)VA_Fetch(sentences, i);
+ i32_t candidate = sentence->offset;

if (candidate < top){
continue;
@@ -235,13 +235,14 @@
}

/* Try to end on a sentence boundary (but don't try very hard). */
- if(edges->size) {
+ if(sentences->size) {
u32_t i;
ZombieCharBuf start_trimmed = ZCB_make(fragment);
ZCB_Nip(&start_trimmed, start - top);

- for (i = edges->size; i--; ) {
- i32_t last_edge = IntMap_Get(edges, i);
+ for (i = sentences->size; i--; ) {
+ Span *sentence = (Span*)VA_Fetch(sentences, i);
+ i32_t last_edge = sentence->offset + sentence->length;

if (last_edge <= start) {
/* Sanity. */
@@ -393,78 +394,97 @@
REFCOUNT_DEC(encode_buf);
}

-static INLINE void
-add_bound(i32_t pos, ByteBuf *bounds_bb) {
- if (bounds_bb->cap - bounds_bb->len < sizeof(i32_t)) {
- BB_Grow(bounds_bb, bounds_bb->len + 10 * sizeof(i32_t));
- }
- *(i32_t*)BBEND(bounds_bb) = pos;
- bounds_bb->len += sizeof(u32_t);
+static void
+close_sentence(VArray *sentences, Span **sentence_ptr, i32_t sentence_end)
+{
+ Span *sentence = *sentence_ptr;
+ i32_t length = sentence_end - Span_Get_Offset(sentence);
+ Span_Set_Length(sentence, length);
+ VA_Push(sentences, (Obj*)sentence);
+ REFCOUNT_DEC(sentence);
+ *sentence_ptr = NULL;
}

-IntMap*
-Highlighter_find_sentence_boundaries(Highlighter *self, CharBuf *text,
- i32_t offset, i32_t length)
+VArray*
+Highlighter_find_sentences(Highlighter *self, CharBuf *text, i32_t offset,
+ i32_t length)
{
- ByteBuf *bounds_bb = BB_new(10 * sizeof(u32_t));
- i32_t max = length == 0
- ? I32_MAX
- : offset + length;
- ZombieCharBuf substring = ZCB_make(text);
- i32_t pos = ZCB_Trim_Top(&substring);
+ /* When [sentence] is NULL, that means a sentence start has not yet been
+ * found. When it is a Span object, we have a start, but we haven't found
+ * an end. Once we find the end, we add the sentence to the [sentences]
+ * array and set [sentence] back to NULL to indicate that we're looking
+ * for a start once more.
+ */
+ Span *sentence = NULL;
+ VArray *sentences = VA_new(10);
+ i32_t excerpt_end = length == 0
+ ? I32_MAX
+ : offset + length;
+ ZombieCharBuf fragment = ZCB_make(text);
+ i32_t pos;
UNUSED_VAR(self);

- if (offset <= pos) {
+ /* Our first task will be to find a sentence that either starts at the top
+ * of the fragment, or overlaps its start. Starting at 0 -- i.e. the top
+ * of the field -- is a special case. We define the first non-whitespace
+ * character to begin a sentence, rather than look for the first character
+ * following a period and whitespace. Everywhere else, we have to define
+ * sentence starts based on a sentence end that has just passed by.
+ */
+ if (offset == 0) {
+ pos = ZCB_Trim_Top(&fragment);
/* Assume that first non-whitespace character begins a sentence. */
- if (pos < max && ZCB_Get_Size(&substring) > 0) {
- add_bound(pos, bounds_bb);
+ if (pos < excerpt_end && ZCB_Get_Size(&fragment) > 0) {
+ sentence = Span_new(pos, 0, 0.0);
}
}
- pos += ZCB_Nip(&substring, offset - pos);
+ else {
+ pos = ZCB_Nip(&fragment, offset);
+ }

- while (pos < max) {
- u32_t code_point = ZCB_Code_Point_At(&substring, 0);
+ while (1) {
+ u32_t code_point = ZCB_Code_Point_At(&fragment, 0);
if (!code_point) {
- /* End of substring. Add a bound if it's also the end of the field,
+ /* End of fragment. If we have a sentence open, close it,
* then bail. */
- if (substring.ptr == CBEND(text))
- add_bound(pos, bounds_bb);
-
+ if (sentence) close_sentence(sentences, &sentence, pos);
break;
}
else if (code_point == '.') {
u32_t whitespace_count;
- pos += ZCB_Nip(&substring, 1); /* advance past "." */
+ pos += ZCB_Nip(&fragment, 1); /* advance past "." */

- if (pos == max && ZCB_Get_Size(&substring) == 0) {
+ if (pos == excerpt_end && ZCB_Get_Size(&fragment) == 0) {
/* Period ending the field string. */
- add_bound(pos, bounds_bb);
+ if (sentence) close_sentence(sentences, &sentence, pos);
break;
}
- else if (0 != (whitespace_count = ZCB_Trim_Top(&substring))) {
+ else if (0 != (whitespace_count = ZCB_Trim_Top(&fragment))) {
+ /* We've found a period followed by whitespace. Close out the
+ * existing sentence, if there is one. */
+ if (sentence) close_sentence(sentences, &sentence, pos);
+
/* Advance past whitespace. */
pos += whitespace_count;
- if (pos < max && ZCB_Get_Size(&substring) > 0) {
- /* Not at the end of the string? Then success! */
- add_bound(pos, bounds_bb);
+ if (pos < excerpt_end && ZCB_Get_Size(&fragment) > 0) {
+ /* Not at the end of the string? Then we've found a
+ * sentence start. */
+ sentence = Span_new(pos, 0, 0.0);
}
}
+
+ /* We may not have reached the end of the field yet, but it's
+ * entirely possible that our last sentence overlapped the end of
+ * the fragment -- in which case, it's time to bail. */
+ if (pos >= excerpt_end) break;
}
else {
- ZCB_Nip(&substring, 1);
+ ZCB_Nip(&fragment, 1);
pos++;
}
}

- {
- u32_t num_bounds = bounds_bb->len / sizeof(u32_t);
- IntMap *retval;
- i32_t *ints = MALLOCATE(num_bounds, i32_t);
- memcpy(ints, bounds_bb->ptr, bounds_bb->len);
- retval = IntMap_new(ints, num_bounds);
- REFCOUNT_DEC(bounds_bb);
- return retval;
- }
+ return sentences;
}

CharBuf*

Modified: trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm 2008-07-31 03:01:32 UTC (rev 3686)
+++ trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm 2008-07-31 15:57:38 UTC (rev 3687)
@@ -18,7 +18,7 @@
get_pre_tag
set_post_tag
get_post_tag
- _fsb|find_sentence_boundaries )
+ _find_sentences|find_sentences )
],
make_getters =>
[qw( searchable query excerpt_length compiler field )],

Modified: trunk/perl/lib/KinoSearch.pm
===================================================================
--- trunk/perl/lib/KinoSearch.pm 2008-07-31 03:01:32 UTC (rev 3686)
+++ trunk/perl/lib/KinoSearch.pm 2008-07-31 15:57:38 UTC (rev 3687)
@@ -392,9 +392,9 @@
{
package KinoSearch::Highlight::Highlighter;

- sub find_sentence_boundaries {
+ sub find_sentences {
my $self = shift;
- return $self->_fsb(@_)->to_arrayref;
+ return $self->_find_sentences(@_)->to_pobj;
}
}


Modified: trunk/perl/t/303-highlighter.t
===================================================================
--- trunk/perl/t/303-highlighter.t 2008-07-31 03:01:32 UTC (rev 3686)
+++ trunk/perl/t/303-highlighter.t 2008-07-31 15:57:38 UTC (rev 3687)
@@ -81,7 +81,7 @@
my $top = $hl->_find_best_fragment(
fragment => $target,
field_val => $field_val,
- heat_map => make_heat_map( [ 2, 1 ] ),
+ heat_map => make_heat_map( [ 2, 1, 1.0 ] ),
);
is( $target->to_perl, "$phi $phi b", "Find_Best_Fragment" );
is( $top, 2, "correct offset returned by Find_Best_Fragment" );
@@ -90,7 +90,7 @@
$top = $hl->_find_best_fragment(
fragment => $target,
field_val => $field_val,
- heat_map => make_heat_map( [ 2, 1 ] ),
+ heat_map => make_heat_map( [ 2, 1, 1.0 ] ),
);
is( $target->to_perl, $field_val->to_perl,
"Find_Best_Fragment returns whole field when field is short" );
@@ -100,7 +100,7 @@
$top = $hl->_find_best_fragment(
fragment => $target,
field_val => $field_val,
- heat_map => make_heat_map( [ 6, 2 ] ),
+ heat_map => make_heat_map( [ 6, 2, 1.0 ] ),
);
is( $target->to_perl, "b$phi$phi",
"Find_Best_Fragment shifts left to deal with overrun" );
@@ -110,7 +110,7 @@
$top = $hl->_find_best_fragment(
fragment => $target,
field_val => $field_val,
- heat_map => make_heat_map( [ 0, 1 ] ),
+ heat_map => make_heat_map( [ 0, 1, 1.0 ] ),
);
is( $target->to_perl,
"a$phi" . "bcd",
@@ -132,9 +132,9 @@
fragment => "Ook. Urk.",
raw_excerpt => $target,
top => 0,
- edges => make_int_map( 0, 6 ),
+ sentences => make_spans( [ 0, 4, 0 ], [ 6, 4, 0 ] ),
);
-is( $target->to_perl, "Ook. ", "Raw_Excerpt at top" );
+is( $target->to_perl, "Ook.", "Raw_Excerpt at top" );
is( $top, 0, "top still 0" );

$target = make_cb("");
@@ -143,29 +143,30 @@
fragment => ". Urk. I",
raw_excerpt => $target,
top => 3,
- edges => make_int_map( 6, 12 ),
+ sentences => make_spans( [ 6, 4, 0 ], [ 12, 4, 0 ] ),
);
-is( $target->to_perl, "Urk. ", "Raw_Excerpt in middle, with 2 bounds" );
+is( $target->to_perl, "Urk.", "Raw_Excerpt in middle, with 2 bounds" );
is( $top, 6, "top in the middle modified by Raw_Excerpt" );

-$target = make_cb("");
-$top = $hl->_raw_excerpt(
- field_val => "Ook urk ick iz",
- fragment => "ick iz",
+$target = make_cb("");
+$field_val = "Ook urk ick i.";
+$top = $hl->_raw_excerpt(
+ field_val => $field_val,
+ fragment => "ick i.",
raw_excerpt => $target,
top => 8,
- edges => make_int_map(14),
+ sentences => make_spans( [ 0, length($field_val), 0 ] ),
);
-is( $target->to_perl, "... iz", "Ellipsis at top" );
+is( $target->to_perl, "... i.", "Ellipsis at top" );
is( $top, 8, "top correct when leading ellipsis inserted" );

$target = make_cb("");
-$top = $hl->_raw_excerpt(
- field_val => "Urk. Iz no good.",
+$field_val = "Urk. Iz no good.", $top = $hl->_raw_excerpt(
+ field_val => $field_val,
fragment => " Iz no go",
raw_excerpt => $target,
top => 4,
- edges => make_int_map(6),
+ sentences => make_spans( [ 6, length($field_val) - 6, 0 ] ),
);
is( $target->to_perl, "Iz...", "Ellipsis at end" );
is( $top, 6, "top trimmed" );
@@ -189,7 +190,7 @@
$target = make_cb("");
$hl->_highlight_excerpt(
raw_excerpt => "$phi $phi $phi",
- spans => make_spans( [ 2, 1 ] ),
+ spans => make_spans( [ 2, 1, 1.0 ] ),
top => 0,
highlighted => $target,
);
@@ -202,7 +203,7 @@
$target = make_cb("");
$hl->_highlight_excerpt(
raw_excerpt => "$phi $phi $phi",
- spans => make_spans( [ 3, 1 ] ),
+ spans => make_spans( [ 3, 1, 1.0 ] ),
top => 1,
highlighted => $target,
);
@@ -298,74 +299,86 @@
"... but not another field"
);

-my $sentences = 'This is a sentence. ' x 15;
+my $sentence_text = 'This is a sentence. ' x 15;
$hl = KinoSearch::Highlight::Highlighter->new(
searchable => $searcher,
query => $q,
field => 'content',
);
-is_deeply(
- $hl->find_sentence_boundaries(
- text => $sentences,
- offset => 101,
- length => 50,
- ),
- [ 120, 140 ],
- 'find_sentence_boundaries in list context with explicit args'
+my $sentences = $hl->find_sentences(
+ text => $sentence_text,
+ offset => 101,
+ length => 50,
);
is_deeply(
- $hl->find_sentence_boundaries(
- text => $sentences,
- offset => 101,
- length => 4,
- ),
- [],
- 'fsb with explicit args, finding nothing'
+ spans_to_arg_array($sentences),
+ [ [ 120, 19, 0 ], [ 140, 19, 0 ] ],
+ 'find_sentences with explicit args'
);
-is_deeply(
- $hl->find_sentence_boundaries( text => $sentences ),
- [ 0, 20, 40, 60, 80, 100, 120, 140,
- 160, 180, 200, 220, 240, 260, 280, 300
- ],
- 'fsb with default offset and length'
+
+$sentences = $hl->find_sentences(
+ text => $sentence_text,
+ offset => 101,
+ length => 4,
);
+is_deeply( spans_to_arg_array($sentences),
+ [], 'find_sentences with explicit args, finding nothing' );
+
+my @expected;
+for my $i ( 0 .. 14 ) {
+ push @expected, [ $i * 20, 19, 0 ];
+}
+$sentences = $hl->find_sentences( text => $sentence_text );
+is_deeply( spans_to_arg_array($sentences),
+ \@expected, 'find_sentences with default offset and length' );
+
+$sentences = $hl->find_sentences( text => ' Foo' );
is_deeply(
- $hl->find_sentence_boundaries( text => ' Foo' ),
- [ 1, 4 ],
+ spans_to_arg_array($sentences),
+ [ [ 1, 3, 0 ] ],
"Skip leading whitespace but get first sentence"
);

$hl = MyHighlighter->new(
- searchable => $searcher,
- query => "blind",
- field => 'content',
+ searchable => $searcher,
+ query => "blind",
+ field => 'content',
);
$hits = $searcher->search( query => 'blind' );
$hit = $hits->fetch_hit;
-like( $hl->create_excerpt($hit), qr/\*wise\*/,
- "override both Encode() and Highlight()" );
+like( $hl->create_excerpt($hit),
+ qr/\*wise\*/, "override both Encode() and Highlight()" );

sub make_cb {
return KinoSearch::Util::CharBuf->new(shift);
}

+sub make_heat_map {
+ return KinoSearch::Highlight::HeatMap->new( spans => make_spans(@_) );
+}
+
+sub make_span {
+ return KinoSearch::Search::Span->new(
+ offset => $_[0],
+ length => $_[1],
+ weight => $_[2],
+ );
+}
+
sub make_spans {
- my $spans = KinoSearch::Util::VArray->new( capacity => @_ / 2 );
+ my $spans = KinoSearch::Util::VArray->new( capacity => scalar @_ );
for my $span_spec (@_) {
- my $span = KinoSearch::Search::Span->new(
- offset => $span_spec->[0],
- length => $span_spec->[1],
- weight => 1,
- );
- $spans->push($span);
+ $spans->push( make_span( @{$span_spec}[ 0 .. 2 ] ) );
}
return $spans;
}

-sub make_heat_map {
- return KinoSearch::Highlight::HeatMap->new( spans => make_spans(@_) );
+sub spans_to_arg_array {
+ my $spans = shift;
+ my @out;
+ for (@$spans) {
+ push @out, [ $_->get_offset, $_->get_length, $_->get_weight ];
+ }
+ return \@out;
}

-sub make_int_map {
- return KinoSearch::Util::IntMap->new( ints => [@_] );
-}


_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits