Of the remaining tasks, the attached patch addresses these:
> * Add perfunctory tests for analyze_field to the relevant test files.
> o 150-polyanalyzer.t
> o 151-analyzer.t
> o 153-lc_normalizer.t
> o 154-tokenizer.t
> o 155-stopalizer.t
> o 156-stemmer.t
>
>
> * Change SegWriter to use analyze_field. [NOTE: Marvin did this]
> * Add optimized analyze_field implementations to LCNormalizer and
> PolyAnalyzer.
> * Add optimized analyze_field implementation to Tokenizer. This one's
> harder because it requires some advanced XS.
>
> * Copy and paste the utf8ify code into StringHelper.pm.
> * Add some tests to verify that it works.
> * Replace calls to utf8::upgrade with utf8ify.
> * We'll skip moving the utf8 conversion from InvIndexer to the
> Analyzers for now, since that has other implications.
>
I'm not convinced that the approach I took in Tokenizer's XS is optimal.
But it passes all tests.
Still TODO:
> * Test that you can mod a document's contents, using code nearly
> identical to what will end up in the Swish/KS glue eventually.
> * Expand Analyzer's docs with regard to subclassing.
I expect I'll be able to do both of those once I think a little more about what
I want Swish to do.
--
Peter Karman .
http://peknet.com/ . peter@peknet.com
-------------- next part --------------
Index: buildlib/KinoTestUtils.pm
===================================================================
--- buildlib/KinoTestUtils.pm (revision 2433)
+++ buildlib/KinoTestUtils.pm (working copy)
@@ -150,6 +150,13 @@
@got = $analyzer->analyze_raw($source);
Test::More::is_deeply( \@got, $expected, "analyze_raw: $message" );
+
+ $batch = $analyzer->analyze_field({content => $source}, 'content');
+ @got = ();
+ while ( my $token = $batch->next ) {
+ push @got, $token->get_text;
+ }
+ Test::More::is_deeply( \@got, $expected, "analyze_field: $message" );
}
1;
Index: t/508-hits.t
===================================================================
--- t/508-hits.t (revision 2433)
+++ t/508-hits.t (working copy)
@@ -8,7 +8,7 @@
use KinoSearch::Searcher;
use KinoTestUtils qw( create_invindex );
-my @docs = ( 'a b', 'a a b', 'a a a b', 'x' );
+my @docs = ( 'a b', 'a a b', 'a a a b', 'x' );
my $invindex = create_invindex(@docs);
my $searcher = KinoSearch::Searcher->new( invindex => $invindex, );
Index: t/154-tokenizer.t
===================================================================
--- t/154-tokenizer.t (revision 2433)
+++ t/154-tokenizer.t (working copy)
@@ -1,7 +1,7 @@
use strict;
use warnings;
-use Test::More tests => 8;
+use Test::More tests => 9;
use KinoSearch::Analysis::Tokenizer;
use KinoSearch::Analysis::TokenBatch;
@@ -51,3 +51,13 @@
[ 'a', ' ', 'b', ' ', 'c' ],
"no freakout when fed multiple tokens"
);
+
+$batch->reset;
+$tokenizer = KinoSearch::Analysis::Tokenizer->new();
+$batch
+ = $tokenizer->analyze_field( { monroe => 'some like it hot' }, 'monroe' );
+@token_texts = ();
+while ( my $token = $batch->next ) {
+ push @token_texts, $token->get_text;
+}
+is_deeply( \@token_texts, [ 'some', 'like', 'it', 'hot' ], "analyze_field" );
Index: t/601-queryparser.t
===================================================================
--- t/601-queryparser.t (revision 2433)
+++ t/601-queryparser.t (working copy)
@@ -39,7 +39,7 @@
sub analyzer { KinoSearch::Analysis::Tokenizer->new }
package main;
-use Test::More tests => 210;
+use Test::More tests => 212;
use KinoSearch::QueryParser::QueryParser;
use KinoSearch::Analysis::PolyAnalyzer;
@@ -47,7 +47,7 @@
use KinoSearch::InvIndexer;
use KinoSearch::Searcher;
use KinoSearch::Store::RAMFolder;
-use KinoSearch::Util::StringHelper qw( utf8_flag_on );
+use KinoSearch::Util::StringHelper qw( utf8_flag_on utf8ify );
use KinoTestUtils qw( create_invindex );
@@ -197,6 +197,16 @@
$hits = $searcher->search( query => $motorhead );
is( $hits->total_hits, 1, "QueryParser parses UTF-8 strings correctly" );
+$motorhead = "Mot\xF6rhead";
+utf8ify($motorhead);
+$unicode_invindex = create_invindex($motorhead);
+$searcher = KinoSearch::Searcher->new( invindex => $unicode_invindex, );
+
+$hits = $searcher->search( query => 'Mot' );
+is( $hits->total_hits, 0, "Pre-test - indexing worked properly" );
+$hits = $searcher->search( query => $motorhead );
+is( $hits->total_hits, 1, "QueryParser utf8ifys UTF-8 strings correctly" );
+
my $mf_folder = KinoSearch::Store::RAMFolder->new;
my $mf_schema = MultiFieldSchema->new;
my $mf_invindex = KinoSearch::InvIndex->create(
Index: t/519-and_or_scorer.t
===================================================================
--- t/519-and_or_scorer.t (revision 2433)
+++ t/519-and_or_scorer.t (working copy)
@@ -123,7 +123,7 @@
my $score_docs = $hc->get_hit_queue->score_docs;
my @by_score_then_num = map { $_->get_doc_num }
sort {
- $b->get_score <=> $a->get_score
+ $b->get_score <=> $a->get_score
|| $a->get_doc_num <=> $b->get_doc_num
} @$score_docs;
my @by_num = sort { $a <=> $b } @by_score_then_num;
Index: t/155-stopalizer.t
===================================================================
--- t/155-stopalizer.t (revision 2433)
+++ t/155-stopalizer.t (working copy)
@@ -2,7 +2,7 @@
use warnings;
use lib 'buildlib';
-use Test::More tests => 6;
+use Test::More tests => 8;
use KinoTestUtils qw( test_analyzer );
use KinoSearch::Analysis::Stopalizer;
Index: t/016-varray.t
===================================================================
--- t/016-varray.t (revision 2433)
+++ t/016-varray.t (working copy)
@@ -10,7 +10,7 @@
my ( $varray, @orig, @got );
$varray = KinoSearch::Util::VArray->new( capacity => 0 );
-@orig = 1 .. 10;
+@orig = 1 .. 10;
$varray->push( KinoSearch::Util::ByteBuf->new($_) ) for @orig;
is( $varray->get_size, 10, "get_size after pushing 10 elements" );
Index: t/215-term_vectors.t
===================================================================
--- t/215-term_vectors.t (revision 2433)
+++ t/215-term_vectors.t (working copy)
@@ -47,12 +47,12 @@
$invindexer->finish;
my $searcher = KinoSearch::Searcher->new( invindex => $invindex );
-my $doc_vec = $searcher->fetch_doc_vec(0);
+my $doc_vec = $searcher->fetch_doc_vec(0);
my $term_vector = $doc_vec->term_vector( "content", "foo" );
ok( defined $term_vector, "successfully retrieved term vector" );
-$doc_vec = $searcher->fetch_doc_vec(1);
+$doc_vec = $searcher->fetch_doc_vec(1);
$term_vector = $doc_vec->term_vector( 'content', 'mañana' );
ok( defined $term_vector, "utf-8 term vector retrieved" );
Index: t/151-analyzer.t
===================================================================
--- t/151-analyzer.t (revision 2433)
+++ t/151-analyzer.t (working copy)
@@ -2,7 +2,7 @@
use warnings;
use lib 'buildlib';
-use Test::More tests => 5;
+use Test::More tests => 6;
use KinoSearch::Analysis::Analyzer;
use KinoTestUtils qw( utf8_test_strings test_analyzer );
Index: t/505-hit_queue.t
===================================================================
--- t/505-hit_queue.t (revision 2433)
+++ t/505-hit_queue.t (working copy)
@@ -26,7 +26,8 @@
} @docs_and_scores;
my @correct_order = sort {
- $b->get_score <=> $a->get_score or $a->get_doc_num <=> $b->get_doc_num
+ $b->get_score <=> $a->get_score
+ or $a->get_doc_num <=> $b->get_doc_num
} @score_docs;
my @correct_docs = map { $_->get_doc_num } @correct_order;
my @correct_scores = map { $_->get_score } @correct_order;
Index: t/153-lc_normalizer.t
===================================================================
--- t/153-lc_normalizer.t (revision 2433)
+++ t/153-lc_normalizer.t (working copy)
@@ -2,7 +2,7 @@
use warnings;
use lib 'buildlib';
-use Test::More tests => 3;
+use Test::More tests => 4;
use KinoTestUtils qw( test_analyzer );
use KinoSearch::Analysis::LCNormalizer;
Index: t/518-or_scorer.t
===================================================================
--- t/518-or_scorer.t (revision 2433)
+++ t/518-or_scorer.t (working copy)
@@ -83,7 +83,7 @@
perform_search( [ 'a' .. $_ ] ) for 'a' .. 'z';
sub perform_search {
- my $letters = shift;
+ my $letters = shift;
my $letter_string = join ' ', @$letters;
my $subscorers
@@ -125,8 +125,8 @@
my @doc_nums = keys %{ $letters{$letter} };
$counts{$_} += 1 for @doc_nums;
}
- my @by_count_then_num =
- sort { $counts{$b} <=> $counts{$a} || $a <=> $b }
+ my @by_count_then_num
+ = sort { $counts{$b} <=> $counts{$a} || $a <=> $b }
keys %counts;
my @by_num = sort { $a <=> $b } @by_count_then_num;
@@ -139,7 +139,7 @@
my $score_docs = $hc->get_hit_queue->score_docs;
my @by_score_then_num = map { $_->get_doc_num }
sort {
- $b->get_score <=> $a->get_score
+ $b->get_score <=> $a->get_score
|| $a->get_doc_num <=> $b->get_doc_num
} @$score_docs;
my @by_num = sort { $a <=> $b } @by_score_then_num;
Index: t/012-priority_queue.t
===================================================================
--- t/012-priority_queue.t (revision 2433)
+++ t/012-priority_queue.t (working copy)
@@ -30,7 +30,7 @@
);
1 while defined $pq->pop; # empty queue;
-$pq = KinoSearch::Util::PriorityQueue->new( max_size => 5 );
+$pq = KinoSearch::Util::PriorityQueue->new( max_size => 5 );
@prioritized = ();
$pq->insert($_) for ( 1 .. 10, -3, 1590 .. 1600, 5 );
@@ -50,4 +50,3 @@
$pq->insert( splice( @nums, $tick, 1 ) );
}
is_deeply( $pq->pop_all, [ reverse 1 .. 100 ], "random order insertion" );
-
Index: t/156-stemmer.t
===================================================================
--- t/156-stemmer.t (revision 2433)
+++ t/156-stemmer.t (working copy)
@@ -2,7 +2,7 @@
use warnings;
use lib 'buildlib';
-use Test::More tests => 6;
+use Test::More tests => 8;
use KinoTestUtils qw( test_analyzer );
use KinoSearch::Analysis::Stemmer;
Index: t/514-and_scorer.t
===================================================================
--- t/514-and_scorer.t (revision 2433)
+++ t/514-and_scorer.t (working copy)
@@ -19,7 +19,7 @@
push @docs, ('c d x');
my $invindex = create_invindex(@docs);
-my $searcher = KinoSearch::Searcher->new( invindex => $invindex, );
+my $searcher = KinoSearch::Searcher->new( invindex => $invindex, );
my $similarity = KinoSearch::Search::Similarity->new;
my $c_query = KinoSearch::Search::TermQuery->new(
@@ -96,4 +96,3 @@
}
return \@doc_nums;
}
-
Index: t/150-polyanalyzer.t
===================================================================
--- t/150-polyanalyzer.t (revision 2433)
+++ t/150-polyanalyzer.t (working copy)
@@ -2,7 +2,7 @@
use warnings;
use lib 'buildlib';
-use Test::More tests => 15;
+use Test::More tests => 20;
use KinoTestUtils qw( test_analyzer );
Index: t/013-bit_vector.t
===================================================================
--- t/013-bit_vector.t (revision 2433)
+++ t/013-bit_vector.t (working copy)
@@ -60,7 +60,7 @@
}
}
-my @set_1 = ( 1 .. 3, 10, 20, 30 );
+my @set_1 = ( 1 .. 3, 10, 20, 30 );
my @set_2 = ( 2 .. 10, 25 .. 35 );
$bit_vec = KinoSearch::Util::BitVector->new;
Index: lib/KinoSearch/QueryParser/QueryParser.pm
===================================================================
--- lib/KinoSearch/QueryParser/QueryParser.pm (revision 2433)
+++ lib/KinoSearch/QueryParser/QueryParser.pm (working copy)
@@ -3,6 +3,7 @@
package KinoSearch::QueryParser::QueryParser;
use KinoSearch::Util::ToolSet;
+use KinoSearch::Util::StringHelper qw( utf8ify );
use base qw( KinoSearch::Util::Class );
our %instance_vars = (
@@ -100,7 +101,7 @@
sub parse {
my ( $self, $qstring_orig ) = @_;
$qstring_orig = '' unless defined $qstring_orig;
- utf8::upgrade($qstring_orig);
+ utf8ify($qstring_orig);
my $default_fields = $self->{fields};
my $default_boolop = $self->{default_boolop};
my @clauses;
Index: lib/KinoSearch/Analysis/PolyAnalyzer.pm
===================================================================
--- lib/KinoSearch/Analysis/PolyAnalyzer.pm (revision 2433)
+++ lib/KinoSearch/Analysis/PolyAnalyzer.pm (working copy)
@@ -18,7 +18,7 @@
use KinoSearch::Analysis::Stemmer;
sub init_instance {
- my $self = shift;
+ my $self = shift;
my $language = $self->{language} = lc( $self->{language} );
# create a default set of analyzers if language was specified
@@ -61,6 +61,24 @@
}
}
+sub analyze_field {
+ my $analyzers = $_[0]->{analyzers};
+
+ if ( !@$analyzers ) {
+ return KinoSearch::Analysis::TokenBatch->new(
+ text => $_[1]->{ $_[2] } );
+ }
+ elsif ( @$analyzers == 1 ) {
+ return $analyzers->[0]->analyze_field( $_[1], $_[2] );
+ }
+ else {
+ my $batch = $analyzers->[0]->analyze_field( $_[1], $_[2] );
+ $batch = $_->analyze_batch($batch)
+ for @{$analyzers}[ 1 .. $#$analyzers ];
+ return $batch;
+ }
+}
+
1;
__END__
Index: lib/KinoSearch/Analysis/Tokenizer.pm
===================================================================
--- lib/KinoSearch/Analysis/Tokenizer.pm (revision 2433)
+++ lib/KinoSearch/Analysis/Tokenizer.pm (working copy)
@@ -25,12 +25,13 @@
MODULE = KinoSearch PACKAGE = KinoSearch::Analysis::Tokenizer
kino_TokenBatch*
-_do_analyze(self_hv, batch_or_text_sv)
+_do_analyze(self_hv, batch_or_text_sv, ...)
HV *self_hv;
SV *batch_or_text_sv;
ALIAS:
analyze_batch = 1
analyze_text = 2
+ analyze_field = 3
CODE:
{
kino_TokenBatch *batch = NULL;
@@ -40,11 +41,30 @@
chy_u32_t num_code_points = 0;
SV *wrapper = sv_newmortal();
RETVAL = kino_TokenBatch_new(NULL);
-
+ char *string = NULL;
+ STRLEN string_len = 0;
+
if (ix == 1) {
EXTRACT_STRUCT( batch_or_text_sv, batch, kino_TokenBatch*,
"KinoSearch::Analysis::TokenBatch");
}
+ if (ix == 2) {
+ string = SvPVutf8( ST(1), string_len );
+ }
+ if (ix == 3) {
+ if (items != 3)
+ CONFESS("analyze_text() takes 2 arguments, got %d", items - 1);
+ if (!SvROK(batch_or_text_sv))
+ CONFESS("first argument to analyze_text() must be hash ref");
+
+ STRLEN len;
+ char *field_name = SvPV(ST(2), len);
+ string = SvPVutf8(extract_sv(
+ (HV*)SvRV(batch_or_text_sv),
+ field_name,
+ len),
+ string_len);
+ }
/* extract regexp struct from qr// entity */
if (SvROK(token_re)) {
@@ -63,7 +83,6 @@
SvUTF8_on(wrapper);
while (1) {
- STRLEN len;
char *string_beg;
char *string_end;
char *string_arg;
@@ -72,20 +91,20 @@
kino_Token *token = Kino_TokenBatch_Next(batch);
if (token == NULL)
break;
- len = token->len;
+ string_len = token->len;
string_beg = token->text;
- string_end = string_beg + len;
+ string_end = string_beg + string_len;
string_arg = string_beg;
}
else {
- string_beg = SvPVutf8( ST(1), len );
- string_end = string_beg + len;
+ string_beg = string;
+ string_end = string_beg + string_len;
string_arg = string_beg;
}
/* wrap the string in an SV to please the regex engine */
SvPVX(wrapper) = string_beg;
- SvCUR_set(wrapper, len);
+ SvCUR_set(wrapper, string_len);
SvPOK_on(wrapper);
while (
@@ -128,7 +147,7 @@
REFCOUNT_DEC(new_token);
}
- if (ix == 2) /* analyze_text only runs one loop iter */
+ if (ix > 1) /* analyze_text and analyze_field only run one loop iter */
break;
}
}
Index: lib/KinoSearch/Analysis/Stopalizer.pm
===================================================================
--- lib/KinoSearch/Analysis/Stopalizer.pm (revision 2433)
+++ lib/KinoSearch/Analysis/Stopalizer.pm (working copy)
@@ -16,7 +16,7 @@
use Lingua::StopWords;
sub init_instance {
- my $self = shift;
+ my $self = shift;
my $language = $self->{language} = lc( $self->{language} );
# verify a supplied stoplist
@@ -139,4 +139,3 @@
See L<KinoSearch> version 0.20.
=cut
-
Index: lib/KinoSearch/Analysis/LCNormalizer.pm
===================================================================
--- lib/KinoSearch/Analysis/LCNormalizer.pm (revision 2433)
+++ lib/KinoSearch/Analysis/LCNormalizer.pm (working copy)
@@ -31,6 +31,12 @@
return $_[0]->analyze_batch($batch);
}
+sub analyze_field {
+ my $batch = KinoSearch::Analysis::TokenBatch->new(
+ text => lc( $_[1]->{ $_[2] } ) );
+ return $_[0]->analyze_batch($batch);
+}
+
1;
__END__
Index: lib/KinoSearch/Index/SegWriter.pm
===================================================================
--- lib/KinoSearch/Index/SegWriter.pm (revision 2433)
+++ lib/KinoSearch/Index/SegWriter.pm (working copy)
@@ -3,6 +3,7 @@
package KinoSearch::Index::SegWriter;
use KinoSearch::Util::ToolSet;
+use KinoSearch::Util::StringHelper qw( utf8ify );
use base qw( KinoSearch::Util::Class );
our %instance_vars = (
@@ -91,7 +92,7 @@
# upgrade fields that aren't binary to utf8
if ( !$field_spec->binary ) {
- utf8::upgrade( $doc->{$field_name} );
+ utf8ify( $doc->{$field_name} );
}
next unless $field_spec->indexed;
Index: lib/KinoSearch/Util/StringHelper.pm
===================================================================
--- lib/KinoSearch/Util/StringHelper.pm (revision 2433)
+++ lib/KinoSearch/Util/StringHelper.pm (working copy)
@@ -9,6 +9,7 @@
utf8_flag_off
to_base36
from_base36
+ utf8ify
);
1;
@@ -62,6 +63,19 @@
RETVAL = strtol(str, NULL, 36);
OUTPUT: RETVAL
+=for comment
+
+Upgrade an SV to UTF-8, converting from Latin-1 if necessary. Equivalent to utf8::upgrade().
+
+=cut
+
+void
+utf8ify(sv)
+ SV *sv;
+PPCODE:
+ sv_utf8_upgrade(sv);
+
+
__POD__
=begin devdocs