Mailing List Archive

Negate Operator bug fixed.
Greets,

The "negate operator bug" in QueryParser -- which also affected other
multi-field queries with required/negated terms -- has been stomped.

QueryParser's interface has changed slightly: the 'default_field'
parameter has been deprecated and is being replaced by the plural
'fields', which should be an arrayref.

my $query_parser = KinoSearch::QueryParser::QueryParser->new(
analyzer => $analyzer,
fields => [ 'body', 'title' ],
);

I decided not to subclass QueryParser, as Java Lucene does with
MultiFieldQueryParser, since the ability to specify multiple fields is
really the only difference between those two classes, and I think
that it's more common to be searching multiple fields than just one.

A happy secondary effect is that Searcher's _prepare_simple_search
method is now a little easier to grok. Tony had suggested that it
would be nice if the transition from searching on strings to creating
Query objects wasn't quite so steep; it's just been rounded off a bit.

Since the svn trunk has now taken on the UTF-8 challenge and broken
backwards compatibility, a maintenance branch has been split off from
the 0.12 release. Version 0.13, to be released shortly, will be
taken from this branch, as will all future 0.1x releases. What is
now in trunk will probably come out as 0.20.


Marvin Humphrey

--
I'm looking for a part-time job.


slothbear:~/projects/ksmaint marvin$ svn diff
Index: lib/KinoSearch/QueryParser/QueryParser.pm
===================================================================
--- lib/KinoSearch/QueryParser/QueryParser.pm (revision 1032)
+++ lib/KinoSearch/QueryParser/QueryParser.pm (working copy)
@@ -9,7 +9,8 @@
# constructor args / members
analyzer => undef,
default_boolop => 'OR',
- default_field => undef,
+ default_field => undef, # back compat
+ fields => undef,
# members
bool_groups => {},
phrases => {},
@@ -41,6 +42,20 @@
# create labels which won't appear in search strings
$self->{phrase_re} = qr/^(_phrase$randstring\d+)/;
$self->{bool_group_re} = qr/^(_boolgroup$randstring\d+)/;
+
+ # verify fields param
+ my $fields =
+ defined $self->{fields}
+ ? $self->{fields}
+ : [ $self->{default_field} ];
+ croak("Required parameter 'fields' not supplied as arrayref")
+ unless ( defined $fields
+ and reftype($fields) eq 'ARRAY' );
+ $self->{fields} = $fields;
+
+ # verify analyzer
+ croak("Missing required param 'analyzer'")
+ unless a_isa_b( $self->{analyzer},
'KinoSearch::Analysis::Analyzer' );
}
# regex matching a quoted string
@@ -83,7 +98,7 @@
sub parse {
my ( $self, $qstring_orig ) = @_;
$qstring_orig = '' unless defined $qstring_orig;
- my $default_field = $self->{default_field};
+ my $default_fields = $self->{fields};
my $default_boolop = $self->{default_boolop};
my @clauses;
@@ -124,7 +139,7 @@
}
# set the field
- my $field = s/^$field_re// ? $1 : $default_field;
+ my $fields = s/^$field_re// ? [$1] : $default_fields;
# if a phrase label is detected...
if (s/$self->{phrase_re}//) {
@@ -133,24 +148,11 @@
# retreive the text and analyze it
my $orig_phrase_text = delete $self->{phrases}{$1};
my $token_texts = $self->_analyze($orig_phrase_text);
-
- # create a TermQuery, a PhraseQuery, or nothing
- if ( @$token_texts == 1 ) {
- my $term = KinoSearch::Index::Term->new( $field,
- $token_texts->[0] );
- $query = KinoSearch::Search::TermQuery->new( term =>
$term );
+ if (@$token_texts) {
+ my $query = $self->_get_field_query( $fields,
$token_texts );
+ push @clauses, { query => $query, occur => $occur }
+ if defined $query;
}
- elsif ( @$token_texts > 1 ) {
- $query = KinoSearch::Search::PhraseQuery->new;
- for my $token_text (@$token_texts) {
- $query->add_term(
- KinoSearch::Index::Term->new( $field,
$token_text ),
- );
- }
- }
-
- push @clauses, { query => $query, occur => $occur }
- if defined $query;
}
# if a label indicating a bool group is detected...
elsif (s/$self->{bool_group_re}//) {
@@ -162,12 +164,9 @@
# what's left is probably a term
elsif (s/([^"(\s]+)//) {
my $token_texts = $self->_analyze($1);
- my @terms = map { KinoSearch::Index::Term->new( $field,
$_ ) }
- grep { $_ ne '' }
- @$token_texts;
- for my $term (@terms) {
- my $query
- = KinoSearch::Search::TermQuery->new( term =>
$term );
+ @$token_texts = grep { $_ ne '' } @$token_texts;
+ if (@$token_texts) {
+ my $query = $self->_get_field_query( $fields,
$token_texts );
push @clauses, { occur => $occur, query => $query };
}
}
@@ -190,6 +189,50 @@
}
}
+# Wrap a TermQuery/PhraseQuery to deal with multiple fields.
+sub _get_field_query {
+ my ( $self, $fields, $token_texts ) = @_;
+
+ my @queries = grep { defined $_ }
+ map { $self->_gen_single_field_query( $_, $token_texts ) } @
$fields;
+
+ if ( @queries == 0 ) {
+ return;
+ }
+ elsif ( @queries == 1 ) {
+ return $queries[0];
+ }
+ else {
+ my $wrapper_query = KinoSearch::Search::BooleanQuery->new;
+ for my $query (@queries) {
+ $wrapper_query->add_clause(
+ query => $query,
+ occur => 'SHOULD',
+ );
+ }
+ return $wrapper_query;
+ }
+}
+
+# Create a TermQuery, a PhraseQuery, or nothing.
+sub _gen_single_field_query {
+ my ( $self, $field, $token_texts ) = @_;
+
+ if ( @$token_texts == 1 ) {
+ my $term = KinoSearch::Index::Term->new( $field,
$token_texts->[0] );
+ return KinoSearch::Search::TermQuery->new( term => $term );
+ }
+ elsif ( @$token_texts > 1 ) {
+ my $phrase_query = KinoSearch::Search::PhraseQuery->new;
+ for my $token_text (@$token_texts) {
+ $phrase_query->add_term(
+ KinoSearch::Index::Term->new( $field, $token_text ),
+ );
+ }
+ return $phrase_query;
+ }
+}
+
# break a string into tokens
sub _analyze {
my ( $self, $string ) = @_;
@@ -247,8 +290,8 @@
=head1 SYNOPSIS
my $query_parser = KinoSearch::QueryParser::QueryParser->new(
- analyzer => $analyzer,
- default_field => 'bodytext',
+ analyzer => $analyzer,
+ fields => [ 'bodytext' ],
);
my $query = $query_parser->parse( $query_string );
my $hits = $searcher->search( query => $query );
@@ -286,7 +329,7 @@
Field-specific terms, in the form of C<fieldname:termtext>. (The field
specified by fieldname will be used instead of the QueryParser's default
-field).
+fields).
=back
@@ -295,9 +338,9 @@
=head2 new
my $query_parser = KinoSearch::QueryParser::QueryParser->new(
- analyzer => $analyzer, # required
- default_field => 'bodytext', # required
- default_boolop => 'AND', # default: 'OR'
+ analyzer => $analyzer, # required
+ fields => [ 'bodytext' ], # required
+ default_boolop => 'AND', # default: 'OR'
);
Constructor. Takes hash-style parameters:
@@ -313,12 +356,15 @@
=item *
-B<default_field> - the name of the (only) field which will be searched
-against. If you need to search multiple fields, you need multiple
QueryParser
-objects.
+B<fields> - the names of the fields which will be searched against.
Must be
+supplied as an arrayref.
=item *
+B<default_field> - deprecated. Use C<fields> instead.
+
+=item *
+
B<default_boolop> - two possible values: 'AND' and 'OR'. The default
is 'OR',
which means: return documents which match any of the query terms. If
you
want only documents which match all of the query terms, set this to
'AND'.
Index: lib/KinoSearch/Searcher.pm
===================================================================
--- lib/KinoSearch/Searcher.pm (revision 1032)
+++ lib/KinoSearch/Searcher.pm (working copy)
@@ -82,23 +82,13 @@
sub _prepare_simple_search {
my ( $self, $query_string ) = @_;
- # add each parsed query as a boolean clause to a super-query
- my $super_query = KinoSearch::Search::BooleanQuery->new;
my $indexed_field_names
= $self->{reader}->get_field_names( indexed => 1 );
- for my $field_name (@$indexed_field_names) {
- my $query_parser = KinoSearch::QueryParser::QueryParser->new(
- default_field => $field_name,
- analyzer => $self->{analyzer},
- );
- my $sub_query = $query_parser->parse($query_string);
- $super_query->add_clause(
- query => $sub_query,
- occur => 'SHOULD',
- );
- }
-
- return $super_query;
+ my $query_parser = KinoSearch::QueryParser::QueryParser->new(
+ fields => $indexed_field_names,
+ analyzer => $self->{analyzer},
+ );
+ return $query_parser->parse($query_string);
}
my %search_hit_collector_args = (
Index: MANIFEST
===================================================================
--- MANIFEST (revision 1032)
+++ MANIFEST (working copy)
@@ -145,6 +145,7 @@
t/601-queryparser.t
t/602-boosts.t
t/603-query_boosts.t
+t/604-simple_search.t
t/701-uscon.t
t/999-remove_invindexes.t
t/benchmarks/extract_reuters.plx