Mailing List Archive

r3878 - in trunk: c_src/KinoSearch/Posting perl perl/lib/KinoSearch/Posting perl/t
Author: creamyg
Date: 2008-09-17 20:01:28 -0700 (Wed, 17 Sep 2008)
New Revision: 3878

Added:
trunk/perl/t/400-match_posting.t
Modified:
trunk/c_src/KinoSearch/Posting/MatchPosting.bp
trunk/c_src/KinoSearch/Posting/MatchPosting.c
trunk/c_src/KinoSearch/Posting/ScorePosting.bp
trunk/c_src/KinoSearch/Posting/ScorePosting.c
trunk/perl/MANIFEST
trunk/perl/lib/KinoSearch/Posting/MatchPosting.pm
trunk/perl/lib/KinoSearch/Posting/ScorePosting.pm
Log:
Finish a preliminary implementation of MatchPosting. This version includes freq,
which is not ideal -- but the primary rationale is testing, not a public API.


Modified: trunk/c_src/KinoSearch/Posting/MatchPosting.bp
===================================================================
--- trunk/c_src/KinoSearch/Posting/MatchPosting.bp 2008-09-18 01:18:15 UTC (rev 3877)
+++ trunk/c_src/KinoSearch/Posting/MatchPosting.bp 2008-09-18 03:01:28 UTC (rev 3878)
@@ -2,8 +2,6 @@

/** Match but not score documents.
*
- * TODO: This class is not yet fully implemented.
- *
* Use MatchPosting for fields which only need to be matched, not scored. For
* instance, if you need to determine that a query matches a particular
* category, but don't want the match to contribute to the document score, use
@@ -13,6 +11,7 @@
extends KinoSearch::Posting {

Similarity *sim;
+ u32_t freq;

/* Constructor.
*/
@@ -23,14 +22,31 @@
init(MatchPosting *self, Similarity *similarity);

void
+ Destroy(MatchPosting *self);
+
+ incremented MatchPosting*
+ Clone(MatchPosting *self);
+
+ void
+ Read_Record(MatchPosting *self, InStream *instream);
+
+ incremented RawPosting*
+ Read_Raw(MatchPosting *self, InStream *instream, i32_t last_doc_num,
+ CharBuf *term_text, MemoryPool *mem_pool);
+
+ void
+ Add_Inversion_To_Pool(MatchPosting *self, PostingPool *post_pool,
+ Inversion *inversion, FieldSpec *fspec,
+ i32_t doc_num, float doc_boost,
+ float length_norm);
+
+ void
Reset(MatchPosting *self, i32_t doc_num);

public incremented MatchPostingScorer*
Make_Scorer(MatchPosting *self, Similarity *sim, PostingList *plist,
Compiler *compiler);

- void
- Destroy(MatchPosting *self);
}

class KinoSearch::Posting::MatchPostingScorer cnick MatchPostScorer

Modified: trunk/c_src/KinoSearch/Posting/MatchPosting.c
===================================================================
--- trunk/c_src/KinoSearch/Posting/MatchPosting.c 2008-09-18 01:18:15 UTC (rev 3877)
+++ trunk/c_src/KinoSearch/Posting/MatchPosting.c 2008-09-18 03:01:28 UTC (rev 3878)
@@ -1,10 +1,22 @@
#include "KinoSearch/Util/ToolSet.h"

#include "KinoSearch/Posting/MatchPosting.h"
+#include "KinoSearch/Analysis/Inversion.h"
+#include "KinoSearch/Analysis/Token.h"
+#include "KinoSearch/FieldSpec.h"
#include "KinoSearch/Index/PostingList.h"
+#include "KinoSearch/Index/PostingPool.h"
+#include "KinoSearch/Posting/RawPosting.h"
#include "KinoSearch/Search/Similarity.h"
#include "KinoSearch/Search/Compiler.h"
+#include "KinoSearch/Store/InStream.h"
+#include "KinoSearch/Util/MemoryPool.h"

+#define MAX_RAW_POSTING_LEN(_text_len) \
+ ( sizeof(RawPosting) \
+ + _text_len + 1 /* term text content */ \
+ )
+
MatchPosting*
MatchPost_new(Similarity *sim)
{
@@ -26,12 +38,72 @@
FREE_OBJ(self);
}

+MatchPosting*
+MatchPost_clone(MatchPosting *self)
+{
+ MatchPosting *evil_twin = (MatchPosting*)VTable_Make_Obj(self->_);
+ return MatchPost_init(evil_twin, self->sim);
+}
+
void
MatchPost_reset(MatchPosting *self, i32_t doc_num)
{
self->doc_num = doc_num;
}

+void
+MatchPost_read_record(MatchPosting *self, InStream *instream)
+{
+ const u32_t doc_code = InStream_Read_C32(instream);
+ const u32_t doc_delta = doc_code >> 1;
+
+ /* Apply delta doc and retrieve freq. */
+ self->doc_num += doc_delta;
+ if (doc_code & 1)
+ self->freq = 1;
+ else
+ self->freq = InStream_Read_C32(instream);
+}
+
+RawPosting*
+MatchPost_read_raw(MatchPosting *self, InStream *instream, i32_t last_doc_num,
+ CharBuf *term_text, MemoryPool *mem_pool)
+{
+ const size_t text_size = CB_Get_Size(term_text);
+ const u32_t doc_code = InStream_Read_C32(instream);
+ const u32_t delta_doc = doc_code >> 1;
+ const i32_t doc_num = last_doc_num + delta_doc;
+ const u32_t freq = (doc_code & 1)
+ ? 1
+ : InStream_Read_C32(instream);
+ size_t raw_post_bytes = MAX_RAW_POSTING_LEN(text_size);
+ void *const allocation = MemPool_Grab(mem_pool, raw_post_bytes);
+
+ return RawPost_new(allocation, doc_num, freq, term_text->ptr, text_size);
+}
+
+void
+MatchPost_add_inversion_to_pool(MatchPosting *self, PostingPool *post_pool,
+ Inversion *inversion, FieldSpec *fspec,
+ i32_t doc_num, float doc_boost,
+ float length_norm)
+{
+ MemoryPool *mem_pool = post_pool->mem_pool;
+ Token **tokens;
+ u32_t freq;
+
+ Inversion_Reset(inversion);
+ while ( (tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL ) {
+ Token *token = *tokens;
+ u32_t raw_post_bytes = MAX_RAW_POSTING_LEN(token->len);
+ RawPosting *raw_posting = RawPost_new(
+ MemPool_Grab(mem_pool, raw_post_bytes), doc_num, freq,
+ token->text, token->len
+ );
+ PostPool_Add_Elem(post_pool, (Obj*)raw_posting);
+ }
+}
+
MatchPostingScorer*
MatchPost_make_scorer(MatchPosting *self, Similarity *sim,
PostingList *plist, Compiler *compiler)

Modified: trunk/c_src/KinoSearch/Posting/ScorePosting.bp
===================================================================
--- trunk/c_src/KinoSearch/Posting/ScorePosting.bp 2008-09-18 01:18:15 UTC (rev 3877)
+++ trunk/c_src/KinoSearch/Posting/ScorePosting.bp 2008-09-18 03:01:28 UTC (rev 3878)
@@ -9,7 +9,6 @@
class KinoSearch::Posting::ScorePosting cnick ScorePost
extends KinoSearch::Posting::MatchPosting {

- u32_t freq;
float weight;
u32_t *prox;
u32_t prox_cap;

Modified: trunk/c_src/KinoSearch/Posting/ScorePosting.c
===================================================================
--- trunk/c_src/KinoSearch/Posting/ScorePosting.c 2008-09-18 01:18:15 UTC (rev 3877)
+++ trunk/c_src/KinoSearch/Posting/ScorePosting.c 2008-09-18 03:01:28 UTC (rev 3878)
@@ -46,8 +46,7 @@
ScorePosting*
ScorePost_clone(ScorePosting *self)
{
- VTable *vtable = self->_;
- ScorePosting *evil_twin = (ScorePosting*)CREATE(NULL, (*vtable));
+ ScorePosting *evil_twin = (ScorePosting*)VTable_Make_Obj(self->_);
ScorePost_init(evil_twin, self->sim);

if (self->freq) {

Modified: trunk/perl/MANIFEST
===================================================================
--- trunk/perl/MANIFEST 2008-09-18 01:18:15 UTC (rev 3877)
+++ trunk/perl/MANIFEST 2008-09-18 03:01:28 UTC (rev 3878)
@@ -307,6 +307,7 @@
t/308-simple.t
t/309-span.t
t/310-heat_map.t
+t/400-match_posting.t
t/501-termquery.t
t/502-phrasequery.t
t/504-similarity.t

Modified: trunk/perl/lib/KinoSearch/Posting/MatchPosting.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Posting/MatchPosting.pm 2008-09-18 01:18:15 UTC (rev 3877)
+++ trunk/perl/lib/KinoSearch/Posting/MatchPosting.pm 2008-09-18 03:01:28 UTC (rev 3878)
@@ -18,6 +18,7 @@

{ "KinoSearch::Posting::MatchPosting" => {
make_constructors => ["new"],
+ make_getters => [qw( freq )],
# make_pod => {
# synopsis => $synopsis,
# }

Modified: trunk/perl/lib/KinoSearch/Posting/ScorePosting.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Posting/ScorePosting.pm 2008-09-18 01:18:15 UTC (rev 3877)
+++ trunk/perl/lib/KinoSearch/Posting/ScorePosting.pm 2008-09-18 03:01:28 UTC (rev 3878)
@@ -19,7 +19,7 @@

{ "KinoSearch::Posting::ScorePosting" => {
make_constructors => ["new"],
- make_getters => [qw( freq weight )],
+ make_getters => [qw( weight )],
# make_pod => {
# synopsis => $synopsis,
# }

Added: trunk/perl/t/400-match_posting.t
===================================================================
--- trunk/perl/t/400-match_posting.t (rev 0)
+++ trunk/perl/t/400-match_posting.t 2008-09-18 03:01:28 UTC (rev 3878)
@@ -0,0 +1,78 @@
+use strict;
+use warnings;
+use lib 'buildlib';
+
+package MatchSchema::MatchOnly;
+use base qw( KinoSearch::FieldSpec::TextField );
+use KinoSearch::Posting::MatchPosting;
+
+sub posting {
+ if ( @_ == 2 ) {
+ return KinoSearch::Posting::MatchPosting->new( similarity => $_[1] );
+ }
+ else {
+ shift;
+ return KinoSearch::Posting::MatchPosting->new(@_);
+ }
+}
+
+package MatchSchema;
+use base qw( KinoSearch::Schema );
+use KinoSearch::Analysis::Tokenizer;
+
+our %fields = ( content => 'MatchSchema::MatchOnly', );
+
+sub analyzer { KinoSearch::Analysis::Tokenizer->new }
+
+package main;
+
+use KinoSearch::Test::TestUtils qw( get_uscon_docs );
+use KinoSearch::Test::TestSchema;
+use Test::More tests => 6;
+
+my $uscon_docs = get_uscon_docs();
+my $match_invindex = make_index( MatchSchema->new, $uscon_docs );
+my $score_invindex
+ = make_index( KinoSearch::Test::TestSchema->new, $uscon_docs );
+
+my $match_searcher = KinoSearch::Searcher->new( invindex => $match_invindex );
+my $score_searcher = KinoSearch::Searcher->new( invindex => $score_invindex );
+
+for (qw( land of the free )) {
+ my $match_got = hit_nums_array( $match_searcher, $_ );
+ my $score_got = hit_nums_array( $score_searcher, $_ );
+ is_deeply( $match_got, $score_got, "same hits for '$_'" );
+}
+
+my $qstring = '"the legislature"';
+my $should_have_hits = hit_nums_array( $score_searcher, $qstring );
+my $should_be_empty = hit_nums_array( $match_searcher, $qstring );
+ok( scalar @$should_have_hits, "successfully scored phrase $qstring" );
+ok( !scalar @$should_be_empty, "no hits matched for phrase $qstring" );
+
+sub make_index {
+ my ( $schema, $docs ) = @_;
+ my $folder = KinoSearch::Store::RAMFolder->new;
+ my $invindex = KinoSearch::InvIndex->clobber(
+ schema => $schema,
+ folder => $folder,
+ );
+
+ my $invindexer = KinoSearch::InvIndexer->new( invindex => $invindex, );
+ $invindexer->add_doc( { content => $_->{bodytext} } ) for values %$docs;
+ $invindexer->finish;
+ return $invindex;
+}
+
+sub hit_nums_array {
+ my ( $searcher, $query_string ) = @_;
+ my $query = $searcher->glean_query($query_string);
+
+ my $bit_vec = KinoSearch::Util::BitVector->new(
+ capacity => $searcher->max_docs + 1 );
+ my $bit_collector = KinoSearch::Search::HitCollector::BitCollector->new(
+ bit_vector => $bit_vec, );
+ $searcher->collect( query => $query, collector => $bit_collector );
+ return $bit_vec->to_array->to_arrayref;
+}
+


_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits