Mailing List Archive

r3807 - in trunk: c_src/KinoSearch/Index c_src/KinoSearch/Search perl perl/lib/KinoSearch/Index perl/t
Author: creamyg
Date: 2008-08-31 00:04:09 -0700 (Sun, 31 Aug 2008)
New Revision: 3807

Added:
trunk/c_src/KinoSearch/Index/DelEnum.bp
trunk/c_src/KinoSearch/Index/DelEnum.c
trunk/perl/lib/KinoSearch/Index/DelEnum.pm
trunk/perl/t/219-del_enum.t
Modified:
trunk/c_src/KinoSearch/Search/MatchAllScorer.bp
trunk/c_src/KinoSearch/Search/MatchAllScorer.c
trunk/c_src/KinoSearch/Search/NOTScorer.bp
trunk/c_src/KinoSearch/Search/NOTScorer.c
trunk/perl/MANIFEST
trunk/perl/t/525-match_all_query.t
Log:
Add DelEnum, an iterator for deleted doc nums. Integrate it into NOTQuery and
MatchAllQuery, eliminating calls to IxReader_Is_Deleted().


Added: trunk/c_src/KinoSearch/Index/DelEnum.bp
===================================================================
--- trunk/c_src/KinoSearch/Index/DelEnum.bp (rev 0)
+++ trunk/c_src/KinoSearch/Index/DelEnum.bp 2008-08-31 07:04:09 UTC (rev 3807)
@@ -0,0 +1,43 @@
+parcel KinoSearch cnick Kino;
+
+/** Iterator for deleted document numbers.
+ */
+class KinoSearch::Index::DelEnum extends KinoSearch::Obj {
+
+ I32Array *seg_starts;
+ VArray *bit_vectors;
+ BitVector *current;
+ i32_t seg_tick;
+ i32_t doc_num;
+ i32_t num_segs;
+ i32_t current_offset;
+ i32_t next_seg_start;
+ i32_t max_docs;
+
+ public static incremented DelEnum*
+ new(IndexReader *reader);
+
+ public static DelEnum*
+ init(DelEnum *self, IndexReader *reader);
+
+ /* Internal constructor for testing. */
+ public static DelEnum*
+ init2(DelEnum *self, VArray *bit_vectors, I32Array *seg_starts,
+ i32_t max_docs);
+
+ public i32_t
+ Get_Doc_Num(DelEnum *self);
+
+ public i32_t
+ Next(DelEnum *self);
+
+ i32_t
+ Skip_To(DelEnum *self, i32_t target);
+}
+
+/* Copyright 2008 Marvin Humphrey
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * under the same terms as Perl itself.
+ */
+

Added: trunk/c_src/KinoSearch/Index/DelEnum.c
===================================================================
--- trunk/c_src/KinoSearch/Index/DelEnum.c (rev 0)
+++ trunk/c_src/KinoSearch/Index/DelEnum.c 2008-08-31 07:04:09 UTC (rev 3807)
@@ -0,0 +1,117 @@
+#include "KinoSearch/Util/ToolSet.h"
+
+#include "KinoSearch/Index/DelEnum.h"
+#include "KinoSearch/Index/IndexReader.h"
+#include "KinoSearch/Index/SegReader.h"
+#include "KinoSearch/Util/BitVector.h"
+#include "KinoSearch/Util/I32Array.h"
+
+DelEnum*
+DelEnum_new(IndexReader *reader)
+{
+ DelEnum *self = (DelEnum*)CREATE(NULL, DELENUM);
+ return DelEnum_init(self, reader);
+}
+
+DelEnum*
+DelEnum_init(DelEnum *self, IndexReader *reader)
+{
+ u32_t i, max;
+ VArray *seg_readers = IxReader_SegReaders(reader);
+ I32Array *seg_starts = IxReader_Seg_Starts(reader);
+ VArray *bit_vectors = VA_new(seg_readers->size);
+
+ for (i = 0, max = seg_readers->size; i < max; i++) {
+ SegReader *seg_reader = (SegReader*)VA_Fetch(seg_readers, i);
+ BitVector *deldocs = (BitVector*)SegReader_Get_DelDocs(seg_reader);
+ VA_Push(bit_vectors, (Obj*)deldocs);
+ }
+
+ DelEnum_init2(self, bit_vectors, seg_starts, IxReader_Max_Docs(reader));
+
+ REFCOUNT_DEC(seg_readers);
+ REFCOUNT_DEC(seg_starts);
+ REFCOUNT_DEC(bit_vectors);
+ return self;
+}
+
+DelEnum*
+DelEnum_init2(DelEnum *self, VArray *bit_vectors, I32Array *seg_starts,
+ i32_t max_docs)
+{
+ self->current = NULL;
+ self->current_offset = 0;
+ self->next_seg_start = 0;
+ self->doc_num = 0;
+ self->seg_tick = 0;
+ self->max_docs = max_docs;
+ self->num_segs = (i32_t)I32Arr_Get_Size(seg_starts);
+ self->seg_starts = REFCOUNT_INC(seg_starts);
+ self->bit_vectors = REFCOUNT_INC(bit_vectors);
+ return self;
+
+}
+
+void
+DelEnum_destroy(DelEnum *self)
+{
+ REFCOUNT_DEC(self->seg_starts);
+ REFCOUNT_DEC(self->bit_vectors);
+ FREE_OBJ(self);
+}
+
+i32_t
+DelEnum_next(DelEnum *self)
+{
+ return DelEnum_skip_to(self, self->doc_num + 1);
+}
+
+i32_t
+DelEnum_skip_to(DelEnum *self, i32_t target)
+{
+ if (target >= self->next_seg_start) {
+ /* Proceed to next segment or bail. */
+ if (self->seg_tick < self->num_segs) {
+ u32_t next_seg_start = self->seg_tick + 1 == self->num_segs
+ ? self->max_docs + 1
+ : I32Arr_Get(self->seg_starts, self->seg_tick + 1);
+ REFCOUNT_DEC(self->current);
+ self->current = (BitVector*)VA_Fetch(self->bit_vectors,
+ self->seg_tick);
+ self->current_offset = self->next_seg_start;
+ self->next_seg_start = next_seg_start;
+ self->doc_num = next_seg_start - 1;
+ self->seg_tick++;
+ return DelEnum_skip_to(self, target); /* Recurse. */
+ }
+ else {
+ /* We're done. */
+ self->doc_num = self->max_docs + 1;
+ return 0;
+ }
+ }
+ else {
+ i32_t target_minus_offset = target - self->current_offset;
+ i32_t found = BitVec_Next_Set_Bit(self->current, target_minus_offset);
+ if (found == -1) {
+ return DelEnum_skip_to(self, self->next_seg_start); /* Recurse. */
+ }
+ else {
+ self->doc_num = found + self->current_offset;
+ return self->doc_num;
+ }
+ }
+}
+
+i32_t
+DelEnum_get_doc_num(DelEnum *self)
+{
+ return self->doc_num;
+}
+
+/* Copyright 2007-2008 Marvin Humphrey
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * under the same terms as Perl itself.
+ */
+

Modified: trunk/c_src/KinoSearch/Search/MatchAllScorer.bp
===================================================================
--- trunk/c_src/KinoSearch/Search/MatchAllScorer.bp 2008-08-31 06:41:01 UTC (rev 3806)
+++ trunk/c_src/KinoSearch/Search/MatchAllScorer.bp 2008-08-31 07:04:09 UTC (rev 3807)
@@ -2,10 +2,11 @@

class KinoSearch::Search::MatchAllScorer extends KinoSearch::Search::Scorer {

- i32_t doc_num;
- i32_t max_docs;
- IndexReader *reader;
- Tally *tally;
+ i32_t doc_num;
+ i32_t max_docs;
+ i32_t next_deletion;
+ DelEnum *del_enum;
+ Tally *tally;

/**
* @param score Fixed score to be added to each matching document's score.

Modified: trunk/c_src/KinoSearch/Search/MatchAllScorer.c
===================================================================
--- trunk/c_src/KinoSearch/Search/MatchAllScorer.c 2008-08-31 06:41:01 UTC (rev 3806)
+++ trunk/c_src/KinoSearch/Search/MatchAllScorer.c 2008-08-31 07:04:09 UTC (rev 3807)
@@ -3,6 +3,7 @@
#include "KinoSearch/Search/MatchAllScorer.h"
#include "KinoSearch/InvIndex.h"
#include "KinoSearch/Schema.h"
+#include "KinoSearch/Index/DelEnum.h"
#include "KinoSearch/Index/IndexReader.h"
#include "KinoSearch/Search/Similarity.h"
#include "KinoSearch/Search/Tally.h"
@@ -21,17 +22,22 @@
Scorer_init((Scorer*)self, NULL);

/* Init. */
- self->doc_num = 0;
- self->tally = Tally_new();
+ self->doc_num = 0;
+ self->tally = Tally_new();

/* Assign. */
self->tally->score = score;
- self->reader = IxReader_Has_Deletions(reader)
- ? REFCOUNT_INC(reader)
- : NULL;

/* Derive. */
self->max_docs = IxReader_Max_Docs(reader);
+ if (IxReader_Has_Deletions(reader)) {
+ self->next_deletion = 0;
+ self->del_enum = DelEnum_new(reader);
+ }
+ else {
+ self->next_deletion = self->max_docs + 1;
+ self->del_enum = NULL;
+ }

return self;
}
@@ -39,7 +45,7 @@
void
MatchAllScorer_destroy(MatchAllScorer *self)
{
- REFCOUNT_DEC(self->reader);
+ REFCOUNT_DEC(self->del_enum);
REFCOUNT_DEC(self->tally);
Scorer_destroy((Scorer*)self);
}
@@ -52,8 +58,11 @@
self->doc_num--;
return 0;
}
- } while ( self->reader != NULL
- && IxReader_Is_Deleted(self->reader, self->doc_num));
+ if (self->doc_num > self->next_deletion) {
+ self->next_deletion
+ = DelEnum_Skip_To(self->del_enum, self->doc_num);
+ }
+ } while (self->doc_num == self->next_deletion);
return self->doc_num;
}


Modified: trunk/c_src/KinoSearch/Search/NOTScorer.bp
===================================================================
--- trunk/c_src/KinoSearch/Search/NOTScorer.bp 2008-08-31 06:41:01 UTC (rev 3806)
+++ trunk/c_src/KinoSearch/Search/NOTScorer.bp 2008-08-31 07:04:09 UTC (rev 3807)
@@ -5,12 +5,13 @@

class KinoSearch::Search::NOTScorer extends KinoSearch::Search::PolyScorer {

- Scorer *negated_scorer;
- IndexReader *reader;
- Tally *tally;
- i32_t doc_num;
- i32_t max_docs;
- i32_t next_negation;
+ Scorer *negated_scorer;
+ DelEnum *del_enum;
+ Tally *tally;
+ i32_t doc_num;
+ i32_t max_docs;
+ i32_t next_negation;
+ i32_t next_deletion;

static incremented NOTScorer*
new(Scorer* negated_scorer, IndexReader *reader);

Modified: trunk/c_src/KinoSearch/Search/NOTScorer.c
===================================================================
--- trunk/c_src/KinoSearch/Search/NOTScorer.c 2008-08-31 06:41:01 UTC (rev 3806)
+++ trunk/c_src/KinoSearch/Search/NOTScorer.c 2008-08-31 07:04:09 UTC (rev 3807)
@@ -3,6 +3,7 @@
#include "KinoSearch/Search/NOTScorer.h"
#include "KinoSearch/InvIndex.h"
#include "KinoSearch/Schema.h"
+#include "KinoSearch/Index/DelEnum.h"
#include "KinoSearch/Index/IndexReader.h"
#include "KinoSearch/Search/Similarity.h"
#include "KinoSearch/Search/Tally.h"
@@ -25,12 +26,17 @@

/* Assign. */
self->negated_scorer = REFCOUNT_INC(negated_scorer);
- self->reader = IxReader_Has_Deletions(reader)
- ? REFCOUNT_INC(reader)
- : NULL;

/* Derive. */
self->max_docs = IxReader_Max_Docs(reader);
+ if (IxReader_Has_Deletions(reader)) {
+ self->del_enum = DelEnum_new(reader);
+ self->next_deletion = 0;
+ }
+ else {
+ self->del_enum = NULL;
+ self->next_deletion = self->max_docs + 1;
+ }

/* Init. */
self->doc_num = 0;
@@ -44,7 +50,7 @@
NOTScorer_destroy(NOTScorer *self)
{
REFCOUNT_DEC(self->negated_scorer);
- REFCOUNT_DEC(self->reader);
+ REFCOUNT_DEC(self->del_enum);
REFCOUNT_DEC(self->tally);
PolyScorer_destroy((PolyScorer*)self);
}
@@ -71,9 +77,11 @@
return 0;
}
else if (self->doc_num != self->next_negation) {
- if ( self->reader == NULL
- || !IxReader_Is_Deleted(self->reader, self->doc_num)
- ) {
+ if (self->doc_num > self->next_deletion) {
+ self->next_deletion
+ = DelEnum_Skip_To(self->del_enum, self->doc_num);
+ }
+ if (self->doc_num != self->next_deletion) {
/* Success! */
return self->doc_num;
}

Modified: trunk/perl/MANIFEST
===================================================================
--- trunk/perl/MANIFEST 2008-08-31 06:41:01 UTC (rev 3806)
+++ trunk/perl/MANIFEST 2008-08-31 07:04:09 UTC (rev 3807)
@@ -81,6 +81,7 @@
lib/KinoSearch/Highlight/HeatMap.pm
lib/KinoSearch/Highlight/Highlighter.pm
lib/KinoSearch/Index/DelDocs.pm
+lib/KinoSearch/Index/DelEnum.pm
lib/KinoSearch/Index/DocReader.pm
lib/KinoSearch/Index/DocVector.pm
lib/KinoSearch/Index/DocWriter.pm
@@ -338,6 +339,7 @@
t/216-schema.t
t/217-multi_lexicon.t
t/218-del_merging.t
+t/219-del_enum.t
t/302-many_fields.t
t/303-highlighter.t
t/304-verify_utf8.t

Added: trunk/perl/lib/KinoSearch/Index/DelEnum.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Index/DelEnum.pm (rev 0)
+++ trunk/perl/lib/KinoSearch/Index/DelEnum.pm 2008-08-31 07:04:09 UTC (rev 3807)
@@ -0,0 +1,22 @@
+use KinoSearch;
+
+1;
+
+__END__
+
+__AUTO_XS__
+
+{ "KinoSearch::Index::DelEnum" => {
+ bind_methods => [qw( Next Skip_To Get_Doc_Num )],
+ make_constructors => [qw( new new2|init2 )],
+ }
+}
+
+__COPYRIGHT__
+
+Copyright 2008 Marvin Humphrey
+
+This program is free software; you can redistribute it and/or modify
+under the same terms as Perl itself.
+
+

Added: trunk/perl/t/219-del_enum.t
===================================================================
--- trunk/perl/t/219-del_enum.t (rev 0)
+++ trunk/perl/t/219-del_enum.t 2008-08-31 07:04:09 UTC (rev 3807)
@@ -0,0 +1,71 @@
+use strict;
+use warnings;
+
+use Test::More tests => 135;
+use KinoSearch::Index::DelEnum;
+use KinoSearch::Util::I32Array;
+use KinoSearch::Util::BitVector;
+
+sub make_del_enum {
+ my ( $deletions, $seg_starts, $max_docs ) = @_;
+ my $tick = 0;
+ my @bit_vectors;
+ my @check_dels;
+ for ( my $i = 0; $i < @$seg_starts; $i++ ) {
+ my $offset = $seg_starts->[$i];
+ my $max = $seg_starts->[ $i + 1 ] || $max_docs + 1;
+ my $bit_vec
+ = KinoSearch::Util::BitVector->new( capacity => $max - $offset );
+ while ( $tick < @$deletions ) {
+ my $del_num = $deletions->[$tick];
+ last if $del_num >= $max;
+ $tick++;
+ $bit_vec->set( $del_num - $offset );
+ }
+ push @bit_vectors, $bit_vec;
+ }
+ for ( my $i = 0; $i < @$seg_starts; $i++ ) {
+ my $offset = $seg_starts->[$i];
+ my $bit_vec = $bit_vectors[$i];
+ push @check_dels,
+ map { $_ + $offset } @{ $bit_vec->to_array->to_arrayref };
+ }
+ #is_deeply(\@check_dels, $deletions, "test the setup");
+
+ return KinoSearch::Index::DelEnum->new2(
+ bit_vectors => \@bit_vectors,
+ max_docs => $max_docs,
+ seg_starts => KinoSearch::Util::I32Array->new( ints => $seg_starts ),
+ );
+}
+
+for my $max_docs ( 10, 100, 1000 ) {
+ for my $first_del ( 1, 2, 10 ) {
+ for my $del_inc ( 20, 13, 9, 4, 2 ) {
+ my $del_num = $first_del;
+ my @deletions;
+ while ( $del_num <= $max_docs ) {
+ push @deletions, $del_num;
+ $del_num += $del_inc;
+ }
+ for my $seg_inc ( 7, 29, 71 ) {
+ my @seg_starts;
+ my $seg_start = 0;
+ while ( $seg_start <= $max_docs ) {
+ push @seg_starts, $seg_start;
+ $seg_start += $seg_inc;
+ }
+ my $del_enum
+ = make_del_enum( \@deletions, \@seg_starts, $max_docs );
+ my @got;
+ while ( my $del = $del_enum->next ) {
+ push @got, $del;
+ }
+ is_deeply( \@got, \@deletions,
+ "matrix max_docs=$max_docs first_del=$first_del "
+ . "del_inc=$del_inc seg_inc=$seg_inc" );
+ }
+ }
+ }
+}
+

Modified: trunk/perl/t/525-match_all_query.t
===================================================================
--- trunk/perl/t/525-match_all_query.t 2008-08-31 06:41:01 UTC (rev 3806)
+++ trunk/perl/t/525-match_all_query.t 2008-08-31 07:04:09 UTC (rev 3807)
@@ -26,7 +26,7 @@

$searcher = KinoSearch::Searcher->new( invindex => $invindex );
$hits = $searcher->search( query => $match_all_query, num_wanted => 100 );
-is( $hits->total_hits, 25, "match all" );
+is( $hits->total_hits, 25, "match all minus a deletion" );
my @got;
while ( my $hit = $hits->next ) {
push @got, $hit->{content};


_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits