Mailing List Archive

r3723 - in trunk: c_src/KinoSearch c_src/KinoSearch/Analysis c_src/KinoSearch/Index c_src/KinoSearch/Posting devel/benchmarks/indexers perl perl/buildlib perl/lib perl/lib/KinoSearch/Analysis perl/t perl/xs/KinoSearch/Analysis
Author: creamyg
Date: 2008-08-05 10:33:59 -0700 (Tue, 05 Aug 2008)
New Revision: 3723

Added:
trunk/c_src/KinoSearch/Analysis/Inversion.bp
trunk/c_src/KinoSearch/Analysis/Inversion.c
trunk/perl/lib/KinoSearch/Analysis/Inversion.pm
trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm
trunk/perl/t/152-inversion.t
Removed:
trunk/c_src/KinoSearch/Analysis/TokenBatch.bp
trunk/c_src/KinoSearch/Analysis/TokenBatch.c
trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm
trunk/perl/t/152-token_batch.t
Modified:
trunk/c_src/KinoSearch/Analysis/Analyzer.bp
trunk/c_src/KinoSearch/Analysis/Analyzer.c
trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp
trunk/c_src/KinoSearch/Analysis/LCNormalizer.c
trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp
trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.c
trunk/c_src/KinoSearch/Analysis/Stemmer.bp
trunk/c_src/KinoSearch/Analysis/Stemmer.c
trunk/c_src/KinoSearch/Analysis/Stopalizer.bp
trunk/c_src/KinoSearch/Analysis/Stopalizer.c
trunk/c_src/KinoSearch/Analysis/Tokenizer.bp
trunk/c_src/KinoSearch/Analysis/Tokenizer.c
trunk/c_src/KinoSearch/Index/Inverter.bp
trunk/c_src/KinoSearch/Index/Inverter.c
trunk/c_src/KinoSearch/Index/PostingPool.bp
trunk/c_src/KinoSearch/Index/PostingPool.c
trunk/c_src/KinoSearch/Index/PostingsWriter.bp
trunk/c_src/KinoSearch/Index/PostingsWriter.c
trunk/c_src/KinoSearch/Index/TermVectorsWriter.bp
trunk/c_src/KinoSearch/Index/TermVectorsWriter.c
trunk/c_src/KinoSearch/Posting.bp
trunk/c_src/KinoSearch/Posting/RichPosting.bp
trunk/c_src/KinoSearch/Posting/RichPosting.c
trunk/c_src/KinoSearch/Posting/ScorePosting.bp
trunk/c_src/KinoSearch/Posting/ScorePosting.c
trunk/devel/benchmarks/indexers/BenchmarkingIndexer.pm
trunk/perl/MANIFEST
trunk/perl/buildlib/KinoTestUtils.pm
trunk/perl/lib/KinoSearch.pm
trunk/perl/t/150-polyanalyzer.t
trunk/perl/t/153-lc_normalizer.t
trunk/perl/t/154-tokenizer.t
trunk/perl/t/605-store_pos_boost.t
trunk/perl/xs/KinoSearch/Analysis/LCNormalizer.c
trunk/perl/xs/KinoSearch/Analysis/Tokenizer.c
Log:
Rename TokenBatch to Inversion.


Modified: trunk/c_src/KinoSearch/Analysis/Analyzer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Analyzer.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/Analyzer.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -13,19 +13,19 @@
static Analyzer*
init(Analyzer *self);

- /** Take a single L<TokenBatch|KinoSearch::Analysis::TokenBatch> as input
- * and returns a TokenBatch, either the same one (presumably transformed
+ /** Take a single L<Inversion|KinoSearch::Analysis::Inversion> as input
+ * and return an Inversion, either the same one (presumably transformed
* in some way), or a new one.
*/
- public abstract incremented TokenBatch*
- Transform(Analyzer *self, TokenBatch *batch);
+ public abstract incremented Inversion*
+ Transform(Analyzer *self, Inversion *inversion);

- /** Kick off an analysis chain, creating a TokenBatch from string input.
- * The default implementation simply creates an initial TokenBatch with a
+ /** Kick off an analysis chain, creating an Inversion from string input.
+ * The default implementation simply creates an initial Inversion with a
* single Token, then calls Transform(), but occasionally subclasses will
* provide an optimized implementation which minimizes string copies.
*/
- public incremented TokenBatch*
+ public incremented Inversion*
Transform_Text(Analyzer *self, CharBuf *text);

/** Analyze text and return an array of token texts.

Modified: trunk/c_src/KinoSearch/Analysis/Analyzer.c
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Analyzer.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/Analyzer.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -2,7 +2,7 @@

#include "KinoSearch/Analysis/Analyzer.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"

Analyzer*
Analyzer_init(Analyzer *self)
@@ -11,33 +11,33 @@
return self;
}

-TokenBatch*
+Inversion*
Analyzer_transform_text(Analyzer *self, CharBuf *text)
{
size_t token_len = CB_Get_Size(text);
Token *seed = Token_new(text->ptr, token_len, 0, token_len, 1.0, 1);
- TokenBatch *starter_batch = TokenBatch_new(seed);
- TokenBatch *retval = Analyzer_Transform(self, starter_batch);
+ Inversion *starter = Inversion_new(seed);
+ Inversion *retval = Analyzer_Transform(self, starter);
REFCOUNT_DEC(seed);
- REFCOUNT_DEC(starter_batch);
+ REFCOUNT_DEC(starter);
return retval;
}

VArray*
Analyzer_split(Analyzer *self, CharBuf *text)
{
- TokenBatch *batch = Analyzer_Transform_Text(self, text);
- VArray *out = VA_new(0);
+ Inversion *inversion = Analyzer_Transform_Text(self, text);
+ VArray *out = VA_new(0);
Token *token;

- while ((token = Kino_TokenBatch_Next(batch)) != NULL) {
+ while ((token = Inversion_Next(inversion)) != NULL) {
CharBuf *token_text
= CB_new_from_trusted_utf8(token->text, token->len);
VA_Push(out, (Obj*)token_text);
REFCOUNT_DEC(token_text);
}

- REFCOUNT_DEC(batch);
+ REFCOUNT_DEC(inversion);

return out;
}

Copied: trunk/c_src/KinoSearch/Analysis/Inversion.bp (from rev 3722, trunk/c_src/KinoSearch/Analysis/TokenBatch.bp)
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Inversion.bp (rev 0)
+++ trunk/c_src/KinoSearch/Analysis/Inversion.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -0,0 +1,64 @@
+parcel KinoSearch cnick Kino;
+
+/**
+ * A collection of Tokens.
+ *
+ * An Inversion is a collection of Token objects which you can add to, then
+ * iterate over.
+ */
+class KinoSearch::Analysis::Inversion extends KinoSearch::Util::VArray {
+
+ u32_t cur; /* pointer to current token */
+ bool_t inverted; /* inversion has been inverted */
+ u32_t *cluster_counts; /* counts per unique text */
+ u32_t cluster_counts_size; /* num unique texts */
+
+ /**
+ * @param seed An initial Token to start things off, which may be NULL.
+ */
+ static incremented Inversion*
+ new(Token *seed = NULL);
+
+ /** Tack a token onto the end of the Inversion.
+ *
+ * @param token A Token.
+ */
+ void
+ Append(Inversion *self, Token *token);
+
+ /** Return the next token in the Inversion until out of tokens.
+ */
+ Token*
+ Next(Inversion *self);
+
+ /** Reset the Inversion's iterator, so that the next call to next()
+ * returns the first Token in the inversion.
+ */
+ void
+ Reset(Inversion *self);
+
+ /** Assign positions to constituent Tokens, tallying up the position
+ * increments. Sort the tokens first by token text and then by position
+ * ascending.
+ */
+ void
+ Invert(Inversion *self);
+
+ /** Return a pointer to the next group of like Tokens. The number of tokens
+ * in the cluster will be placed into [count].
+ *
+ * @param count The number of tokens in the cluster.
+ */
+ Token**
+ Next_Cluster(Inversion *self, u32_t *count);
+
+ void
+ Destroy(Inversion *self);
+}
+
+/* Copyright 2006-2008 Marvin Humphrey
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the same terms as Perl itself.
+ */
+

Copied: trunk/c_src/KinoSearch/Analysis/Inversion.c (from rev 3712, trunk/c_src/KinoSearch/Analysis/TokenBatch.c)
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Inversion.c (rev 0)
+++ trunk/c_src/KinoSearch/Analysis/Inversion.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -0,0 +1,172 @@
+#include "KinoSearch/Util/ToolSet.h"
+
+#include "KinoSearch/Analysis/Inversion.h"
+#include "KinoSearch/Analysis/Token.h"
+
+/* After inversion, record how many like tokens occur in each group.
+ */
+static void
+count_clusters(Inversion *self);
+
+Inversion*
+Inversion_new(Token *seed_token)
+{
+ Inversion *self = (Inversion*)CREATE(NULL, INVERSION);
+
+ /* Init. */
+ VA_init((VArray*)self, 16);
+ self->cur = 0;
+ self->inverted = false;
+ self->cluster_counts = NULL;
+ self->cluster_counts_size = 0;
+
+ /* Process the seed token. */
+ if (seed_token != NULL)
+ Inversion_append(self, seed_token);
+
+ return self;
+}
+
+void
+Inversion_destroy(Inversion *self)
+{
+ free(self->cluster_counts);
+ VA_destroy((VArray*)self);
+}
+
+Token*
+Inversion_next(Inversion *self)
+{
+ /* Kill the iteration if we're out of tokens. */
+ if (self->cur == self->size)
+ return NULL;
+ return (Token*)self->elems[ self->cur++ ];
+}
+
+void
+Inversion_reset(Inversion *self)
+{
+ self->cur = 0;
+}
+
+void
+Inversion_append(Inversion *self, Token *token)
+{
+ /* Safety check. */
+ if (self->inverted)
+ CONFESS("Can't append tokens after inversion");
+
+ /* Minimize reallocations. */
+ if (self->size >= self->cap) {
+ if (self->cap < 100) {
+ VA_Grow(self, 100);
+ }
+ else if (self->size < 10000) {
+ VA_Grow(self, self->cap * 2);
+ }
+ else {
+ VA_Grow(self, self->cap + 10000);
+ }
+ }
+
+ /* Inlined VA_Push. */
+ self->elems[ self->size ] = (Obj*)REFCOUNT_INC(token);
+ self->size++;
+}
+
+Token**
+Inversion_next_cluster(Inversion *self, u32_t *count)
+{
+ Token **cluster = (Token**)(self->elems + self->cur);
+
+ if (self->cur == self->size) {
+ *count = 0;
+ return NULL;
+ }
+
+ /* Don't read past the end of the cluster counts array. */
+ if (!self->inverted)
+ CONFESS("Inversion not yet inverted");
+ if (self->cur > self->cluster_counts_size)
+ CONFESS("Tokens were added after inversion");
+
+ /* Place cluster count in passed-in var, advance bookmark. */
+ *count = self->cluster_counts[ self->cur ];
+ self->cur += *count;
+
+ return cluster;
+}
+
+void
+Inversion_invert(Inversion *self)
+{
+ Token **tokens = (Token**)self->elems;
+ Token **limit = tokens + self->size;
+ i32_t token_pos = 0;
+
+ /* Thwart future attempts to append. */
+ if (self->inverted)
+ CONFESS("Inversion has already been inverted");
+ self->inverted = true;
+
+ /* Assign token positions. */
+ for ( ;tokens < limit; tokens++) {
+ Token *const cur_token = *tokens;
+ cur_token->pos = token_pos;
+ token_pos += cur_token->pos_inc;
+ if (token_pos < cur_token->pos) {
+ CONFESS("Token positions out of order: %i32 %i32",
+ cur_token->pos, token_pos);
+ }
+ }
+
+ /* Sort the tokens lexically, and hand off to cluster counting routine. */
+ qsort(self->elems, self->size, sizeof(Token*), Token_compare);
+ count_clusters(self);
+}
+
+static void
+count_clusters(Inversion *self)
+{
+ Token **tokens = (Token**)self->elems;
+ u32_t *counts = CALLOCATE(self->size + 1, u32_t);
+ u32_t i;
+
+ /* Save the cluster counts. */
+ self->cluster_counts_size = self->size;
+ self->cluster_counts = counts;
+
+ for (i = 0; i < self->size; ) {
+ Token *const base_token = tokens[i];
+ char *const base_text = base_token->text;
+ const size_t base_len = base_token->len;
+ u32_t j = i + 1;
+
+ /* Iterate through tokens until text doesn't match. */
+ while (j < self->size) {
+ Token *const candidate = tokens[j];
+
+ if ( (candidate->len == base_len)
+ && (memcmp(candidate->text, base_text, base_len) == 0)
+ ) {
+ j++;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* Record a count at the position of the first token in the cluster. */
+ counts[i] = j - i;
+
+ /* Start the next loop at the next token we haven't seen. */
+ i = j;
+ }
+}
+
+/* Copyright 2006-2008 Marvin Humphrey
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the same terms as Perl itself.
+ */
+

Modified: trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -11,10 +11,10 @@
static LCNormalizer*
init(LCNormalizer *self);

- public incremented TokenBatch*
- Transform(LCNormalizer *self, TokenBatch *batch);
+ public incremented Inversion*
+ Transform(LCNormalizer *self, Inversion *inversion);

- public incremented TokenBatch*
+ public incremented Inversion*
Transform_Text(LCNormalizer *self, CharBuf *text);

void

Modified: trunk/c_src/KinoSearch/Analysis/LCNormalizer.c
===================================================================
--- trunk/c_src/KinoSearch/Analysis/LCNormalizer.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/LCNormalizer.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -2,7 +2,7 @@

#include "KinoSearch/Analysis/LCNormalizer.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Util/ByteBuf.h"
#include "KinoSearch/Util/Native.h"


Modified: trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -12,10 +12,10 @@
init(PolyAnalyzer *self, const CharBuf *language = NULL,
VArray *analyzers = NULL);

- public incremented TokenBatch*
- Transform(PolyAnalyzer *self, TokenBatch *batch);
+ public incremented Inversion*
+ Transform(PolyAnalyzer *self, Inversion *inversion);

- public incremented TokenBatch*
+ public incremented Inversion*
Transform_Text(PolyAnalyzer *self, CharBuf *text);

void

Modified: trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.c
===================================================================
--- trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -2,7 +2,7 @@

#include "KinoSearch/Analysis/PolyAnalyzer.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Analysis/LCNormalizer.h"
#include "KinoSearch/Analysis/Stemmer.h"
#include "KinoSearch/Analysis/Tokenizer.h"
@@ -41,34 +41,34 @@
return self;
}

-TokenBatch*
-PolyAnalyzer_transform(PolyAnalyzer *self, TokenBatch *batch)
+Inversion*
+PolyAnalyzer_transform(PolyAnalyzer *self, Inversion *inversion)
{
VArray *const analyzers = self->analyzers;
u32_t i;
- REFCOUNT_INC(batch);
+ REFCOUNT_INC(inversion);

/* Iterate through each of the analyzers in order. */
for (i = 0; i < analyzers->size; i++) {
Analyzer *analyzer = (Analyzer*)VA_Fetch(analyzers, i);
- TokenBatch *new_batch = Analyzer_Transform(analyzer, batch);
- REFCOUNT_DEC(batch);
- batch = new_batch;
+ Inversion *new_inversion = Analyzer_Transform(analyzer, inversion);
+ REFCOUNT_DEC(inversion);
+ inversion = new_inversion;
}

- return batch;
+ return inversion;
}

-TokenBatch*
+Inversion*
PolyAnalyzer_transform_text(PolyAnalyzer *self, CharBuf *text)
{
VArray *const analyzers = self->analyzers;
- TokenBatch *retval;
+ Inversion *retval;

if (analyzers->size == 0) {
size_t token_len = CB_Get_Size(text);
Token *seed = Token_new(text->ptr, token_len, 0, token_len, 1.0f, 1);
- retval = TokenBatch_new(seed);
+ retval = Inversion_new(seed);
REFCOUNT_DEC(seed);
}
else {
@@ -77,9 +77,9 @@
retval = Analyzer_Transform_Text(first_analyzer, text);
for (i = 1; i < analyzers->size; i++) {
Analyzer *analyzer = (Analyzer*)VA_Fetch(analyzers, i);
- TokenBatch *new_batch = Analyzer_Transform(analyzer, retval);
+ Inversion *new_inversion = Analyzer_Transform(analyzer, retval);
REFCOUNT_DEC(retval);
- retval = new_batch;
+ retval = new_inversion;
}
}


Modified: trunk/c_src/KinoSearch/Analysis/Stemmer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Stemmer.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/Stemmer.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -10,8 +10,8 @@
static Stemmer*
init(Stemmer *self, const CharBuf *language);

- public incremented TokenBatch*
- Transform(Stemmer *self, TokenBatch *batch);
+ public incremented Inversion*
+ Transform(Stemmer *self, Inversion *inversion);

void
Destroy(Stemmer *self);

Modified: trunk/c_src/KinoSearch/Analysis/Stemmer.c
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Stemmer.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/Stemmer.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -3,7 +3,7 @@

#include "KinoSearch/Analysis/Stemmer.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Util/Native.h"

/* Declare Snowball interface -- since it's so simple we don't need to include
@@ -46,15 +46,15 @@
return self;
}

-TokenBatch*
-Stemmer_transform(Stemmer *self, TokenBatch *batch)
+Inversion*
+Stemmer_transform(Stemmer *self, Inversion *inversion)
{
Token *token;
struct sb_stemmer *const snowstemmer = self->snowstemmer;

- while (NULL != (token = TokenBatch_Next(batch))) {
- sb_symbol *stemmed_text
- = sb_stemmer_stem(snowstemmer, (sb_symbol*)token->text, token->len);
+ while (NULL != (token = Inversion_Next(inversion))) {
+ sb_symbol *stemmed_text = sb_stemmer_stem(snowstemmer,
+ (sb_symbol*)token->text, token->len);
size_t len = sb_stemmer_length(snowstemmer);
if (len > token->len) {
free(token->text);
@@ -63,8 +63,8 @@
memcpy(stemmed_text, token->text, len + 1);
token->len = len;
}
- TokenBatch_Reset(batch);
- return REFCOUNT_INC(batch);
+ Inversion_Reset(inversion);
+ return REFCOUNT_INC(inversion);
}

void

Modified: trunk/c_src/KinoSearch/Analysis/Stopalizer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Stopalizer.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/Stopalizer.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -11,8 +11,8 @@
init(Stopalizer *self, const CharBuf *language = NULL,
Hash *stoplist = NULL);

- public incremented TokenBatch*
- Transform(Stopalizer *self, TokenBatch *batch);
+ public incremented Inversion*
+ Transform(Stopalizer *self, Inversion *inversion);

void
Destroy(Stopalizer *self);

Modified: trunk/c_src/KinoSearch/Analysis/Stopalizer.c
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Stopalizer.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/Stopalizer.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -2,7 +2,7 @@

#include "KinoSearch/Analysis/Stopalizer.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Util/Native.h"

Stopalizer*
@@ -33,20 +33,20 @@
return self;
}

-TokenBatch*
-Stopalizer_transform(Stopalizer *self, TokenBatch *batch)
+Inversion*
+Stopalizer_transform(Stopalizer *self, Inversion *inversion)
{
Token *token;
- TokenBatch *new_batch = TokenBatch_new(NULL);
+ Inversion *new_inversion = Inversion_new(NULL);
Hash *const stoplist = self->stoplist;

- while (NULL != (token = TokenBatch_Next(batch))) {
+ while (NULL != (token = Inversion_Next(inversion))) {
if (!Hash_Fetch_Str(stoplist, token->text, token->len)) {
- Kino_TokenBatch_Append(new_batch, token);
+ Inversion_Append(new_inversion, token);
}
}

- return new_batch;
+ return new_inversion;
}

void

Deleted: trunk/c_src/KinoSearch/Analysis/TokenBatch.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/TokenBatch.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/TokenBatch.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -1,64 +0,0 @@
-parcel KinoSearch cnick Kino;
-
-/**
- * A collection of Tokens.
- *
- * A TokenBatch is a collection of Token objects which you can add to, then
- * iterate over.
- */
-class KinoSearch::Analysis::TokenBatch extends KinoSearch::Util::VArray {
-
- u32_t cur; /* pointer to current token */
- bool_t inverted; /* batch has been inverted */
- u32_t *cluster_counts; /* counts per unique text */
- u32_t cluster_counts_size; /* num unique texts */
-
- /**
- * @param seed An initial Token to start things off, which may be NULL.
- */
- static incremented TokenBatch*
- new(Token *seed = NULL);
-
- /** Tack a token onto the end of the batch
- *
- * @param token A Token.
- */
- void
- Append(TokenBatch *self, Token *token);
-
- /** Return the next token in the TokenBatch until out of tokens.
- */
- Token*
- Next(TokenBatch *self);
-
- /** Reset the TokenBatch's iterator, so that the next call to next()
- * returns the first Token in the batch.
- */
- void
- Reset(TokenBatch *self);
-
- /** Assign positions to constituent Tokens, tallying up the position
- * increments. Sort the tokens first by token text and then by position
- * ascending.
- */
- void
- Invert(TokenBatch *self);
-
- /** Return a pointer to the next group of like Tokens. The number of tokens
- * in the cluster will be placed into [count].
- *
- * @param count The number of tokens in the cluster.
- */
- Token**
- Next_Cluster(TokenBatch *self, u32_t *count);
-
- void
- Destroy(TokenBatch *self);
-}
-
-/* Copyright 2006-2008 Marvin Humphrey
- *
- * This program is free software; you can redistribute it and/or modify
- * under the same terms as Perl itself.
- */
-

Deleted: trunk/c_src/KinoSearch/Analysis/TokenBatch.c
===================================================================
--- trunk/c_src/KinoSearch/Analysis/TokenBatch.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/TokenBatch.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -1,172 +0,0 @@
-#include "KinoSearch/Util/ToolSet.h"
-
-#include "KinoSearch/Analysis/TokenBatch.h"
-#include "KinoSearch/Analysis/Token.h"
-
-/* After inversion, record how many like tokens occur in each group.
- */
-static void
-count_clusters(TokenBatch *self);
-
-TokenBatch*
-TokenBatch_new(Token *seed_token)
-{
- TokenBatch *self = (TokenBatch*)CREATE(NULL, TOKENBATCH);
-
- /* Init. */
- VA_init((VArray*)self, 16);
- self->cur = 0;
- self->inverted = false;
- self->cluster_counts = NULL;
- self->cluster_counts_size = 0;
-
- /* Process the seed token. */
- if (seed_token != NULL)
- TokenBatch_append(self, seed_token);
-
- return self;
-}
-
-void
-TokenBatch_destroy(TokenBatch *self)
-{
- free(self->cluster_counts);
- VA_destroy((VArray*)self);
-}
-
-Token*
-TokenBatch_next(TokenBatch *self)
-{
- /* Kill the iteration if we're out of tokens. */
- if (self->cur == self->size)
- return NULL;
- return (Token*)self->elems[ self->cur++ ];
-}
-
-void
-TokenBatch_reset(TokenBatch *self)
-{
- self->cur = 0;
-}
-
-void
-TokenBatch_append(TokenBatch *self, Token *token)
-{
- /* Safety check. */
- if (self->inverted)
- CONFESS("Can't append tokens after inversion");
-
- /* Minimize reallocations. */
- if (self->size >= self->cap) {
- if (self->cap < 100) {
- VA_Grow(self, 100);
- }
- else if (self->size < 10000) {
- VA_Grow(self, self->cap * 2);
- }
- else {
- VA_Grow(self, self->cap + 10000);
- }
- }
-
- /* Inlined VA_Push. */
- self->elems[ self->size ] = (Obj*)REFCOUNT_INC(token);
- self->size++;
-}
-
-Token**
-TokenBatch_next_cluster(TokenBatch *self, u32_t *count)
-{
- Token **cluster = (Token**)(self->elems + self->cur);
-
- if (self->cur == self->size) {
- *count = 0;
- return NULL;
- }
-
- /* Don't read past the end of the cluster counts array. */
- if (!self->inverted)
- CONFESS("TokenBatch not yet inverted");
- if (self->cur > self->cluster_counts_size)
- CONFESS("Tokens were added after inversion");
-
- /* Place cluster count in passed-in var, advance bookmark. */
- *count = self->cluster_counts[ self->cur ];
- self->cur += *count;
-
- return cluster;
-}
-
-void
-TokenBatch_invert(TokenBatch *self)
-{
- Token **tokens = (Token**)self->elems;
- Token **limit = tokens + self->size;
- i32_t token_pos = 0;
-
- /* Thwart future attempts to append. */
- if (self->inverted)
- CONFESS("TokenBatch has already been inverted");
- self->inverted = true;
-
- /* Assign token positions. */
- for ( ;tokens < limit; tokens++) {
- Token *const cur_token = *tokens;
- cur_token->pos = token_pos;
- token_pos += cur_token->pos_inc;
- if (token_pos < cur_token->pos) {
- CONFESS("Token positions out of order: %i32 %i32",
- cur_token->pos, token_pos);
- }
- }
-
- /* Sort the tokens lexically, and hand off to cluster counting routine. */
- qsort(self->elems, self->size, sizeof(Token*), Token_compare);
- count_clusters(self);
-}
-
-static void
-count_clusters(TokenBatch *self)
-{
- Token **tokens = (Token**)self->elems;
- u32_t *counts = CALLOCATE(self->size + 1, u32_t);
- u32_t i;
-
- /* Save the cluster counts. */
- self->cluster_counts_size = self->size;
- self->cluster_counts = counts;
-
- for (i = 0; i < self->size; ) {
- Token *const base_token = tokens[i];
- char *const base_text = base_token->text;
- const size_t base_len = base_token->len;
- u32_t j = i + 1;
-
- /* Iterate through tokens until text doesn't match. */
- while (j < self->size) {
- Token *const candidate = tokens[j];
-
- if ( (candidate->len == base_len)
- && (memcmp(candidate->text, base_text, base_len) == 0)
- ) {
- j++;
- }
- else {
- break;
- }
- }
-
- /* Record a count at the position of the first token in the cluster. */
- counts[i] = j - i;
-
- /* Start the next loop at the next token we haven't seen. */
- i = j;
- }
-}
-
-/* Copyright 2006-2008 Marvin Humphrey
- *
- * This program is free software; you can redistribute it and/or modify
- * under the same terms as Perl itself.
- */
-

Modified: trunk/c_src/KinoSearch/Analysis/Tokenizer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Tokenizer.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/Tokenizer.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -11,18 +11,18 @@
static Tokenizer*
init(Tokenizer *self);

- public incremented TokenBatch*
- Transform(Tokenizer *self, TokenBatch *batch);
+ public incremented Inversion*
+ Transform(Tokenizer *self, Inversion *inversion);

- public incremented TokenBatch*
+ public incremented Inversion*
Transform_Text(Tokenizer *self, CharBuf *text);

/** Tokenize the supplied string and add any Tokens generated to the
- * supplied batch.
+ * supplied Inversion.
*/
void
Tokenize_Str(Tokenizer *self, const char *text, size_t len,
- TokenBatch *batch);
+ Inversion *inversion);

void
Destroy(Tokenizer *self);

Modified: trunk/c_src/KinoSearch/Analysis/Tokenizer.c
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Tokenizer.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Analysis/Tokenizer.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -2,7 +2,7 @@

#include "KinoSearch/Analysis/Tokenizer.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Util/Native.h"

Tokenizer*
@@ -21,26 +21,26 @@
return self;
}

-TokenBatch*
-Tokenizer_transform(Tokenizer *self, TokenBatch *batch)
+Inversion*
+Tokenizer_transform(Tokenizer *self, Inversion *inversion)
{
- TokenBatch *new_batch = TokenBatch_new(NULL);
+ Inversion *new_inversion = Inversion_new(NULL);
Token *token;

- while (NULL != (token = TokenBatch_Next(batch))) {
- Tokenizer_Tokenize_Str(self, token->text, token->len, new_batch);
+ while (NULL != (token = Inversion_Next(inversion))) {
+ Tokenizer_Tokenize_Str(self, token->text, token->len, new_inversion);
}

- return new_batch;
+ return new_inversion;
}

-TokenBatch*
+Inversion*
Tokenizer_transform_text(Tokenizer *self, CharBuf *text)
{
- TokenBatch *new_batch = TokenBatch_new(NULL);
+ Inversion *new_inversion = Inversion_new(NULL);
Tokenizer_Tokenize_Str(self, (char*)CB_Get_Ptr8(text), CB_Get_Size(text),
- new_batch);
- return new_batch;
+ new_inversion);
+ return new_inversion;
}

void

Modified: trunk/c_src/KinoSearch/Index/Inverter.bp
===================================================================
--- trunk/c_src/KinoSearch/Index/Inverter.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Index/Inverter.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -9,7 +9,7 @@
Doc *doc;
ViewCharBuf **fields;
ViewCharBuf **values;
- TokenBatch **inversions;
+ Inversion **inversions;
FieldSpec **specs;
Analyzer **analyzers;
Similarity **sims;
@@ -88,10 +88,10 @@
Similarity*
Get_Similarity(Inverter *self);

- /** Return the Inverted TokenBatch for the current field, provided that
- * that field is indexed; return NULL if the iterator is exhausted.
+ /** Return the Inversion for the current field, provided that that field
+ * is indexed; return NULL if the iterator is exhausted.
*/
- TokenBatch*
+ Inversion*
Get_Inversion(Inverter *self);

void

Modified: trunk/c_src/KinoSearch/Index/Inverter.c
===================================================================
--- trunk/c_src/KinoSearch/Index/Inverter.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Index/Inverter.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -3,7 +3,7 @@
#include "KinoSearch/Index/Inverter.h"
#include "KinoSearch/Analysis/Analyzer.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Doc.h"
#include "KinoSearch/FieldSpec.h"
#include "KinoSearch/Schema.h"
@@ -29,7 +29,7 @@
self->specs = CALLOCATE(1, FieldSpec*);
self->analyzers = CALLOCATE(1, Analyzer*);
self->sims = CALLOCATE(1, Similarity*);
- self->inversions = CALLOCATE(1, TokenBatch*);
+ self->inversions = CALLOCATE(1, Inversion*);
self->fields[0] = ViewCB_new_from_trusted_utf8(NULL, 0);
self->values[0] = ViewCB_new_from_trusted_utf8(NULL, 0);

@@ -95,9 +95,9 @@
Inverter_get_similarity(Inverter *self)
{ return (Similarity*)maybe_get_obj(self, (Obj**)self->sims); }

-TokenBatch*
+Inversion*
Inverter_get_inversion(Inverter *self)
-{ return (TokenBatch*)maybe_get_obj(self, (Obj**)self->inversions); }
+{ return (Inversion*)maybe_get_obj(self, (Obj**)self->inversions); }

void
Inverter_add_field(Inverter *self, FieldSpec *fspec,
@@ -130,19 +130,19 @@

if (!fspec) CONFESS("Unknown field: '%s'", field_name);

- /* Get a TokenBatch, going through analyzer if appropriate. */
+ /* Get an Inversion, going through analyzer if appropriate. */
if (fspec->analyzed) {
self->inversions[self->num_fields]
= Analyzer_Transform_Text(analyzer, (CharBuf*)value_vcb);
- TokenBatch_Invert(self->inversions[self->num_fields]);
+ Inversion_Invert(self->inversions[self->num_fields]);
}
else if (fspec->indexed || fspec->vectorized) {
size_t token_len = ViewCB_Get_Size(value_vcb);
Token *seed = Token_new((char*)ViewCB_Get_Ptr8(value_vcb), token_len,
0, token_len, 1.0f, 1);
- self->inversions[self->num_fields] = TokenBatch_new(seed);
+ self->inversions[self->num_fields] = Inversion_new(seed);
REFCOUNT_DEC(seed);
- TokenBatch_Invert(self->inversions[self->num_fields]);
+ Inversion_Invert(self->inversions[self->num_fields]);
}
else {
self->inversions[self->num_fields] = NULL;
@@ -161,7 +161,7 @@
self->specs = REALLOCATE(self->specs, max_fields, FieldSpec*);
self->sims = REALLOCATE(self->sims, max_fields, Similarity*);
self->analyzers = REALLOCATE(self->analyzers, max_fields, Analyzer*);
- self->inversions = REALLOCATE(self->inversions, max_fields, TokenBatch*);
+ self->inversions = REALLOCATE(self->inversions, max_fields, Inversion*);

for ( ; self->max_fields < max_fields; self->max_fields++) {
self->fields[self->max_fields]

Modified: trunk/c_src/KinoSearch/Index/PostingPool.bp
===================================================================
--- trunk/c_src/KinoSearch/Index/PostingPool.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Index/PostingPool.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -41,11 +41,11 @@
void
Add_Elem(PostingPool *self, Obj *elem);

- /* Add a field's content, in the form of an inverted TokenBatch.
+ /* Add a field's inverted content.
*/
void
- Add_Batch(PostingPool *self, TokenBatch *batch, i32_t doc_num,
- float doc_boost, float length_norm);
+ Add_Inversion(PostingPool *self, Inversion *inversion, i32_t doc_num,
+ float doc_boost, float length_norm);

/* Dedicated this PostingPool to read back from existing segment content.
*/

Modified: trunk/c_src/KinoSearch/Index/PostingPool.c
===================================================================
--- trunk/c_src/KinoSearch/Index/PostingPool.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Index/PostingPool.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -1,7 +1,7 @@
#include "KinoSearch/Util/ToolSet.h"

#include "KinoSearch/Index/PostingPool.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Posting.h"
#include "KinoSearch/Posting/RawPosting.h"
#include "KinoSearch/Schema.h"
@@ -104,11 +104,11 @@
}

void
-PostPool_add_batch(PostingPool *self, TokenBatch *batch,
+PostPool_add_inversion(PostingPool *self, Inversion *inversion,
i32_t doc_num, float doc_boost,
float length_norm)
{
- Post_Add_Batch_To_Pool(self->posting, self, batch, self->fspec,
+ Post_Add_Inversion_To_Pool(self->posting, self, inversion, self->fspec,
doc_num, doc_boost, length_norm);
}


Modified: trunk/c_src/KinoSearch/Index/PostingsWriter.bp
===================================================================
--- trunk/c_src/KinoSearch/Index/PostingsWriter.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Index/PostingsWriter.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -33,12 +33,12 @@
void
Add(PostingsWriter *self, Inverter *inverter, u32_t doc_num);

- /* Add a field's content, in the form of an inverted TokenBatch.
+ /* Add a field's inverted content.
*/
void
- Add_Batch(PostingsWriter *self, TokenBatch *token_batch,
- const CharBuf *field_name, i32_t doc_num, float doc_boost,
- float length_norm);
+ Add_Inversion(PostingsWriter *self, Inversion *inversion,
+ const CharBuf *field_name, i32_t doc_num, float doc_boost,
+ float length_norm);

void
Add_Segment(PostingsWriter *self, SegReader *reader,

Modified: trunk/c_src/KinoSearch/Index/PostingsWriter.c
===================================================================
--- trunk/c_src/KinoSearch/Index/PostingsWriter.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Index/PostingsWriter.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -7,7 +7,7 @@
#include "KinoSearch/Posting/RawPosting.h"
#include "KinoSearch/Schema.h"
#include "KinoSearch/InvIndex.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Index/IndexFileNames.h"
#include "KinoSearch/Index/Inverter.h"
#include "KinoSearch/Index/PostingPool.h"
@@ -135,16 +135,16 @@
if (fspec->indexed) {
ViewCharBuf *field_name = Inverter_Get_Field_Name(inverter);
Similarity *sim = Inverter_Get_Similarity(inverter);
- TokenBatch *inversion = Inverter_Get_Inversion(inverter);
+ Inversion *inversion = Inverter_Get_Inversion(inverter);
float length_norm = Sim_Length_Norm(sim, inversion->size);
- PostWriter_add_batch(self, inversion, (CharBuf*)field_name, doc_num,
+ PostWriter_add_inversion(self, inversion, (CharBuf*)field_name, doc_num,
doc_boost, length_norm);
}
}
}

void
-PostWriter_add_batch(PostingsWriter *self, TokenBatch *batch,
+PostWriter_add_inversion(PostingsWriter *self, Inversion *inversion,
const CharBuf *field_name, i32_t doc_num,
float doc_boost, float length_norm)
{
@@ -160,8 +160,9 @@
}
post_pool = (PostingPool*)VA_Fetch(field_post_pools, 0);

- /* Add the TokenBatch to the PostingPool. */
- PostPool_Add_Batch(post_pool, batch, doc_num, doc_boost, length_norm);
+ /* Add the Inversion to the PostingPool. */
+ PostPool_Add_Inversion(post_pool, inversion, doc_num, doc_boost,
+ length_norm);

/* Check if we've crossed the memory threshold and it's time to flush. */
if (self->mem_pool->consumed > self->mem_thresh)

Modified: trunk/c_src/KinoSearch/Index/TermVectorsWriter.bp
===================================================================
--- trunk/c_src/KinoSearch/Index/TermVectorsWriter.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Index/TermVectorsWriter.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -18,7 +18,7 @@
init(TermVectorsWriter *self, InvIndex *invindex, SegInfo *seg_info);

incremented ByteBuf*
- TV_Buf(TermVectorsWriter *self, TokenBatch *batch);
+ TV_Buf(TermVectorsWriter *self, Inversion *inversion);

void
Add(TermVectorsWriter *self, Inverter *inverter, u32_t doc_num);

Modified: trunk/c_src/KinoSearch/Index/TermVectorsWriter.c
===================================================================
--- trunk/c_src/KinoSearch/Index/TermVectorsWriter.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Index/TermVectorsWriter.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -4,7 +4,7 @@

#include "KinoSearch/Index/TermVectorsWriter.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/FieldSpec.h"
#include "KinoSearch/InvIndex.h"
#include "KinoSearch/Index/IndexFileNames.h"
@@ -73,7 +73,7 @@
FieldSpec *fspec = Inverter_Get_FSpec(inverter);
if (fspec->vectorized) {
ViewCharBuf *field_name = Inverter_Get_Field_Name(inverter);
- TokenBatch *inversion = Inverter_Get_Inversion(inverter);
+ Inversion *inversion = Inverter_Get_Inversion(inverter);
ByteBuf *tv_buf = TVWriter_TV_Buf(self, inversion);
CB_Serialize(field_name, tv_out);
BB_Serialize(tv_buf, tv_out);
@@ -87,11 +87,11 @@
}

ByteBuf*
-TVWriter_tv_buf(TermVectorsWriter *self, TokenBatch *batch)
+TVWriter_tv_buf(TermVectorsWriter *self, Inversion *inversion)
{
char *last_text = "";
size_t last_len = 0;
- ByteBuf *tv_buf = BB_new(20 + batch->size * 8); /* generous */
+ ByteBuf *tv_buf = BB_new(20 + inversion->size * 8); /* generous */
u32_t num_postings = 0;
char *dest;
Token **tokens;
@@ -101,8 +101,8 @@
/* Leave space for a c32 indicating the number of postings. */
tv_buf->len = C32_MAX_BYTES;

- TokenBatch_Reset(batch);
- while ( (tokens = TokenBatch_Next_Cluster(batch, &freq)) != NULL ) {
+ Inversion_Reset(inversion);
+ while ( (tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL ) {
Token *token = *tokens;
i32_t overlap = StrHelp_string_diff(last_text, token->text,
last_len, token->len);

Modified: trunk/c_src/KinoSearch/Posting/RichPosting.bp
===================================================================
--- trunk/c_src/KinoSearch/Posting/RichPosting.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Posting/RichPosting.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -36,9 +36,10 @@
CharBuf *term_text, MemoryPool *mem_pool);

void
- Add_Batch_To_Pool(RichPosting *self, PostingPool *post_pool,
- TokenBatch *batch, FieldSpec *fspec,
- u32_t doc_num, float doc_boost, float length_norm);
+ Add_Inversion_To_Pool(RichPosting *self, PostingPool *post_pool,
+ Inversion *inversion, FieldSpec *fspec,
+ u32_t doc_num, float doc_boost,
+ float length_norm);

public incremented RichPostingScorer*
Make_Scorer(RichPosting *self, Similarity *sim, PostingList *plist,

Modified: trunk/c_src/KinoSearch/Posting/RichPosting.c
===================================================================
--- trunk/c_src/KinoSearch/Posting/RichPosting.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Posting/RichPosting.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -3,7 +3,7 @@

#include "KinoSearch/Posting/RichPosting.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Index/PostingList.h"
#include "KinoSearch/Index/PostingPool.h"
#include "KinoSearch/Posting/RawPosting.h"
@@ -111,9 +111,10 @@
}

void
-RichPost_add_batch_to_pool(RichPosting *self, PostingPool *post_pool,
- TokenBatch *batch, FieldSpec *fspec,
- u32_t doc_num, float doc_boost, float length_norm)
+RichPost_add_inversion_to_pool(RichPosting *self, PostingPool *post_pool,
+ Inversion *inversion, FieldSpec *fspec,
+ u32_t doc_num, float doc_boost,
+ float length_norm)
{
MemoryPool *mem_pool = post_pool->mem_pool;
Similarity *sim = self->sim;
@@ -121,8 +122,8 @@
Token **tokens;
u32_t freq;

- TokenBatch_Reset(batch);
- while ( (tokens = TokenBatch_Next_Cluster(batch, &freq)) != NULL ) {
+ Inversion_Reset(inversion);
+ while ( (tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL ) {
Token *token = *tokens;
u32_t raw_post_bytes = MAX_RAW_POSTING_LEN(token->len, freq);
RawPosting *raw_posting = RawPost_new(

Modified: trunk/c_src/KinoSearch/Posting/ScorePosting.bp
===================================================================
--- trunk/c_src/KinoSearch/Posting/ScorePosting.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Posting/ScorePosting.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -34,9 +34,10 @@
CharBuf *term_text, MemoryPool *mem_pool);

void
- Add_Batch_To_Pool(ScorePosting *self, PostingPool *post_pool,
- TokenBatch *batch, FieldSpec *fspec,
- u32_t doc_num, float doc_boost, float length_norm);
+ Add_Inversion_To_Pool(ScorePosting *self, PostingPool *post_pool,
+ Inversion *inversion, FieldSpec *fspec,
+ u32_t doc_num, float doc_boost,
+ float length_norm);

void
Reset(ScorePosting *self, u32_t doc_num);

Modified: trunk/c_src/KinoSearch/Posting/ScorePosting.c
===================================================================
--- trunk/c_src/KinoSearch/Posting/ScorePosting.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Posting/ScorePosting.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -3,7 +3,7 @@

#include "KinoSearch/Posting/ScorePosting.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Index/PostingList.h"
#include "KinoSearch/Index/PostingPool.h"
#include "KinoSearch/Posting/RawPosting.h"
@@ -70,9 +70,10 @@
}

void
-ScorePost_add_batch_to_pool(ScorePosting *self, PostingPool *post_pool,
- TokenBatch *batch, FieldSpec *fspec,
- u32_t doc_num, float doc_boost, float length_norm)
+ScorePost_add_inversion_to_pool(ScorePosting *self, PostingPool *post_pool,
+ Inversion *inversion, FieldSpec *fspec,
+ u32_t doc_num, float doc_boost,
+ float length_norm)
{
MemoryPool *mem_pool = post_pool->mem_pool;
Similarity *sim = self->sim;
@@ -81,8 +82,8 @@
Token **tokens;
u32_t freq;

- TokenBatch_Reset(batch);
- while ( (tokens = TokenBatch_Next_Cluster(batch, &freq)) != NULL ) {
+ Inversion_Reset(inversion);
+ while ( (tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL ) {
Token *token = *tokens;
u32_t raw_post_bytes = MAX_RAW_POSTING_LEN(token->len, freq);
RawPosting *raw_posting = RawPost_new(

Modified: trunk/c_src/KinoSearch/Posting.bp
===================================================================
--- trunk/c_src/KinoSearch/Posting.bp 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/c_src/KinoSearch/Posting.bp 2008-08-05 17:33:59 UTC (rev 3723)
@@ -25,13 +25,14 @@
Read_Raw(Posting *self, InStream *instream, u32_t last_doc_num,
CharBuf *term_text, MemoryPool *mem_pool);

- /** Process a TokenBatch into RawPosting objects and add them all to the
+ /** Process an Inversion into RawPosting objects and add them all to the
* supplied PostingPool.
*/
abstract void
- Add_Batch_To_Pool(Posting *self, PostingPool *post_pool,
- TokenBatch *batch, FieldSpec *fspec, u32_t doc_num,
- float doc_boost, float length_norm);
+ Add_Inversion_To_Pool(Posting *self, PostingPool *post_pool,
+ Inversion *inversion, FieldSpec *fspec,
+ u32_t doc_num, float doc_boost,
+ float length_norm);

/** Prepare the posting to start reading after a seek.
*/

Modified: trunk/devel/benchmarks/indexers/BenchmarkingIndexer.pm
===================================================================
--- trunk/devel/benchmarks/indexers/BenchmarkingIndexer.pm 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/devel/benchmarks/indexers/BenchmarkingIndexer.pm 2008-08-05 17:33:59 UTC (rev 3723)
@@ -136,7 +136,6 @@

require KinoSearch;
require KinoSearch::InvIndexer;
- require KinoSearch::Analysis::TokenBatch;

# provide runtime flexibility
my $schema = $self->{schema} = BenchSchema->new;

Modified: trunk/perl/MANIFEST
===================================================================
--- trunk/perl/MANIFEST 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/MANIFEST 2008-08-05 17:33:59 UTC (rev 3723)
@@ -51,6 +51,7 @@
charmonizer/Charmonizer/Test/VariadicMacros.c
lib/KinoSearch.pm
lib/KinoSearch/Analysis/Analyzer.pm
+lib/KinoSearch/Analysis/Inversion.pm
lib/KinoSearch/Analysis/LCNormalizer.pm
lib/KinoSearch/Analysis/PolyAnalyzer.pm
lib/KinoSearch/Analysis/Stemmer.pm
@@ -308,7 +309,7 @@
t/113-cf_reader.t
t/150-polyanalyzer.t
t/151-analyzer.t
-t/152-token_batch.t
+t/152-inversion.t
t/153-lc_normalizer.t
t/154-tokenizer.t
t/155-stopalizer.t

Modified: trunk/perl/buildlib/KinoTestUtils.pm
===================================================================
--- trunk/perl/buildlib/KinoTestUtils.pm 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/buildlib/KinoTestUtils.pm 2008-08-05 17:33:59 UTC (rev 3723)
@@ -186,17 +186,17 @@
sub test_analyzer {
my ( $analyzer, $source, $expected, $message ) = @_;

- my $batch = KinoSearch::Analysis::TokenBatch->new( text => $source );
- $batch = $analyzer->transform($batch);
+ my $inversion = KinoSearch::Analysis::Inversion->new( text => $source );
+ $inversion = $analyzer->transform($inversion);
my @got;
- while ( my $token = $batch->next ) {
+ while ( my $token = $inversion->next ) {
push @got, $token->get_text;
}
Test::More::is_deeply( \@got, $expected, "analyze: $message" );

- $batch = $analyzer->transform_text($source);
- @got = ();
- while ( my $token = $batch->next ) {
+ $inversion = $analyzer->transform_text($source);
+ @got = ();
+ while ( my $token = $inversion->next ) {
push @got, $token->get_text;
}
Test::More::is_deeply( \@got, $expected, "transform_text: $message" );

Copied: trunk/perl/lib/KinoSearch/Analysis/Inversion.pm (from rev 3722, trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm)
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Inversion.pm (rev 0)
+++ trunk/perl/lib/KinoSearch/Analysis/Inversion.pm 2008-08-05 17:33:59 UTC (rev 3723)
@@ -0,0 +1,44 @@
+use KinoSearch;
+
+1;
+
+__END__
+
+__XS__
+
+MODULE = KinoSearch PACKAGE = KinoSearch::Analysis::Inversion
+
+SV*
+new(...)
+CODE:
+{
+ kino_Token *starter_token = NULL;
+ /* parse params, only if there's more than one arg */
+ if (items > 1) {
+ HV *const args_hash = build_args_hash( &(ST(0)), 1, items,
+ "KinoSearch::Analysis::Inversion::new_PARAMS");
+ SV *text_sv = extract_sv(args_hash, SNL("text"));
+ STRLEN len;
+ char *text = SvPVutf8(text_sv, len);
+ starter_token = kino_Token_new(text, len, 0, len, 1.0, 1);
+ }
+
+ KOBJ_TO_SV_NOINC( kino_Inversion_new(starter_token), RETVAL );
+ REFCOUNT_DEC(starter_token);
+}
+OUTPUT: RETVAL
+
+__AUTO_XS__
+
+{ "KinoSearch::Analysis::Inversion" => {
+ bind_methods => [qw( append reset invert next )],
+ }
+}
+
+__COPYRIGHT__
+
+Copyright 2005-2008 Marvin Humphrey
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+

Deleted: trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm 2008-08-05 17:33:59 UTC (rev 3723)
@@ -1,57 +0,0 @@
-use KinoSearch;
-
-1;
-
-__END__
-
-__XS__
-
-MODULE = KinoSearch PACKAGE = KinoSearch::Analysis::TokenBatch
-
-SV*
-new(...)
-CODE:
-{
- kino_Token *starter_token = NULL;
- /* parse params, only if there's more than one arg */
- if (items > 1) {
- HV *const args_hash = build_args_hash( &(ST(0)), 1, items,
- "KinoSearch::Analysis::TokenBatch::new_PARAMS");
- SV *text_sv = extract_sv(args_hash, SNL("text"));
- STRLEN len;
- char *text = SvPVutf8(text_sv, len);
- starter_token = kino_Token_new(text, len, 0, len, 1.0, 1);
- }
-
- KOBJ_TO_SV_NOINC( kino_TokenBatch_new(starter_token), RETVAL );
- REFCOUNT_DEC(starter_token);
-}
-OUTPUT: RETVAL
-
-__AUTO_XS__
-
-{ "KinoSearch::Analysis::TokenBatch" => {
- bind_methods => [qw( append reset invert next )],
- }
-}
-
-__POD__
-
-=head1 NAME
-
-KinoSearch::Analysis::TokenBatch - Redacted.
-
-=head1 REDACTED
-
-TokenBatch's public API has been redacted.
-
-=head1 COPYRIGHT
-
-Copyright 2005-2008 Marvin Humphrey
-
-=head1 LICENSE, DISCLAIMER, BUGS, etc.
-
-See L<KinoSearch> version 0.20.
-
-=cut
-

Added: trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm (rev 0)
+++ trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm 2008-08-05 17:33:59 UTC (rev 3723)
@@ -0,0 +1,24 @@
+use KinoSearch;
+
+1;
+
+__END__
+
+=head1 NAME
+
+KinoSearch::Analysis::TokenBatch - Removed.
+
+=head1 REMOVED
+
+TokenBatch has been removed from the KinoSearch suite as of version 0.20.
+
+=head1 COPYRIGHT
+
+Copyright 2005-2008 Marvin Humphrey
+
+=head1 LICENSE, DISCLAIMER, BUGS, etc.
+
+See L<KinoSearch> version 0.20.
+
+=cut
+

Modified: trunk/perl/lib/KinoSearch.pm
===================================================================
--- trunk/perl/lib/KinoSearch.pm 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/lib/KinoSearch.pm 2008-08-05 17:33:59 UTC (rev 3723)
@@ -70,6 +70,15 @@
}

{
+ package KinoSearch::Analysis::Inversion;
+
+ our %new_PARAMS = (
+ # params
+ text => undef
+ );
+}
+
+{
package KinoSearch::Analysis::Stemmer;
sub lazy_load_snowball { require Lingua::Stem::Snowball }
}
@@ -104,15 +113,6 @@
}

{
- package KinoSearch::Analysis::TokenBatch;
-
- our %new_PARAMS = (
- # params
- text => undef
- );
-}
-
-{
package KinoSearch::Analysis::Tokenizer;

# Inside-out member var.

Modified: trunk/perl/t/150-polyanalyzer.t
===================================================================
--- trunk/perl/t/150-polyanalyzer.t 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/t/150-polyanalyzer.t 2008-08-05 17:33:59 UTC (rev 3723)
@@ -11,7 +11,7 @@
use KinoSearch::Analysis::Stopalizer;
use KinoSearch::Analysis::Stemmer;
use KinoSearch::Analysis::PolyAnalyzer;
-use KinoSearch::Analysis::TokenBatch;
+use KinoSearch::Analysis::Inversion;

my $source_text = 'Eats, shoots and leaves.';


Copied: trunk/perl/t/152-inversion.t (from rev 3712, trunk/perl/t/152-token_batch.t)
===================================================================
--- trunk/perl/t/152-inversion.t (rev 0)
+++ trunk/perl/t/152-inversion.t 2008-08-05 17:33:59 UTC (rev 3723)
@@ -0,0 +1,68 @@
+use strict;
+use warnings;
+use lib 'buildlib';
+
+use Test::More tests => 4;
+
+use KinoSearch::Analysis::Inversion;
+use KinoSearch::Analysis::Token;
+
+use KinoTestUtils qw( utf8_test_strings );
+
+my $inversion = KinoSearch::Analysis::Inversion->new;
+$inversion->append(
+ KinoSearch::Analysis::Token->new(
+ text => "car",
+ start_offset => 0,
+ end_offset => 3,
+ ),
+);
+$inversion->append(
+ KinoSearch::Analysis::Token->new(
+ text => "bike",
+ start_offset => 10,
+ end_offset => 14,
+ ),
+);
+$inversion->append(
+ KinoSearch::Analysis::Token->new(
+ text => "truck",
+ start_offset => 20,
+ end_offset => 25,
+ ),
+);
+
+my @texts;
+while ( my $token = $inversion->next ) {
+ push @texts, $token->get_text;
+}
+is_deeply( \@texts, [qw( car bike truck )], "return tokens in order" );
+
+$inversion = KinoSearch::Analysis::Inversion->new;
+$inversion->append(
+ KinoSearch::Analysis::Token->new(
+ text => "foo",
+ start_offset => 0,
+ end_offset => 3,
+ pos_inc => 10,
+ ),
+);
+$inversion->append(
+ KinoSearch::Analysis::Token->new(
+ text => "bar",
+ start_offset => 4,
+ end_offset => 7,
+ pos_inc => ( 2**31 - 2 ),
+ ),
+);
+eval { $inversion->invert; };
+like( $@, qr/position/, "catch overflow in token position calculation" );
+
+my ( $smiley, $not_a_smiley, $frowny ) = utf8_test_strings();
+
+$inversion = KinoSearch::Analysis::Inversion->new( text => $smiley );
+is( $inversion->next->get_text, $smiley,
+ "Inversion->new handles UTF-8 correctly" );
+$inversion = KinoSearch::Analysis::Inversion->new( text => $not_a_smiley );
+is( $inversion->next->get_text, $frowny,
+ "Inversion->new upgrades non-UTF-8 correctly" );

Deleted: trunk/perl/t/152-token_batch.t
===================================================================
--- trunk/perl/t/152-token_batch.t 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/t/152-token_batch.t 2008-08-05 17:33:59 UTC (rev 3723)
@@ -1,68 +0,0 @@
-use strict;
-use warnings;
-use lib 'buildlib';
-
-use Test::More tests => 4;
-
-use KinoSearch::Analysis::TokenBatch;
-use KinoSearch::Analysis::Token;
-
-use KinoTestUtils qw( utf8_test_strings );
-
-my $batch = KinoSearch::Analysis::TokenBatch->new;
-$batch->append(
- KinoSearch::Analysis::Token->new(
- text => "car",
- start_offset => 0,
- end_offset => 3,
- ),
-);
-$batch->append(
- KinoSearch::Analysis::Token->new(
- text => "bike",
- start_offset => 10,
- end_offset => 14,
- ),
-);
-$batch->append(
- KinoSearch::Analysis::Token->new(
- text => "truck",
- start_offset => 20,
- end_offset => 25,
- ),
-);
-
-my @texts;
-while ( my $token = $batch->next ) {
- push @texts, $token->get_text;
-}
-is_deeply( \@texts, [qw( car bike truck )], "return tokens in order" );
-
-$batch = KinoSearch::Analysis::TokenBatch->new;
-$batch->append(
- KinoSearch::Analysis::Token->new(
- text => "foo",
- start_offset => 0,
- end_offset => 3,
- pos_inc => 10,
- ),
-);
-$batch->append(
- KinoSearch::Analysis::Token->new(
- text => "bar",
- start_offset => 4,
- end_offset => 7,
- pos_inc => ( 2**31 - 2 ),
- ),
-);
-eval { $batch->invert; };
-like( $@, qr/position/, "catch overflow in token position calculation" );
-
-my ( $smiley, $not_a_smiley, $frowny ) = utf8_test_strings();
-
-$batch = KinoSearch::Analysis::TokenBatch->new( text => $smiley );
-is( $batch->next->get_text, $smiley,
- "TokenBatch->new handles UTF-8 correctly" );
-$batch = KinoSearch::Analysis::TokenBatch->new( text => $not_a_smiley );
-is( $batch->next->get_text, $frowny,
- "TokenBatch->new upgrades non-UTF-8 correctly" );

Modified: trunk/perl/t/153-lc_normalizer.t
===================================================================
--- trunk/perl/t/153-lc_normalizer.t 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/t/153-lc_normalizer.t 2008-08-05 17:33:59 UTC (rev 3723)
@@ -7,7 +7,6 @@

use KinoSearch::Analysis::LCNormalizer;
use KinoSearch::Analysis::Token;
-use KinoSearch::Analysis::TokenBatch;

my $lc_normalizer = KinoSearch::Analysis::LCNormalizer->new;


Modified: trunk/perl/t/154-tokenizer.t
===================================================================
--- trunk/perl/t/154-tokenizer.t 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/t/154-tokenizer.t 2008-08-05 17:33:59 UTC (rev 3723)
@@ -4,18 +4,18 @@
use Test::More tests => 8;

use KinoSearch::Analysis::Tokenizer;
-use KinoSearch::Analysis::TokenBatch;
+use KinoSearch::Analysis::Inversion;

my $tokenizer = KinoSearch::Analysis::Tokenizer->new;

my $text = $tokenizer->split("o'malley's")->[0];
is( $text, "o'malley's", "multiple apostrophes for default token_re" );

-my $batch = KinoSearch::Analysis::TokenBatch->new( text => "a b c" );
-$batch = $tokenizer->transform($batch);
+my $inversion = KinoSearch::Analysis::Inversion->new( text => "a b c" );
+$inversion = $tokenizer->transform($inversion);

my ( @token_texts, @start_offsets, @end_offsets );
-while ( my $token = $batch->next ) {
+while ( my $token = $inversion->next ) {
push @token_texts, $token->get_text;
push @start_offsets, $token->get_start_offset;
push @end_offsets, $token->get_end_offset;
@@ -25,13 +25,13 @@
is_deeply( \@end_offsets, [ 1, 3, 5, ], "correct end offsets" );

$tokenizer = KinoSearch::Analysis::Tokenizer->new( token_re => qr/./ );
-$batch = KinoSearch::Analysis::TokenBatch->new( text => "a b c" );
-$batch = $tokenizer->transform($batch);
+$inversion = KinoSearch::Analysis::Inversion->new( text => "a b c" );
+$inversion = $tokenizer->transform($inversion);

@token_texts = ();
@start_offsets = ();
@end_offsets = ();
-while ( my $token = $batch->next ) {
+while ( my $token = $inversion->next ) {
push @token_texts, $token->get_text;
push @start_offsets, $token->get_start_offset;
push @end_offsets, $token->get_end_offset;
@@ -40,10 +40,10 @@
is_deeply( \@start_offsets, [ 0 .. 4 ], "starts: custom re" );
is_deeply( \@end_offsets, [ 1 .. 5 ], "ends: custom re" );

-$batch->reset;
-$batch = $tokenizer->transform($batch);
+$inversion->reset;
+$inversion = $tokenizer->transform($inversion);
@token_texts = ();
-while ( my $token = $batch->next ) {
+while ( my $token = $inversion->next ) {
push @token_texts, $token->get_text;
}
is_deeply(

Modified: trunk/perl/t/605-store_pos_boost.t
===================================================================
--- trunk/perl/t/605-store_pos_boost.t 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/t/605-store_pos_boost.t 2008-08-05 17:33:59 UTC (rev 3723)
@@ -4,20 +4,20 @@

package MyTokenizer;
use base qw( KinoSearch::Analysis::Analyzer );
-use KinoSearch::Analysis::TokenBatch;
+use KinoSearch::Analysis::Inversion;

sub transform {
- my ( $self, $batch ) = @_;
- my $new_batch = KinoSearch::Analysis::TokenBatch->new;
+ my ( $self, $inversion ) = @_;
+ my $new_inversion = KinoSearch::Analysis::Inversion->new;

- while ( my $token = $batch->next ) {
+ while ( my $token = $inversion->next ) {
for ( $token->get_text ) {
my $this_time = /z/ ? 1 : 0;
# Accumulate token start_offsets and end_offsets.
while (/(\w)/g) {
# Special boost just for one doc.
my $boost = ( $1 eq 'a' and $this_time ) ? 100 : 1;
- $new_batch->append(
+ $new_inversion->append(
KinoSearch::Analysis::Token->new(
text => $1,
start_offset => $-[0],
@@ -29,7 +29,7 @@
}
}

- return $new_batch;
+ return $new_inversion;
}

package MySchema::boosted;

Modified: trunk/perl/xs/KinoSearch/Analysis/LCNormalizer.c
===================================================================
--- trunk/perl/xs/KinoSearch/Analysis/LCNormalizer.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/xs/KinoSearch/Analysis/LCNormalizer.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -2,7 +2,7 @@

#include "KinoSearch/Analysis/LCNormalizer.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Util/ByteBuf.h"
#include "KinoSearch/Util/MemManager.h"
#include "KinoSearch/Util/Native.h"
@@ -41,11 +41,12 @@
*dest = '\0';
}

-kino_TokenBatch*
-kino_LCNormalizer_transform(kino_LCNormalizer *self, kino_TokenBatch *batch)
+kino_Inversion*
+kino_LCNormalizer_transform(kino_LCNormalizer *self,
+ kino_Inversion *inversion)
{
kino_Token *token;
- while (NULL != (token = Kino_TokenBatch_Next(batch))) {
+ while (NULL != (token = Kino_Inversion_Next(inversion))) {
lc_to_work_buf(self, (chy_u8_t*)token->text, token->len);
if (self->work_buf->len > token->len) {
free(token->text);
@@ -54,19 +55,19 @@
memcpy(token->text, self->work_buf->ptr, self->work_buf->len + 1);
token->len = self->work_buf->len;
}
- Kino_TokenBatch_Reset(batch);
- return REFCOUNT_INC(batch);
+ Kino_Inversion_Reset(inversion);
+ return REFCOUNT_INC(inversion);
}

-kino_TokenBatch*
+kino_Inversion*
kino_LCNormalizer_transform_text(kino_LCNormalizer *self, kino_CharBuf *text)
{
- kino_TokenBatch *retval;
+ kino_Inversion *retval;
kino_Token *token;
lc_to_work_buf(self, (chy_u8_t*)text->ptr, Kino_CB_Get_Size(text));
token = kino_Token_new(self->work_buf->ptr, self->work_buf->len, 0,
self->work_buf->len, 1.0f, 1);
- retval = kino_TokenBatch_new(token);
+ retval = kino_Inversion_new(token);
REFCOUNT_DEC(token);
return retval;
}

Modified: trunk/perl/xs/KinoSearch/Analysis/Tokenizer.c
===================================================================
--- trunk/perl/xs/KinoSearch/Analysis/Tokenizer.c 2008-08-05 16:16:25 UTC (rev 3722)
+++ trunk/perl/xs/KinoSearch/Analysis/Tokenizer.c 2008-08-05 17:33:59 UTC (rev 3723)
@@ -2,12 +2,12 @@

#include "KinoSearch/Analysis/Tokenizer.h"
#include "KinoSearch/Analysis/Token.h"
-#include "KinoSearch/Analysis/TokenBatch.h"
+#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Util/StringHelper.h"

void
kino_Tokenizer_tokenize_str(kino_Tokenizer *self, const char *string,
- size_t string_len, kino_TokenBatch *batch)
+ size_t string_len, kino_Inversion *inversion)
{
chy_u32_t num_code_points = 0;
SV *wrapper = sv_newmortal();
@@ -55,7 +55,7 @@
}
end = num_code_points;

- /* Add a token to the new_batch. */
+ /* Add a token to the new_inversion. */
new_token = kino_Token_new(
start_ptr,
(end_ptr - start_ptr),
@@ -64,7 +64,7 @@
1.0f, /* boost always 1 for now */
1 /* position increment */
);
- Kino_TokenBatch_Append(batch, new_token);
+ Kino_Inversion_Append(inversion, new_token);
REFCOUNT_DEC(new_token);
}
}


_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits