Mailing List Archive

r3702 - in trunk/perl: lib/KinoSearch/Analysis t
Author: creamyg
Date: 2008-08-03 08:39:16 -0700 (Sun, 03 Aug 2008)
New Revision: 3702

Modified:
trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm
trunk/perl/t/605-store_pos_boost.t
Log:
Kill off TokenBatch hand-rolled XS: add_many_tokens, set_all_texts,
get_all_texts.


Modified: trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm 2008-08-03 14:41:06 UTC (rev 3701)
+++ trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm 2008-08-03 15:39:16 UTC (rev 3702)
@@ -28,153 +28,6 @@
}
OUTPUT: RETVAL

-=for comment
-
-Add many tokens to the batch, by supplying the string to be tokenized, and
-arrays of token starts and token ends.
-
-=cut
-
-void
-add_many_tokens(self, string_sv, starts_av, ends_av, ...)
- kino_TokenBatch *self;
- SV *string_sv;
- AV *starts_av;
- AV *ends_av;
-PPCODE:
-{
- const chy_u32_t num_starts = av_len(starts_av) + 1;
- size_t len;
- char *string_top = SvPV(string_sv, len);
- char *ptr = string_top;
- char *token_start = string_top;
- char *limit = SvEND(string_sv);
- size_t num_code_points = 0;
- size_t i;
- AV *boosts_av = NULL;
-
- if( !SvUTF8(string_sv) )
- CONFESS("source string not encoded as UTF-8");
-
- if (items == 5) {
- if (SvROK(ST(4)) && SvTYPE(SvRV(ST(4)))==SVt_PVAV)
- boosts_av = (AV*)SvRV(ST(4));
- else
- CONFESS("boosts_av is not an array reference");
- }
-
- for (i = 0; i < num_starts; i++) {
- size_t start_offset, end_offset;
- kino_Token *token;
- float boost = 1.0;
-
- /* retrieve start and end */
- SV **const start_sv_ptr = av_fetch(starts_av, i, 0);
- SV **const end_sv_ptr = av_fetch(ends_av, i, 0);
- if (start_sv_ptr == NULL)
- CONFESS("Failed to retrieve @starts array element");
- if (end_sv_ptr == NULL)
- CONFESS("Failed to retrieve @ends array element");
- start_offset = SvUV(*start_sv_ptr);
- end_offset = SvUV(*end_sv_ptr);
-
- /* retrieve boost, if supplied */
- if (boosts_av != NULL) {
- SV **const boost_sv_ptr = av_fetch(boosts_av, i, 0);
- if (boost_sv_ptr == NULL)
- CONFESS("Failed to retrieve @boosts array element");
- boost = (float)SvNV(*boost_sv_ptr);
- }
-
- /* scan to, or continue scanning to, the start and end offsets */
- for ( ; num_code_points < start_offset; num_code_points++) {
- ptr += KINO_STRHELP_UTF8_SKIP[(chy_u8_t)*ptr];
- if (ptr > limit)
- CONFESS("scanned past end of '%s'", string_top);
- }
- token_start = ptr;
- for ( ; num_code_points < end_offset; num_code_points++) {
- ptr += KINO_STRHELP_UTF8_SKIP[(chy_u8_t)*ptr];
- if (ptr > limit)
- CONFESS("scanned past end of '%s'", string_top);
- }
-
- /* calculate the start of the substring and add the token */
- token = kino_Token_new(
- token_start,
- (ptr - token_start),
- start_offset,
- end_offset,
- boost,
- 1
- );
- Kino_TokenBatch_Append(self, token);
- REFCOUNT_DEC(token);
- }
-}
-
-=for comment
-
-Take an array of Perl scalars and map their string contents to the texts for
-each token in the batch.
-
-=cut
-
-void
-set_all_texts(self, texts_av)
- kino_TokenBatch *self;
- AV *texts_av;
-PPCODE:
-{
- chy_i32_t i;
- const chy_i32_t max = av_len(texts_av);
-
- for (i = 0; i <= max; i++) {
- kino_Token *const token = (kino_Token*)Kino_TokenBatch_Fetch(self, i);
- SV **const sv_ptr = av_fetch(texts_av, i, 0);
- char *text;
- size_t len;
-
- if (sv_ptr == NULL)
- CONFESS("Encountered a null SV* pointer");
- text = SvPVutf8(*sv_ptr, len);
-
- if (token == NULL) {
- CONFESS("Batch size %d doesn't match array size %d",
- self->size, (max + 1));
- }
- free(token->text);
- token->text = kino_StrHelp_strndup(text, len);
- token->len = len;
- }
-}
-
-=for comment
-
-Return a Perl array whose elements correspond to the token texts in this
-batch.
-
-=cut
-
-void
-get_all_texts(self)
- kino_TokenBatch *self;
-PPCODE:
-{
- AV *const out_av = newAV();
- chy_u32_t i;
-
- for (i = 0; i < self->size; i++) {
- kino_Token *const token = (kino_Token*)Kino_TokenBatch_Fetch(self, i);
- SV *const text = newSVpvn(token->text, token->len);
- SvUTF8_on(text);
- av_push(out_av, text);
- }
-
- XPUSHs(sv_2mortal( newRV_noinc((SV*)out_av) ));
- XSRETURN(1);
-}
-
__AUTO_XS__

{ "KinoSearch::Analysis::TokenBatch" => {
@@ -254,20 +107,6 @@

Tack a Token onto the end of the batch.

-=head2 add_many_tokens
-
- $batch->add_many_tokens( $string, \@starts, \@ends );
- # or...
- $batch->add_many_tokens( $string, \@starts, \@ends, \@boosts );
-
-High efficiency method for adding multiple tokens to the batch with one call.
-The starts and ends, which must be specified in characters (not bytes), will
-be used to identify substrings of C<$string> to supply as token texts to
-Token->new.
-
-(Note: boosts should be supplied only for fields which are set to
-C<store_pos_boost>.)
-
=head2 next

while ( my $token = $batch->next ) {

Modified: trunk/perl/t/605-store_pos_boost.t
===================================================================
--- trunk/perl/t/605-store_pos_boost.t 2008-08-03 14:41:06 UTC (rev 3701)
+++ trunk/perl/t/605-store_pos_boost.t 2008-08-03 15:39:16 UTC (rev 3702)
@@ -14,21 +14,18 @@
for ( $token->get_text ) {
my $this_time = /z/ ? 1 : 0;
# Accumulate token start_offsets and end_offsets.
- my ( @starts, @ends, @boosts );
while (/(\w)/g) {
- push @starts, $-[0];
- push @ends, $+[0];
-
# Special boost just for one doc.
- if ( $1 eq 'a' and $this_time ) {
- push @boosts, 100;
- }
- else {
- push @boosts, 1;
- }
+ my $boost = ( $1 eq 'a' and $this_time ) ? 100 : 1;
+ $new_batch->append(
+ KinoSearch::Analysis::Token->new(
+ text => $1,
+ start_offset => $-[0],
+ end_offset => $+[0],
+ boost => $boost,
+ ),
+ );
}
-
- $new_batch->add_many_tokens( $_, \@starts, \@ends, \@boosts );
}
}



_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits