Mailing List Archive: r3727 - in trunk: c_src/KinoSearch/Analysis perl/lib perl/lib/KinoSearch/Analysis perl/xs/KinoSearch/Analysis

Author: creamyg
Date: 2008-08-06 12:37:36 -0700 (Wed, 06 Aug 2008)
New Revision: 3727

Modified:
trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.c
trunk/c_src/KinoSearch/Analysis/Tokenizer.bp
trunk/c_src/KinoSearch/Analysis/Tokenizer.c
trunk/perl/lib/KinoSearch.pm
trunk/perl/lib/KinoSearch/Analysis/Tokenizer.pm
trunk/perl/xs/KinoSearch/Analysis/Tokenizer.c
Log:
Change official public API for Tokenizer to take a "pattern" instead of a
"token_re", but retain "token_re" for backwards compatibility. Change the
default pattern to match Unicode apostrophe U+2019 in addition to the standard
ASCII apostrophe. Port Tokenizer documentation to C.

Modified: trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.c
===================================================================
--- trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.c 2008-08-06 19:30:20 UTC (rev 3726)
+++ trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.c 2008-08-06 19:37:36 UTC (rev 3727)
@@ -24,7 +24,7 @@
}
else if (language) {
LCNormalizer *lc_normalizer = LCNormalizer_new();
- Tokenizer *tokenizer = Tokenizer_new();
+ Tokenizer *tokenizer = Tokenizer_new(NULL);
Stemmer *stemmer = Stemmer_new(language);
self->analyzers = VA_new(3);
VA_Push(self->analyzers, (Obj*)lc_normalizer);

Modified: trunk/c_src/KinoSearch/Analysis/Tokenizer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Tokenizer.bp 2008-08-06 19:30:20 UTC (rev 3726)
+++ trunk/c_src/KinoSearch/Analysis/Tokenizer.bp 2008-08-06 19:37:36 UTC (rev 3727)
@@ -1,15 +1,53 @@
parcel KinoSearch cnick Kino;

+/** Customizable tokenizing.
+ *
+ * Generically, "tokenizing" is a process of breaking up a string into an array
+ * of "tokens". For instance, the string "three blind mice" might be tokenized
+ * into "three", "blind", "mice".
+ *
+ * KinoSearch::Analysis::Tokenizer decides where it should break up the text
+ * based on a regular expression compiled from a supplied <code>pattern</code>
+ * matching one token. If our source string is...
+ *
+ * "Eats, Shoots and Leaves."
+ *
+ * ... then a "whitespace tokenizer" with a <code>pattern</code> of <code>\S+</code>
+ * produces...
+ *
+ * Eats,
+ * Shoots
+ * and
+ * Leaves.
+ *
+ * ... while a "word character tokenizer" with a <code>pattern</code> of
+ * <code>\w+</code> produces...
+ *
+ * Eats
+ * Shoots
+ * and
+ * Leaves
+ *
+ * ... the difference being that the word character tokenizer skips over
+ * punctuation as well as whitespace when determining token boundaries.
+ */
class KinoSearch::Analysis::Tokenizer
extends KinoSearch::Analysis::Analyzer {

- void *token_re;
+ CharBuf *pattern;
+ void *token_re;

static incremented Tokenizer*
- new();
+ new(const CharBuf *pattern = NULL);

+ /**
+ * @param pattern A string specifying a Perl-syntax regular expression
+ * which should match one token. The default value is
+ * <code>\w+(?:[\x{2019}']\w+)*</code>, which matches "it's" as well as
+ * "it" and "O'Henry's" as well as "Henry".
+ */
static Tokenizer*
- init(Tokenizer *self);
+ init(Tokenizer *self, const CharBuf *pattern = NULL);

public incremented Inversion*
Transform(Tokenizer *self, Inversion *inversion);
@@ -24,7 +62,13 @@
Tokenize_Str(Tokenizer *self, const char *text, size_t len,
Inversion *inversion);

+ /** Set the compiled regular expression for matching a token. TODO: should
+ * also set pattern.
+ */
void
+ Set_Token_RE(Tokenizer *self, void *token_re);
+
+ void
Destroy(Tokenizer *self);
}

Modified: trunk/c_src/KinoSearch/Analysis/Tokenizer.c
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Tokenizer.c 2008-08-06 19:30:20 UTC (rev 3726)
+++ trunk/c_src/KinoSearch/Analysis/Tokenizer.c 2008-08-06 19:37:36 UTC (rev 3727)
@@ -6,21 +6,12 @@
#include "KinoSearch/Util/Native.h"

Tokenizer*
-Tokenizer_new()
+Tokenizer_new(const CharBuf *pattern)
{
Tokenizer *self = (Tokenizer*)CREATE(NULL, TOKENIZER);
- return Tokenizer_init(self);
+ return Tokenizer_init(self, pattern);
}

-Tokenizer*
-Tokenizer_init(Tokenizer *self)
-{
- Analyzer_init((Analyzer*)self);
- self->token_re = NULL;
- Native_callback(self, "_cache_token_re", 0);
- return self;
-}
-
Inversion*
Tokenizer_transform(Tokenizer *self, Inversion *inversion)
{
@@ -43,12 +34,6 @@
return new_inversion;
}

-void
-Tokenizer_destroy(Tokenizer *self)
-{
- FREE_OBJ(self);
-}
-
/* Copyright 2005-2008 Marvin Humphrey
*
* This program is free software; you can redistribute it and/or modify

Modified: trunk/perl/lib/KinoSearch/Analysis/Tokenizer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Tokenizer.pm 2008-08-06 19:30:20 UTC (rev 3726)
+++ trunk/perl/lib/KinoSearch/Analysis/Tokenizer.pm 2008-08-06 19:37:36 UTC (rev 3727)
@@ -6,110 +6,42 @@

__AUTO_XS__

-{ "KinoSearch::Analysis::Tokenizer" => {
- make_constructors => ["_new"],
- },
-}
-
-__XS__
-
-MODULE = KinoSearch PACKAGE = KinoSearch::Analysis::Tokenizer
-
-void
-_do_cache_token_re(self, token_re_sv)
- kino_Tokenizer *self;
- SV *token_re_sv;
-PPCODE:
-{
- MAGIC *mg = NULL;
- /* Extract regexp struct from qr// entity. */
- if (SvROK(token_re_sv)) {
- SV *sv = SvRV(token_re_sv);
- if (SvMAGICAL(sv))
- mg = mg_find(sv, PERL_MAGIC_qr);
- }
- if (!mg)
- CONFESS("not a qr// entity");
- self->token_re = mg->mg_obj;
-}
-
-__POD__
-
-=head1 NAME
-
-KinoSearch::Analysis::Tokenizer - Customizable tokenizing.
-
-=head1 SYNOPSIS
-
+my $synopsis = <<'END_SYNOPSIS';
my $whitespace_tokenizer
- = KinoSearch::Analysis::Tokenizer->new( token_re => qr/\S+/, );
+ = KinoSearch::Analysis::Tokenizer->new( pattern => '\S+' );

# or...
my $word_char_tokenizer
- = KinoSearch::Analysis::Tokenizer->new( token_re => qr/\w+/, );
+ = KinoSearch::Analysis::Tokenizer->new( pattern => '\w+' );

# or...
my $apostrophising_tokenizer = KinoSearch::Analysis::Tokenizer->new;

- # then... once you have a tokenizer, put it into a PolyAnalyzer
+ # Then... once you have a tokenizer, put it into a PolyAnalyzer:
my $polyanalyzer = KinoSearch::Analysis::PolyAnalyzer->new(
analyzers => [ $lc_normalizer, $word_char_tokenizer, $stemmer ], );
+END_SYNOPSIS

-
-=head1 DESCRIPTION
-
-Generically, "tokenizing" is a process of breaking up a string into an array
-of "tokens".
-
- # before:
- my $string = "three blind mice";
-
- # after:
- @tokens = qw( three blind mice );
-
-KinoSearch::Analysis::Tokenizer decides where it should break up the text
-based on the value of C<token_re>.
-
- # before:
- my $string = "Eats, Shoots and Leaves.";
-
- # tokenized by $whitespace_tokenizer
- @tokens = qw( Eats, Shoots and Leaves. );
-
- # tokenized by $word_char_tokenizer
- @tokens = qw( Eats Shoots and Leaves );
-
-=head1 METHODS
-
-=head2 new
-
- # match "it's" as well as "it" and "O'Henry's" as well as "Henry"
- my $token_re = qr/
- \w+ # Match word chars.
- (?: # Group, but don't capture...
- '\w+ # ... an apostrophe plus word chars.
- )* # Matching the apostrophe group is optional.
- /xsm;
- my $tokenizer = KinoSearch::Analysis::Tokenizer->new(
- token_re => $token_re, # default: what you see above
+my $constructor = <<'END_CONSTRUCTOR';
+ my $word_char_tokenizer = KinoSearch::Analysis::Tokenizer->new(
+ pattern => '\w+', # required
);
+END_CONSTRUCTOR

-Constructor. Takes one hash style parameter.
+{ "KinoSearch::Analysis::Tokenizer" => {
+ bind_methods => [qw( set_token_re )],
+ make_constructors => ["_new"],
+ make_pod => {
+ constructor => { sample => $constructor },
+ synopsis => $synopsis,
+ },
+ },
+}

-=over
+__COPYRIGHT__

-=item *
-
-B<token_re> - must be a pre-compiled regular expression matching one token.
-
-=back
-
-=head1 COPYRIGHT
-
Copyright 2005-2008 Marvin Humphrey

-=head1 LICENSE, DISCLAIMER, BUGS, etc.
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.

-See L<KinoSearch> version 0.20.
-
-=cut

Modified: trunk/perl/lib/KinoSearch.pm
===================================================================
--- trunk/perl/lib/KinoSearch.pm 2008-08-06 19:30:20 UTC (rev 3726)
+++ trunk/perl/lib/KinoSearch.pm 2008-08-06 19:37:36 UTC (rev 3727)
@@ -115,31 +115,16 @@
{
package KinoSearch::Analysis::Tokenizer;

- # Inside-out member var.
- our %token_re;
+ sub compile_token_re { return qr/$_[1]/ }

sub new {
my ( $either, %args ) = @_;
my $token_re = delete $args{token_re};
+ $args{pattern} = "$token_re" if $token_re;
my $self = $either->_new(%args);
- if ( $token_re ) {
- # Overwrite default, which has already been initialized via callback.
- _cache_token_re( $self, $token_re );
- }
+ $self->set_token_re($token_re) if $token_re;
return $self;
}
-
- sub DESTROY {
- my $self = shift;
- delete $token_re{$$self};
- $self->SUPER::DESTROY;
- }
-
- sub _cache_token_re {
- my ( $self, $token_re ) = @_;
- $token_re{$$self} = $token_re || qr/\w+(?:'\w+)*/;
- _do_cache_token_re( $self, $token_re{$$self} );
- }
}

{

Modified: trunk/perl/xs/KinoSearch/Analysis/Tokenizer.c
===================================================================
--- trunk/perl/xs/KinoSearch/Analysis/Tokenizer.c 2008-08-06 19:30:20 UTC (rev 3726)
+++ trunk/perl/xs/KinoSearch/Analysis/Tokenizer.c 2008-08-06 19:37:36 UTC (rev 3727)
@@ -3,9 +3,51 @@
#include "KinoSearch/Analysis/Tokenizer.h"
#include "KinoSearch/Analysis/Token.h"
#include "KinoSearch/Analysis/Inversion.h"
+#include "KinoSearch/Util/Native.h"
#include "KinoSearch/Util/StringHelper.h"

+kino_Tokenizer*
+kino_Tokenizer_init(kino_Tokenizer *self, const kino_CharBuf *pattern)
+{
+ SV *token_re_sv;
+
+ kino_Analyzer_init((kino_Analyzer*)self);
+ #define DEFAULT_PATTERN "\\w+(?:['\\x{2019}]\\w+)*"
+ self->pattern = pattern
+ ? Kino_CB_Clone(pattern)
+ : kino_CB_new_from_trusted_utf8(DEFAULT_PATTERN,
+ sizeof(DEFAULT_PATTERN) - 1);
+
+ /* Acquire a compiled regex engine for matching one token. */
+ token_re_sv = kino_Native_callback_nat(&KINO_TOKENIZER,
+ "compile_token_re", 1, KINO_ARG_STR("pattern", self->pattern));
+ Kino_Tokenizer_Set_Token_RE(self, SvRV(token_re_sv));
+
+ return self;
+}
+
void
+kino_Tokenizer_set_token_re(kino_Tokenizer *self, void *token_re)
+{
+ MAGIC *magic = NULL;
+ if (SvMAGICAL((SV*)token_re))
+ magic = mg_find((SV*)token_re, PERL_MAGIC_qr);
+ if (!magic)
+ CONFESS("token_re is not a qr// entity");
+ if (self->token_re) ReREFCNT_dec(((REGEXP*)self->token_re));
+ self->token_re = magic->mg_obj;
+ (void)ReREFCNT_inc(((REGEXP*)self->token_re));
+}
+
+void
+kino_Tokenizer_destroy(kino_Tokenizer *self)
+{
+ REFCOUNT_DEC(self->pattern);
+ ReREFCNT_dec(((REGEXP*)self->token_re));
+ KINO_FREE_OBJ(self);
+}
+
+void
kino_Tokenizer_tokenize_str(kino_Tokenizer *self, const char *string,
size_t string_len, kino_Inversion *inversion)
{

_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits