Mailing List Archive: r3722 - in trunk: c_src/KinoSearch/Analysis perl/lib/KinoSearch/Analysis

Author: creamyg
Date: 2008-08-05 09:16:25 -0700 (Tue, 05 Aug 2008)
New Revision: 3722

Modified:
trunk/c_src/KinoSearch/Analysis/Analyzer.bp
trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp
trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp
trunk/c_src/KinoSearch/Analysis/Stemmer.bp
trunk/c_src/KinoSearch/Analysis/Stopalizer.bp
trunk/c_src/KinoSearch/Analysis/Token.bp
trunk/c_src/KinoSearch/Analysis/TokenBatch.bp
trunk/c_src/KinoSearch/Analysis/Tokenizer.bp
trunk/perl/lib/KinoSearch/Analysis/Analyzer.pm
trunk/perl/lib/KinoSearch/Analysis/Token.pm
trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm
Log:
Port docs for Analyzer, Token, and TokenBatch to C. Redact public APIs for
all three classes. Declare several Analyzer methods "public" so that they can
be overridden via Perl (so subclassing Analyzer is now an undocumented
feature.)

Modified: trunk/c_src/KinoSearch/Analysis/Analyzer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Analyzer.bp 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/c_src/KinoSearch/Analysis/Analyzer.bp 2008-08-05 16:16:25 UTC (rev 3722)
@@ -1,26 +1,40 @@
parcel KinoSearch cnick Kino;

+/** Base class for analyzers.
+ *
+ * An Analyzer is a filter which processes text, transforming it from one form
+ * into another. For instance, an analyzer might break up a long text into
+ * smaller pieces (L<Tokenizer|KinoSearch::Analysis::Tokenizer>), or it might
+ * convert text to lowercase
+ * (L<LCNormalizer|KinoSearch::Analysis::LCNormalizer>).
+ */
class KinoSearch::Analysis::Analyzer extends KinoSearch::Obj {

static Analyzer*
init(Analyzer *self);

- abstract incremented TokenBatch*
+ /** Take a single L<TokenBatch|KinoSearch::Analysis::TokenBatch> as input
+ * and return a TokenBatch, either the same one (presumably transformed
+ * in some way), or a new one.
+ */
+ public abstract incremented TokenBatch*
Transform(Analyzer *self, TokenBatch *batch);

- /** Kick off an analysis chain, creating a TokenBatch. Occasionally
- * optimized to minimize string copies.
+ /** Kick off an analysis chain, creating a TokenBatch from string input.
+ * The default implementation simply creates an initial TokenBatch with a
+ * single Token, then calls Transform(), but occasionally subclasses will
+ * provide an optimized implementation which minimizes string copies.
*/
- incremented TokenBatch*
+ public incremented TokenBatch*
Transform_Text(Analyzer *self, CharBuf *text);

/** Analyze text and return an array of token texts.
*/
- incremented VArray*
+ public incremented VArray*
Split(Analyzer *self, CharBuf *text);
}

-/* Copyright 2006-2008 Marvin Humphrey
+/* Copyright 2005-2008 Marvin Humphrey
*
* This program is free software; you can redistribute it and/or modify it
* under the same terms as Perl itself.

Modified: trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp 2008-08-05 16:16:25 UTC (rev 3722)
@@ -11,10 +11,10 @@
static LCNormalizer*
init(LCNormalizer *self);

- incremented TokenBatch*
+ public incremented TokenBatch*
Transform(LCNormalizer *self, TokenBatch *batch);

- incremented TokenBatch*
+ public incremented TokenBatch*
Transform_Text(LCNormalizer *self, CharBuf *text);

void

Modified: trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp 2008-08-05 16:16:25 UTC (rev 3722)
@@ -12,10 +12,10 @@
init(PolyAnalyzer *self, const CharBuf *language = NULL,
VArray *analyzers = NULL);

- incremented TokenBatch*
+ public incremented TokenBatch*
Transform(PolyAnalyzer *self, TokenBatch *batch);

- incremented TokenBatch*
+ public incremented TokenBatch*
Transform_Text(PolyAnalyzer *self, CharBuf *text);

void

Modified: trunk/c_src/KinoSearch/Analysis/Stemmer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Stemmer.bp 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/c_src/KinoSearch/Analysis/Stemmer.bp 2008-08-05 16:16:25 UTC (rev 3722)
@@ -10,7 +10,7 @@
static Stemmer*
init(Stemmer *self, const CharBuf *language);

- incremented TokenBatch*
+ public incremented TokenBatch*
Transform(Stemmer *self, TokenBatch *batch);

void

Modified: trunk/c_src/KinoSearch/Analysis/Stopalizer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Stopalizer.bp 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/c_src/KinoSearch/Analysis/Stopalizer.bp 2008-08-05 16:16:25 UTC (rev 3722)
@@ -11,7 +11,7 @@
init(Stopalizer *self, const CharBuf *language = NULL,
Hash *stoplist = NULL);

- incremented TokenBatch*
+ public incremented TokenBatch*
Transform(Stopalizer *self, TokenBatch *batch);

void

Modified: trunk/c_src/KinoSearch/Analysis/Token.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Token.bp 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/c_src/KinoSearch/Analysis/Token.bp 2008-08-05 16:16:25 UTC (rev 3722)
@@ -1,5 +1,38 @@
parcel KinoSearch cnick Kino;

+/** Unit of text.
+ *
+ * Token is the fundamental unit used by KinoSearch's Analyzer subclasses.
+ * Each Token has 5 attributes: <code>text</code>, <code>start_offset</code>,
+ * <code>end_offset</code>, <code>boost</code>, and <code>pos_inc</code>.
+ *
+ * The <code>text</code> attribute is a Unicode string encoded as UTF-8.
+ *
+ * <code>start_offset</code> is the start point of the token text, measured in
+ * Unicode code points from the top of the stored field;
+ * <code>end_offset</code> delimits the corresponding closing boundary.
+ * <code>start_offset</code> and <code>end_offset</code> locate the Token
+ * within a larger context, even if the Token's text attribute gets modified
+ * -- by stemming, for instance. The Token for "beating" in the text "beating
+ * a dead horse" begins life with a start_offset of 0 and an end_offset of 7;
+ * after stemming, the text is "beat", but the start_offset is still 0 and the
+ * end_offset is still 7. This allows "beating" to be highlighted correctly
+ * after a search matches "beat".
+ *
+ * <code>boost</code> is a per-token weight. Use this when you want to assign
+ * more or less importance to a particular token, as you might for emboldened
+ * text within an HTML document, for example. (Note: The field this token
+ * belongs to must be spec'd to use a posting of type
+ * L<KinoSearch::Posting::RichPosting>.)
+ *
+ * <code>pos_inc</code> is the POSition INCrement, measured in Tokens. This
+ * attribute, which defaults to 1, is an advanced tool for manipulating
+ * phrase matching. Ordinarily, Tokens are assigned consecutive position
+ * numbers: 0, 1, and 2 for <code>"three blind mice"</code>. However, if you
+ * set the position increment for "blind" to, say, 1000, then the three tokens
+ * will end up assigned to positions 0, 1, and 1001 -- and will no longer
+ * produce a phrase match for the query <code>"three blind mice"</code>.
+ */
class KinoSearch::Analysis::Token extends KinoSearch::Obj::FastObj {

char *text;

Modified: trunk/c_src/KinoSearch/Analysis/TokenBatch.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/TokenBatch.bp 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/c_src/KinoSearch/Analysis/TokenBatch.bp 2008-08-05 16:16:25 UTC (rev 3722)
@@ -26,12 +26,12 @@
void
Append(TokenBatch *self, Token *token);

- /* Return the next token in the TokenBatch until out of tokens.
+ /** Return the next token in the TokenBatch until out of tokens.
*/
Token*
Next(TokenBatch *self);

- /* Reset the TokenBatch's iterator, so that the next call to next()
+ /** Reset the TokenBatch's iterator, so that the next call to next()
* returns the first Token in the batch.
*/
void

Modified: trunk/c_src/KinoSearch/Analysis/Tokenizer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Tokenizer.bp 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/c_src/KinoSearch/Analysis/Tokenizer.bp 2008-08-05 16:16:25 UTC (rev 3722)
@@ -11,10 +11,10 @@
static Tokenizer*
init(Tokenizer *self);

- incremented TokenBatch*
+ public incremented TokenBatch*
Transform(Tokenizer *self, TokenBatch *batch);

- incremented TokenBatch*
+ public incremented TokenBatch*
Transform_Text(Tokenizer *self, CharBuf *text);

/** Tokenize the supplied string and add any Tokens generated to the

Modified: trunk/perl/lib/KinoSearch/Analysis/Analyzer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Analyzer.pm 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/perl/lib/KinoSearch/Analysis/Analyzer.pm 2008-08-05 16:16:25 UTC (rev 3722)
@@ -12,63 +12,16 @@
_split|split )
],
make_constructors => ["new"],
+ make_pod => {
+ synopsis => " # Abstract base class.\n",
+ }
}
}

-__POD__
+__COPYRIGHT__

-=head1 NAME
-
-KinoSearch::Analysis::Analyzer - Base class for analyzers.
-
-=head1 SYNOPSIS
-
- # abstract base class -- must be subclassed
-
- package MyAnalyzer;
-
- sub transform {
- my ( $self, $token_batch ) = @_;
-
- while ( my $token = $token_batch->next ) {
- my $new_text = transform( $token->get_text );
- $token->set_text($new_text);
- }
-
- return $token_batch;
- }
-
- sub transform {
- # ...
- }
-
-=head1 DESCRIPTION
-
-In KinoSearch, an Analyzer is a filter which processes text, transforming it
-from one form into another. For instance, an analyzer might break up a long
-text into smaller pieces (L<Tokenizer|KinoSearch::Analysis::Tokenizer>), or it
-might convert text to lowercase
-(L<LCNormalizer|KinoSearch::Analysis::LCNormalizer>).
-
-=head1 SUBCLASSING
-
-All Analyzer subclasses must provide a C<transform> method.
-
-=head2 transform
-
- $token_batch = $analyzer->transform($token_batch);
-
-Abstract method. C<transform()> takes a single
-L<TokenBatch|KinoSearch::Analysis::TokenBatch> as input, and it returns a
-TokenBatch, either the same one (presumably transformed in some way), or a new
-one.
-
-=head1 COPYRIGHT
-
Copyright 2005-2008 Marvin Humphrey

-=head1 LICENSE, DISCLAIMER, BUGS, etc.
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.

-See L<KinoSearch> version 0.20.
-
-=cut

Modified: trunk/perl/lib/KinoSearch/Analysis/Token.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Token.pm 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/perl/lib/KinoSearch/Analysis/Token.pm 2008-08-05 16:16:25 UTC (rev 3722)
@@ -66,109 +66,15 @@

=head1 NAME

-KinoSearch::Analysis::Token - Unit of text.
+KinoSearch::Analysis::Token - Redacted.

-=head1 SYNOPSIS
+=head1 REDACTED

- my $token = KinoSearch::Analysis::Token->new(
- text => 'horses',
- start_offset => 0,
- end_offset => 6,
- );
- $token->set_text('hors');
+Token's public API has been redacted.

-=head1 DESCRIPTION
-
-Token is the fundamental unit used by KinoSearch's Analyzer subclasses. Each
-Token has 5 attributes:
-
-=over
-
-=item *
-
-B<text> - a UTF-8 string.
-
-=item *
-
-B<start_offset> - The start point of the token text, measured in UTF-8
-characters from the top of the stored field. C<start_offset> and C<end_offset>
-locate the Token within a larger context, even if the Token's text attribute
-gets modified -- by stemming, for instance. The Token for "beating" in the
-text "beating a dead horse" begins life with a start_offset of 0 and an
-end_offset of 7; after stemming, the text is "beat", but the start_offset is
-still 0 and the end_offset is still 7. This allows "beating" to be
-highlighted correctly after a search matches "beat".
-
-=item *
-
-B<end_offset> The end of the token text, measured in UTF-8 characters from the
-top of the field.
-
-=item *
-
-B<boost> - a per-token weight. Use this when you want to assign more or less
-importance to a particular token, as you might for emboldened text within an
-HTML document, for example. (Note: The field this token belongs to must be
-spec'd to use a posting of type L<KinoSearch::Posting::RichPosting>.)
-
-=item *
-
-B<pos_inc> - POSition INCrement, measured in Tokens. This attribute, which
-defaults to 1, is a an advanced tool for manipulating phrase matching.
-Ordinarily, Tokens are assigned consecutive position numbers: 0, 1, and 2 for
-C<"three blind mice">. However, if you set the position increment for "blind"
-to, say, 1000, then the three tokens will end up assigned to positions 0, 1,
-and 1001 -- and will no longer produce a phrase match for the query C<"three
-blind mice">.
-
-=back
-
-=head1 METHODS
-
-=head2 new
-
- my $token = KinoSearch::Analysis::Token->new(
- text => $text, # required
- start_offset => 0, # required
- end_offset => length($text), # required
- boost => 100.0, # default 1.0
- pos_inc => 0, # default 1
- );
-
-Constructor. Takes hash-style parameters, corresponding to the token's
-attributes.
-
-=head2 Accessors
-
-Token provides these set/get methods:
-
-=over 4
-
-=item set_text
-
-=item get_text
-
-=item set_start_offset
-
-=item get_start_offset
-
-=item set_end_offset
-
-=item get_end_offset
-
-=item set_boost
-
-=item get_boost
-
-=item set_pos_inc
-
-=item get_pos_inc
-
-=back
-
=head1 COPYRIGHT

-Copyright 2006-2008 Marvin Humphrey
+Copyright 2005-2008 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

Modified: trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm 2008-08-05 02:15:41 UTC (rev 3721)
+++ trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm 2008-08-05 16:16:25 UTC (rev 3722)
@@ -39,89 +39,12 @@

=head1 NAME

-KinoSearch::Analysis::TokenBatch - A collection of tokens.
+KinoSearch::Analysis::TokenBatch - Redacted.

-=head1 SYNOPSIS
+=head1 REDACTED

- # Create a TokenBatch with a single Token.
- my $source_batch = KinoSearch::Analysis::TokenBatch->new(
- text => 'Key Lime Pie',
- );
+TokenBatch's public API has been redacted.

- # Lowercase and split text into multiple tokens, append to new batch.
- my $dest_batch = KinoSearch::Analysis::TokenBatch->new;
- while ( my $source_token = $source_batch->next ) {
- my $source_text = $source_token->get_text;
- while ( $source_text =~ /\s*?(\S+)/g ) {
- my $new_token = KinoSearch::Analysis::Token->new(
- text => lc($1),
- start_offset => $-[1],
- end_offset => $+[1],
- );
- $dest_batch->append($new_token);
- }
- }
-
- # Prints 'keylimepie'.
- while ( my $token = $dest_batch->next ) {
- print $token->get_text;
- }
-
-
-=head1 DESCRIPTION
-
-A TokenBatch is a collection of L<Tokens|KinoSearch::Analysis::Token> objects
-which you can add to, then iterate over.
-
-=head1 METHODS
-
-=head2 new
-
- my $batch = KinoSearch::Analysis::TokenBatch->new(
- text => $utf8_text,
- );
-
- # ... which is equivalent to:
- my $batch = KinoSearch::Analysis::TokenBatch->new;
- my $token = KinoSearch::Analysis::Token->new(
- text => $utf8_text,
- start_offset => 0,
- end_offset => length($utf8_text),
- );
- $batch->append($token);
-
-Constructor. Takes one optional hash-style argument.
-
-=over
-
-=item *
-
-B<text> - UTF-8 encoded text, used to prime the TokenBatch with a single
-initial L<Token|KinoSearch::Analysis::Token>.
-
-=back
-
-=head2 append
-
- $batch->append($token);
-
-Tack a Token onto the end of the batch.
-
-=head2 next
-
- while ( my $token = $batch->next ) {
- # ...
- }
-
-Return the next token in the TokenBatch, or C<undef> if out of tokens.
-
-=head2 reset
-
- $batch->reset;
-
-Reset the TokenBatch's iterator, so that the next call to next() returns the
-first Token in the batch.
-
=head1 COPYRIGHT

Copyright 2005-2008 Marvin Humphrey
@@ -131,3 +54,4 @@
See L<KinoSearch> version 0.20.

=cut
+

_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits