Mailing List Archive: r3725 - in trunk: c_src/KinoSearch/Analysis perl/lib/KinoSearch/Analysis

Author: creamyg
Date: 2008-08-05 16:36:17 -0700 (Tue, 05 Aug 2008)
New Revision: 3725

Modified:
trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp
trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp
trunk/c_src/KinoSearch/Analysis/Stemmer.bp
trunk/c_src/KinoSearch/Analysis/Stopalizer.bp
trunk/perl/lib/KinoSearch/Analysis/LCNormalizer.pm
trunk/perl/lib/KinoSearch/Analysis/PolyAnalyzer.pm
trunk/perl/lib/KinoSearch/Analysis/Stemmer.pm
trunk/perl/lib/KinoSearch/Analysis/Stopalizer.pm
Log:
Port docs for LCNormalizer, Stemmer, Stopalizer, and PolyAnalyzer to C.

Modified: trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp 2008-08-05 23:35:47 UTC (rev 3724)
+++ trunk/c_src/KinoSearch/Analysis/LCNormalizer.bp 2008-08-05 23:36:17 UTC (rev 3725)
@@ -1,5 +1,11 @@
parcel KinoSearch cnick Kino;

+/** Convert input to lower case.
+ *
+ * LCNormalizer is a subclass of Analyzer which converts all text to lower
+ * case, which is useful for case-insensitive searching.
+ */
+
class KinoSearch::Analysis::LCNormalizer
extends KinoSearch::Analysis::Analyzer {

@@ -8,6 +14,8 @@
static incremented LCNormalizer*
new();

+ /** Constructor. Takes no arguments.
+ */
static LCNormalizer*
init(LCNormalizer *self);

Modified: trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp 2008-08-05 23:35:47 UTC (rev 3724)
+++ trunk/c_src/KinoSearch/Analysis/PolyAnalyzer.bp 2008-08-05 23:36:17 UTC (rev 3725)
@@ -1,5 +1,30 @@
parcel KinoSearch cnick Kino;

+/** Multiple analyzers in series.
+ *
+ * A PolyAnalyzer is a series of L<Analyzers|KinoSearch::Analysis::Analyzer>,
+ * each of which will be called upon to "analyze" text in turn. You can
+ * either provide the Analyzers yourself, or you can specify a supported
+ * language, in which case a PolyAnalyzer consisting of an
+ * L<LCNormalizer|KinoSearch::Analysis::LCNormalizer>, a
+ * L<Tokenizer|KinoSearch::Analysis::Tokenizer>, and a
+ * L<Stemmer|KinoSearch::Analysis::Stemmer> will be generated for you.
+ *
+ * Supported languages:
+ *
+ * en => English,
+ * da => Danish,
+ * de => German,
+ * es => Spanish,
+ * fi => Finnish,
+ * fr => French,
+ * it => Italian,
+ * nl => Dutch,
+ * no => Norwegian,
+ * pt => Portuguese,
+ * ru => Russian,
+ * sv => Swedish,
+ */
class KinoSearch::Analysis::PolyAnalyzer
extends KinoSearch::Analysis::Analyzer {

@@ -8,6 +33,15 @@
static incremented PolyAnalyzer*
new(const CharBuf *language = NULL, VArray *analyzers = NULL);

+ /**
+ * @param language An ISO code from the list of supported languages.
+ * @param analyzers An array of Analyzers. The order of the analyzers
+ * matters. Don't put a Stemmer before a Tokenizer (can't stem whole
+ * documents or paragraphs -- just individual words), or a Stopalizer
+ * after a Stemmer (stemmed words, e.g. "themselv", will not appear in a
+ * stoplist). In general, the sequence should be: normalize, tokenize,
+ * stopalize, stem.
+ */
static PolyAnalyzer*
init(PolyAnalyzer *self, const CharBuf *language = NULL,
VArray *analyzers = NULL);

Modified: trunk/c_src/KinoSearch/Analysis/Stemmer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Stemmer.bp 2008-08-05 23:35:47 UTC (rev 3724)
+++ trunk/c_src/KinoSearch/Analysis/Stemmer.bp 2008-08-05 23:36:17 UTC (rev 3725)
@@ -1,5 +1,14 @@
parcel KinoSearch cnick Kino;

+/** Reduce related words to a shared root.
+ *
+ * Stemmer is an L<Analyzer|KinoSearch::Analysis::Analyzer> which reduces
+ * related words to a root form (using the "Snowball" stemming library). For
+ * instance, "horse", "horses", and "horsing" all become "hors" -- so that a
+ * search for 'horse' will also match documents containing 'horses' and
+ * 'horsing'.
+ */
+
class KinoSearch::Analysis::Stemmer extends KinoSearch::Analysis::Analyzer {

void *snowstemmer;
@@ -7,6 +16,10 @@
static incremented Stemmer*
new(const CharBuf *language);

+ /**
+ * @param language A two-letter ISO code identifying a language supported
+ * by Snowball.
+ */
static Stemmer*
init(Stemmer *self, const CharBuf *language);

Modified: trunk/c_src/KinoSearch/Analysis/Stopalizer.bp
===================================================================
--- trunk/c_src/KinoSearch/Analysis/Stopalizer.bp 2008-08-05 23:35:47 UTC (rev 3724)
+++ trunk/c_src/KinoSearch/Analysis/Stopalizer.bp 2008-08-05 23:36:17 UTC (rev 3725)
@@ -1,5 +1,21 @@
parcel KinoSearch cnick Kino;

+/** Suppress a "stoplist" of common words.
+ *
+ * A "stoplist" is a collection of "stopwords": words which are common enough to be
+ * of little value when determining search results. For example, so many
+ * documents in English contain "the", "if", and "maybe" that it may improve both
+ * performance and relevance to block them.
+ *
+ * Before filtering stopwords:
+ *
+ * ( "i", "am", "the", "walrus" )
+ *
+ * After filtering stopwords:
+ *
+ * ( "walrus" )
+ */
+
class KinoSearch::Analysis::Stopalizer extends KinoSearch::Analysis::Analyzer {

Hash *stoplist;
@@ -7,6 +23,10 @@
static incremented Stopalizer*
new(const CharBuf *language = NULL, Hash *stoplist = NULL);

+ /**
+ * @param stoplist A hash with stopwords as the keys.
+ * @param language The ISO code for a supported language.
+ */
static Stopalizer*
init(Stopalizer *self, const CharBuf *language = NULL,
Hash *stoplist = NULL);

Modified: trunk/perl/lib/KinoSearch/Analysis/LCNormalizer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/LCNormalizer.pm 2008-08-05 23:35:47 UTC (rev 3724)
+++ trunk/perl/lib/KinoSearch/Analysis/LCNormalizer.pm 2008-08-05 23:36:17 UTC (rev 3725)
@@ -6,42 +6,31 @@

__AUTO_XS__

-{ "KinoSearch::Analysis::LCNormalizer" => {
- make_constructors => ["new"],
- },
-}
-
-__POD__
-
-=head1 NAME
-
-KinoSearch::Analysis::LCNormalizer - Convert input to lower case.
-
-=head1 SYNOPSIS
-
+my $synopsis = <<'END_SYNOPSIS';
my $lc_normalizer = KinoSearch::Analysis::LCNormalizer->new;

my $polyanalyzer = KinoSearch::Analysis::PolyAnalyzer->new(
analyzers => [ $lc_normalizer, $tokenizer, $stemmer ],
);
+END_SYNOPSIS

-=head1 DESCRIPTION
+my $constructor = <<'END_CONSTRUCTOR';
+ my $lc_normalizer = KinoSearch::Analysis::LCNormalizer->new;
+END_CONSTRUCTOR

-This class basically says C<lc($foo)> in a longwinded way which
-KinoSearch's Analysis apparatus can understand.
+{ "KinoSearch::Analysis::LCNormalizer" => {
+ make_constructors => ["new"],
+ make_pod => {
+ synopsis => $synopsis,
+ constructor => { sample => $constructor },
+ }
+ },
+}

-=head1 CONSTRUCTOR
+__COPYRIGHT__

-=head2 new
-
-Construct a new LCNormalizer. Takes no arguments.
-
-=head1 COPYRIGHT
-
Copyright 2005-2008 Marvin Humphrey

-=head1 LICENSE, DISCLAIMER, BUGS, etc.
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.

-See L<KinoSearch> version 0.20.
-
-=cut

Modified: trunk/perl/lib/KinoSearch/Analysis/PolyAnalyzer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/PolyAnalyzer.pm 2008-08-05 23:35:47 UTC (rev 3724)
+++ trunk/perl/lib/KinoSearch/Analysis/PolyAnalyzer.pm 2008-08-05 23:36:17 UTC (rev 3725)
@@ -6,26 +6,22 @@

__AUTO_XS__

-{ "KinoSearch::Analysis::PolyAnalyzer" => {
- make_constructors => ["new"],
- make_getters => [qw( analyzers )],
- },
-}
+my $synopsis = <<'END_SYNOPSIS';
+ package MySchema;
+ use base qw( KinoSearch::Schema );
+
+ sub analyzer {
+ KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
+ }
+END_SYNOPSIS

-__POD__
-
-=head1 NAME
-
-KinoSearch::Analysis::PolyAnalyzer - Multiple analyzers in series.
-
-=head1 SYNOPSIS
-
+my $constructor = <<'END_CONSTRUCTOR';
my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new(
language => 'es',
);

# or...
-
+
my $lc_normalizer = KinoSearch::Analysis::LCNormalizer->new;
my $tokenizer = KinoSearch::Analysis::Tokenizer->new;
my $stemmer = KinoSearch::Analysis::Stemmer->new( language => 'en' );
@@ -36,64 +32,21 @@
$stemmer,
],
);
+END_CONSTRUCTOR

-=head1 DESCRIPTION
+{ "KinoSearch::Analysis::PolyAnalyzer" => {
+ make_constructors => ["new"],
+ make_getters => [qw( analyzers )],
+ make_pod => {
+ synopsis => $synopsis,
+ constructor => { sample => $constructor },
+ },
+ },
+}

-A PolyAnalyzer is a series of Analyzers -- objects which inherit from
-L<KinoSearch::Analysis::Analyzer> -- each of which will be called upon to
-"analyze" text in turn. You can either provide the Analyzers yourself, or you
-can specify a supported language, in which case a PolyAnalyzer consisting of
-an L<LCNormalizer|KinoSearch::Analysis::LCNormalizer>, a
-L<Tokenizer|KinoSearch::Analysis::Tokenizer>, and a
-L<Stemmer|KinoSearch::Analysis::Stemmer> will be generated for you.
+__COPYRIGHT__

-Supported languages:
-
- en => English,
- da => Danish,
- de => German,
- es => Spanish,
- fi => Finnish,
- fr => French,
- it => Italian,
- nl => Dutch,
- no => Norwegian,
- pt => Portuguese,
- ru => Russian,
- sv => Swedish,
-
-
-=head1 METHODS
-
-=head2 new()
-
-Constructor. Takes two possible hash-style parameters. If the parameter
-C<analyzers> is specified, it will override C<language> and no attempt will be
-made to generate a default set of Analyzers.
-
-=over
-
-=item
-
-B<language> - Must be an ISO code from the list of supported languages.
-
-=item
-
-B<analyzers> - Must be an arrayref. Each element in the array must inherit
-from KinoSearch::Analysis::Analyzer. The order of the analyzers matters.
-Don't put a Stemmer before a Tokenizer (can't stem whole documents or
-paragraphs -- just individual words), or a Stopalizer after a Stemmer (stemmed
-words, e.g. "themselv", will not appear in a stoplist). In general, the
-sequence should be: normalize, tokenize, stopalize, stem.
-
-=back
-
-=head1 COPYRIGHT
-
Copyright 2005-2008 Marvin Humphrey

-=head1 LICENSE, DISCLAIMER, BUGS, etc.
-
-See L<KinoSearch> version 0.20.
-
-=cut
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.

Modified: trunk/perl/lib/KinoSearch/Analysis/Stemmer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Stemmer.pm 2008-08-05 23:35:47 UTC (rev 3724)
+++ trunk/perl/lib/KinoSearch/Analysis/Stemmer.pm 2008-08-05 23:36:17 UTC (rev 3725)
@@ -6,47 +6,34 @@

__AUTO_XS__

-{ "KinoSearch::Analysis::Stemmer" => {
- make_constructors => ["new"],
- },
-}
-
-__POD__
-
-=head1 NAME
-
-KinoSearch::Analysis::Stemmer - Reduce related words to a shared root.
-
-=head1 SYNOPSIS
-
+my $synopsis = <<'END_SYNOPSIS';
my $stemmer = KinoSearch::Analysis::Stemmer->new( language => 'es' );

my $polyanalyzer = KinoSearch::Analysis::PolyAnalyzer->new(
analyzers => [ $lc_normalizer, $tokenizer, $stemmer ],
);

-=head1 DESCRIPTION
-
-Stemming reduces words to a root form. For instance, "horse", "horses",
-and "horsing" all become "hors" -- so that a search for 'horse' will also
-match documents containing 'horses' and 'horsing'.
-
This class is a wrapper around L<Lingua::Stem::Snowball>, so it supports the
same languages.
+END_SYNOPSIS

-=head1 METHODS
+my $constructor = <<'END_CONSTRUCTOR';
+ my $stemmer = KinoSearch::Analysis::Stemmer->new( language => 'es' );
+END_CONSTRUCTOR

-=head2 new
+{ "KinoSearch::Analysis::Stemmer" => {
+ make_constructors => ["new"],
+ make_pod => {
+ synopsis => $synopsis,
+ constructor => { sample => $constructor }
+ },
+ },
+}

-Create a new stemmer. Takes a single named parameter, C<language>, which must
-be a two-letter ISO code that Lingua::Stem::Snowball understands.
+__COPYRIGHT__

-=head1 COPYRIGHT
-
Copyright 2005-2008 Marvin Humphrey

-=head1 LICENSE, DISCLAIMER, BUGS, etc.
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.

-See L<KinoSearch> version 0.20.
-
-=cut

Modified: trunk/perl/lib/KinoSearch/Analysis/Stopalizer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Stopalizer.pm 2008-08-05 23:35:47 UTC (rev 3724)
+++ trunk/perl/lib/KinoSearch/Analysis/Stopalizer.pm 2008-08-05 23:36:17 UTC (rev 3725)
@@ -6,19 +6,7 @@

__AUTO_XS__

-{ "KinoSearch::Analysis::Stopalizer" => {
- make_constructors => ["new"],
- },
-}
-
-__POD__
-
-=head1 NAME
-
-KinoSearch::Analysis::Stopalizer - Suppress a "stoplist" of common words.
-
-=head1 SYNOPSIS
-
+my $synopsis = <<'END_SYNOPSIS';
my $stopalizer = KinoSearch::Analysis::Stopalizer->new(
language => 'fr',
);
@@ -26,23 +14,11 @@
analyzers => [ $lc_normalizer, $tokenizer, $stopalizer, $stemmer ],
);

-=head1 DESCRIPTION
+This class uses Lingua::StopWords for its default stoplists, so it supports
+the same set of languages.
+END_SYNOPSIS

-A "stoplist" is collection of "stopwords": words which are common enough to be
-of little value when determining search results. For example, so many
-documents in English contain "the", "if", and "maybe" that it may improve both
-performance and relevance to block them.
-
- # before
- @token_texts = ('i', 'am', 'the', 'walrus');
-
- # after
- @token_texts = ('walrus');
-
-=head1 CONSTRUCTOR
-
-=head2 new
-
+my $constructor = <<'END_CONSTRUCTOR';
my $stopalizer = KinoSearch::Analysis::Stopalizer->new(
language => 'de',
);
@@ -51,36 +27,21 @@
my $stopalizer = KinoSearch::Analysis::Stopalizer->new(
stoplist => \%stoplist,
);
+END_CONSTRUCTOR

+{ "KinoSearch::Analysis::Stopalizer" => {
+ make_constructors => ["new"],
+ make_pod => {
+ synopsis => $synopsis,
+ constructor => { sample => $constructor }
+ },
+ },
+}

-new() takes two possible parameters, C<language> and C<stoplist>. If
-C<stoplist> is supplied, it will be used, overriding the behavior indicated by
-the value of C<language>.
+__COPYRIGHT__

-=over
-
-=item
-
-B<stoplist> - must be a hashref, with stopwords as the keys of the hash and
-values set to 1.
-
-=item
-
-B<language> - must be the ISO code for a language. Loads a default stoplist
-supplied by L<Lingua::StopWords>.
-
-=back
-
-=head1 SEE ALSO
-
-L<Lingua::StopWords>
-
-=head1 COPYRIGHT
-
Copyright 2005-2008 Marvin Humphrey

-=head1 LICENSE, DISCLAIMER, BUGS, etc.
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.

-See L<KinoSearch> version 0.20.
-
-=cut

_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits