Mailing List Archive

r3652 - in trunk/perl/lib: . KinoSearch/Analysis
Author: creamyg
Date: 2008-07-27 19:27:20 -0700 (Sun, 27 Jul 2008)
New Revision: 3652

Modified:
trunk/perl/lib/KinoSearch.pm
trunk/perl/lib/KinoSearch/Analysis/Analyzer.pm
trunk/perl/lib/KinoSearch/Analysis/LCNormalizer.pm
trunk/perl/lib/KinoSearch/Analysis/PolyAnalyzer.pm
trunk/perl/lib/KinoSearch/Analysis/Stemmer.pm
trunk/perl/lib/KinoSearch/Analysis/Stopalizer.pm
trunk/perl/lib/KinoSearch/Analysis/Token.pm
trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm
trunk/perl/lib/KinoSearch/Analysis/Tokenizer.pm
Log:
Migrate all Perl code from modules within KinoSearch::Analysis to
KinoSearch.pm.


Modified: trunk/perl/lib/KinoSearch/Analysis/Analyzer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Analyzer.pm 2008-07-28 02:00:28 UTC (rev 3651)
+++ trunk/perl/lib/KinoSearch/Analysis/Analyzer.pm 2008-07-28 02:27:20 UTC (rev 3652)
@@ -1,26 +1,5 @@
-use strict;
-use warnings;
+use KinoSearch;

-package KinoSearch::Analysis::Analyzer;
-use KinoSearch::base qw( KinoSearch::Obj );
-
-use KinoSearch::Doc;
-use KinoSearch::Analysis::TokenBatch;
-use KinoSearch::Util::StringHelper qw( utf8_flag_on );
-
-sub transform_field {
- my ( $self, $doc, $field_name ) = @_;
- my $batch = KinoSearch::Analysis::TokenBatch->new(
- text => $doc->{$field_name} );
- return $self->transform($batch);
-}
-
-sub split {
- my $retval = _split(@_)->to_perl;
- utf8_flag_on($_) for @$retval;
- return $retval;
-}
-
1;

__END__

Modified: trunk/perl/lib/KinoSearch/Analysis/LCNormalizer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/LCNormalizer.pm 2008-07-28 02:00:28 UTC (rev 3651)
+++ trunk/perl/lib/KinoSearch/Analysis/LCNormalizer.pm 2008-07-28 02:27:20 UTC (rev 3652)
@@ -1,17 +1,5 @@
-use strict;
-use warnings;
+use KinoSearch;

-package KinoSearch::Analysis::LCNormalizer;
-use KinoSearch::base qw( KinoSearch::Analysis::Analyzer );
-
-use KinoSearch::Analysis::Token;
-use KinoSearch::Analysis::TokenBatch;
-
-sub transform_field {
- return KinoSearch::Analysis::TokenBatch->new(
- text => lc( $_[1]->{ $_[2] } ) );
-}
-
1;

__END__

Modified: trunk/perl/lib/KinoSearch/Analysis/PolyAnalyzer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/PolyAnalyzer.pm 2008-07-28 02:00:28 UTC (rev 3651)
+++ trunk/perl/lib/KinoSearch/Analysis/PolyAnalyzer.pm 2008-07-28 02:27:20 UTC (rev 3652)
@@ -1,51 +1,5 @@
-use strict;
-use warnings;
+use KinoSearch;

-package KinoSearch::Analysis::PolyAnalyzer;
-use KinoSearch::Util::ToolSet qw( to_kino );
-use KinoSearch::base qw( KinoSearch::Analysis::Analyzer );
-
-our %instance_vars = __PACKAGE__->init_instance_vars(
- cached_analyzers => \our %cached_analyzers, );
-
-use KinoSearch::Analysis::LCNormalizer;
-use KinoSearch::Analysis::Tokenizer;
-use KinoSearch::Analysis::Stemmer;
-
-sub new {
- my ( $either, %args ) = @_;
- if ( defined $args{analyzers} ) {
- $args{analyzers} = to_kino( $args{analyzers} );
- }
- else {
- require Lingua::Stem::Snowball;
- require Lingua::StopWords;
- }
-
- my $self = $either->_new(%args);
- # Cache analyzers as Perl array as a lame-o optimization until
- # transform_field gets ported to C.
- $cached_analyzers{$$self} = $self->get_analyzers->to_perl;
- return $self;
-}
-
-sub transform_field {
- my $analyzers = $cached_analyzers{ ${ $_[0] } };
-
- if ( !@$analyzers ) {
- return KinoSearch::Analysis::TokenBatch->new(
- text => $_[1]->{ $_[2] } );
- }
- elsif ( @$analyzers == 1 ) {
- return $analyzers->[0]->transform_field( $_[1], $_[2] );
- }
- else {
- my $batch = $analyzers->[0]->transform_field( $_[1], $_[2] );
- $batch = $_->transform($batch) for @{$analyzers}[ 1 .. $#$analyzers ];
- return $batch;
- }
-}
-
1;

__END__

Modified: trunk/perl/lib/KinoSearch/Analysis/Stemmer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Stemmer.pm 2008-07-28 02:00:28 UTC (rev 3651)
+++ trunk/perl/lib/KinoSearch/Analysis/Stemmer.pm 2008-07-28 02:27:20 UTC (rev 3652)
@@ -1,15 +1,5 @@
-use strict;
-use warnings;
+use KinoSearch;

-package KinoSearch::Analysis::Stemmer;
-use KinoSearch::base qw( KinoSearch::Analysis::Analyzer );
-
-sub new {
- my $either = shift;
- require Lingua::Stem::Snowball;
- return $either->_new(@_);
-}
-
1;

__END__

Modified: trunk/perl/lib/KinoSearch/Analysis/Stopalizer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Stopalizer.pm 2008-07-28 02:00:28 UTC (rev 3651)
+++ trunk/perl/lib/KinoSearch/Analysis/Stopalizer.pm 2008-07-28 02:27:20 UTC (rev 3652)
@@ -1,31 +1,5 @@
-use strict;
-use warnings;
+use KinoSearch;

-package KinoSearch::Analysis::Stopalizer;
-use KinoSearch::Util::ToolSet qw( confess reftype to_kino );
-use KinoSearch::base qw( KinoSearch::Analysis::Analyzer );
-
-sub new {
- my ( $either, %args ) = @_;
- require Lingua::StopWords;
- if ( defined $args{stoplist} ) {
- confess("stoplist must be a hashref")
- unless reftype( $args{stoplist} ) eq 'HASH';
- $args{stoplist} = to_kino( $args{stoplist} );
- }
- return $either->_new(%args);
-}
-
-sub gen_stoplist {
- my ( undef, $language ) = @_;
- $language = lc($language);
- if ( $language =~ /^(?:da|de|en|es|fi|fr|it|nl|no|pt|ru|sv)$/ ) {
- my $stoplist = Lingua::StopWords::getStopWords( $language, 'UTF-8' );
- return to_kino($stoplist);
- }
- return undef;
-}
-
1;

__END__

Modified: trunk/perl/lib/KinoSearch/Analysis/Token.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Token.pm 2008-07-28 02:00:28 UTC (rev 3651)
+++ trunk/perl/lib/KinoSearch/Analysis/Token.pm 2008-07-28 02:27:20 UTC (rev 3652)
@@ -1,17 +1,5 @@
-use strict;
-use warnings;
+use KinoSearch;

-package KinoSearch::Analysis::Token;
-use KinoSearch::base qw( KinoSearch::Obj::FastObj );
-
-our %new_PARAMS = (
- text => undef,
- start_offset => undef,
- end_offset => undef,
- pos_inc => 1,
- boost => 1.0,
-);
-
1;

__END__

Modified: trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm 2008-07-28 02:00:28 UTC (rev 3651)
+++ trunk/perl/lib/KinoSearch/Analysis/TokenBatch.pm 2008-07-28 02:27:20 UTC (rev 3652)
@@ -1,16 +1,5 @@
-use strict;
-use warnings;
+use KinoSearch;

-package KinoSearch::Analysis::TokenBatch;
-use KinoSearch::base qw( KinoSearch::Util::VArray );
-
-our %instance_vars = (
- # params
- text => undef
-);
-
-use KinoSearch::Analysis::Token;
-
1;

__END__
@@ -27,7 +16,7 @@
/* parse params, only if there's more than one arg */
if (items > 1) {
HV *const args_hash = build_args_hash( &(ST(0)), 1, items,
- "KinoSearch::Analysis::TokenBatch::instance_vars");
+ "KinoSearch::Analysis::TokenBatch::new_PARAMS");
SV *text_sv = extract_sv(args_hash, SNL("text"));
STRLEN len;
char *text = SvPVutf8(text_sv, len);

Modified: trunk/perl/lib/KinoSearch/Analysis/Tokenizer.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Analysis/Tokenizer.pm 2008-07-28 02:00:28 UTC (rev 3651)
+++ trunk/perl/lib/KinoSearch/Analysis/Tokenizer.pm 2008-07-28 02:27:20 UTC (rev 3652)
@@ -1,33 +1,5 @@
-use strict;
-use warnings;
+use KinoSearch;

-package KinoSearch::Analysis::Tokenizer;
-use KinoSearch::base qw( KinoSearch::Analysis::Analyzer );
-
-our %instance_vars = __PACKAGE__->init_instance_vars(
- # constructor params / members
- token_re => \our %token_re,
-);
-
-use KinoSearch::Analysis::Token;
-use KinoSearch::Analysis::TokenBatch;
-
-sub new {
- my ( $either, %args ) = @_;
- my $self = $either->_new;
- if ( $args{token_re} ) {
- # Overwrite default, which has already been initialized via callback.
- _cache_token_re( $self, $args{token_re} );
- }
- return $self;
-}
-
-sub _cache_token_re {
- my ( $self, $token_re ) = @_;
- $token_re{$$self} = $token_re || qr/\w+(?:'\w+)*/;
- _do_cache_token_re( $self, $token_re{$$self} );
-}
-
1;

__END__

Modified: trunk/perl/lib/KinoSearch.pm
===================================================================
--- trunk/perl/lib/KinoSearch.pm 2008-07-28 02:00:28 UTC (rev 3651)
+++ trunk/perl/lib/KinoSearch.pm 2008-07-28 02:27:20 UTC (rev 3652)
@@ -26,6 +26,170 @@
our @EXPORT_OK = qw( kdump );

{
+ package KinoSearch::Analysis::Analyzer;
+ use KinoSearch::base qw( KinoSearch::Obj );
+ use KinoSearch::Util::StringHelper qw( utf8_flag_on );
+
+ sub transform_field {
+ my ( $self, $doc, $field_name ) = @_;
+ my $batch = KinoSearch::Analysis::TokenBatch->new(
+ text => $doc->{$field_name} );
+ return $self->transform($batch);
+ }
+
+ sub split {
+ my $retval = _split(@_)->to_perl;
+ utf8_flag_on($_) for @$retval;
+ return $retval;
+ }
+}
+
+{
+ package KinoSearch::Analysis::LCNormalizer;
+ use KinoSearch::base qw( KinoSearch::Analysis::Analyzer );
+
+ sub transform_field {
+ return KinoSearch::Analysis::TokenBatch->new(
+ text => lc( $_[1]->{ $_[2] } ) );
+ }
+}
+
+{
+ package KinoSearch::Analysis::PolyAnalyzer;
+ use KinoSearch::Util::ToolSet qw( to_kino );
+ use KinoSearch::base qw( KinoSearch::Analysis::Analyzer );
+
+ our %instance_vars = __PACKAGE__->init_instance_vars(
+ cached_analyzers => \our %cached_analyzers, );
+
+ sub new {
+ my ( $either, %args ) = @_;
+ if ( defined $args{analyzers} ) {
+ $args{analyzers} = to_kino( $args{analyzers} );
+ }
+ else {
+ require Lingua::Stem::Snowball;
+ require Lingua::StopWords;
+ }
+
+ my $self = $either->_new(%args);
+ # Cache analyzers as Perl array as a lame-o optimization until
+ # transform_field gets ported to C.
+ $cached_analyzers{$$self} = $self->get_analyzers->to_perl;
+ return $self;
+ }
+
+ sub transform_field {
+ my $analyzers = $cached_analyzers{ ${ $_[0] } };
+
+ if ( !@$analyzers ) {
+ return KinoSearch::Analysis::TokenBatch->new(
+ text => $_[1]->{ $_[2] } );
+ }
+ elsif ( @$analyzers == 1 ) {
+ return $analyzers->[0]->transform_field( $_[1], $_[2] );
+ }
+ else {
+ my $batch = $analyzers->[0]->transform_field( $_[1], $_[2] );
+ $batch = $_->transform($batch)
+ for @{$analyzers}[ 1 .. $#$analyzers ];
+ return $batch;
+ }
+ }
+}
+
+{
+ package KinoSearch::Analysis::Stemmer;
+ use KinoSearch::base qw( KinoSearch::Analysis::Analyzer );
+
+ sub new {
+ my $either = shift;
+ require Lingua::Stem::Snowball;
+ return $either->_new(@_);
+ }
+}
+
+{
+ package KinoSearch::Analysis::Stopalizer;
+ use KinoSearch::Util::ToolSet qw( confess reftype to_kino );
+ use KinoSearch::base qw( KinoSearch::Analysis::Analyzer );
+
+ sub new {
+ my ( $either, %args ) = @_;
+ require Lingua::StopWords;
+ if ( defined $args{stoplist} ) {
+ confess("stoplist must be a hashref")
+ unless reftype( $args{stoplist} ) eq 'HASH';
+ $args{stoplist} = to_kino( $args{stoplist} );
+ }
+ return $either->_new(%args);
+ }
+
+ sub gen_stoplist {
+ my ( undef, $language ) = @_;
+ $language = lc($language);
+ if ( $language =~ /^(?:da|de|en|es|fi|fr|it|nl|no|pt|ru|sv)$/ ) {
+ my $stoplist
+ = Lingua::StopWords::getStopWords( $language, 'UTF-8' );
+ return to_kino($stoplist);
+ }
+ return undef;
+ }
+}
+
+{
+ package KinoSearch::Analysis::Token;
+ use KinoSearch::base qw( KinoSearch::Obj::FastObj );
+
+ our %new_PARAMS = (
+ text => undef,
+ start_offset => undef,
+ end_offset => undef,
+ pos_inc => 1,
+ boost => 1.0,
+ );
+}
+
+{
+ package KinoSearch::Analysis::TokenBatch;
+ use KinoSearch::base qw( KinoSearch::Util::VArray );
+
+ our %new_PARAMS = (
+ # params
+ text => undef
+ );
+}
+
+{
+ package KinoSearch::Analysis::Tokenizer;
+ use KinoSearch::Util::ToolSet qw( confess verify_args kerror );
+ use KinoSearch::base qw( KinoSearch::Analysis::Analyzer );
+
+ our %instance_vars = __PACKAGE__->init_instance_vars(
+ # params/members
+ token_re => \our %token_re,
+ );
+
+ sub new {
+ my $either = shift;
+ confess kerror() unless verify_args( \%instance_vars, @_ );
+ my %args = @_;
+ my $self = $either->_new;
+ if ( $args{token_re} ) {
+ # Overwrite default, which has already been initialized via callback.
+ _cache_token_re( $self, $args{token_re} );
+ }
+ return $self;
+ }
+
+ sub _cache_token_re {
+ my ( $self, $token_re ) = @_;
+ $token_re{$$self} = $token_re || qr/\w+(?:'\w+)*/;
+ _do_cache_token_re( $self, $token_re{$$self} );
+ }
+}
+
+{
package KinoSearch::Doc;
use KinoSearch::Util::ToolSet qw( nfreeze thaw );
use KinoSearch::base qw( KinoSearch::Obj );


_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits