Mailing List Archive

r3810 - in trunk: c_src/KinoSearch/Search perl/lib/KinoSearch/Search perl/t
Author: creamyg
Date: 2008-08-31 15:24:02 -0700 (Sun, 31 Aug 2008)
New Revision: 3810

Modified:
trunk/c_src/KinoSearch/Search/Similarity.bp
trunk/perl/lib/KinoSearch/Search/Similarity.pm
trunk/perl/t/504-similarity.t
Log:
Make it possible to override a bunch of Similarity methods from Perl. Improve
testing of Similarity.


Modified: trunk/c_src/KinoSearch/Search/Similarity.bp
===================================================================
--- trunk/c_src/KinoSearch/Search/Similarity.bp 2008-08-31 19:04:12 UTC (rev 3809)
+++ trunk/c_src/KinoSearch/Search/Similarity.bp 2008-08-31 22:24:02 UTC (rev 3810)
@@ -36,21 +36,20 @@
* freqs, since the more times a doc matches, the more relevant it is
* likely to be.
*/
- float
+ public float
TF(Similarity *self, float freq);

/** Calculate the Inverse Document Frequency for a Term in a given
* collection (the Searchable represents the collection).
*/
- float
+ public float
IDF(Similarity *self, Searchable *searchable, const CharBuf *field,
Obj *term);

/** Calculate a score factor based on the number of terms which match.
*/
- float
- Coord(Similarity *self, u32_t overlap,
- u32_t max_overlap);
+ public float
+ Coord(Similarity *self, u32_t overlap, u32_t max_overlap);

/** Dampen the scores of long documents.
*
@@ -73,7 +72,7 @@

/** Normalize a Query's weight so that it is comparable to other Queries.
*/
- float
+ public float
Query_Norm(Similarity *self, float sum_of_squared_weights);

/** encode_norm and decode_norm encode and decode between 32-bit IEEE

Modified: trunk/perl/lib/KinoSearch/Search/Similarity.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Search/Similarity.pm 2008-08-31 19:04:12 UTC (rev 3809)
+++ trunk/perl/lib/KinoSearch/Search/Similarity.pm 2008-08-31 22:24:02 UTC (rev 3810)
@@ -32,7 +32,8 @@
Encode_Norm
Decode_Norm
Query_Norm
- Length_Norm )
+ Length_Norm
+ Coord )
],
make_constructors => ["new"],
make_pod => {
@@ -54,15 +55,6 @@
RETVAL = newSVpvn( (char*)self->norm_decoder, (256 * sizeof(float)) );
OUTPUT: RETVAL

-float
-coord(self, overlap, max_overlap)
- kino_Similarity *self;
- chy_u32_t overlap;
- chy_u32_t max_overlap;
-CODE:
- RETVAL = Kino_Sim_Coord(self, overlap, max_overlap);
-OUTPUT: RETVAL
-
__COPYRIGHT__

Copyright 2005-2008 Marvin Humphrey

Modified: trunk/perl/t/504-similarity.t
===================================================================
--- trunk/perl/t/504-similarity.t 2008-08-31 19:04:12 UTC (rev 3809)
+++ trunk/perl/t/504-similarity.t 2008-08-31 22:24:02 UTC (rev 3810)
@@ -1,6 +1,32 @@
use strict;
use warnings;

+package MockSearchable;
+use base qw( KinoSearch::Search::Searchable );
+
+our %doc_freqs;
+
+sub new {
+ my ( $class, %args ) = @_;
+ my $doc_freqs = delete $args{doc_freqs};
+ my $self = $class->SUPER::new(%args);
+ $doc_freqs{$$self} = $doc_freqs;
+ return $self;
+}
+
+sub DESTROY {
+ my $self = shift;
+ delete $doc_freqs{$$self};
+ $self->SUPER::DESTROY;
+}
+
+sub doc_freq {
+ my ( $self, %args ) = @_;
+ return $doc_freqs{$$self}{ $args{term}->to_perl }
+}
+
+sub max_docs { 100 }
+
package MySchema::LongField;
use base qw( KinoSearch::FieldSpec::TextField );
use KSx::Search::LongFieldSim;
@@ -18,7 +44,7 @@
sub analyzer { KinoSearch::Analysis::Tokenizer->new }

package main;
-use Test::More tests => 5;
+use Test::More tests => 8;

use KinoSearch::Search::Similarity;
use KinoSearch::Store::RAMFolder;
@@ -30,6 +56,32 @@

my $sim = KinoSearch::Search::Similarity->new;

+cmp_ok( $sim->tf(10) - $sim->tf(9), '<', 1, "TF is damped" );
+
+my $mock_searchable = MockSearchable->new(
+ schema => MySchema->new,
+ doc_freqs => {
+ foo => 3,
+ bar => 200,
+ },
+);
+my $foo_idf = $sim->idf(
+ searchable => $mock_searchable,
+ field => 'title',
+ term => 'foo'
+);
+my $bar_idf = $sim->idf(
+ searchable => $mock_searchable,
+ field => 'title',
+ term => 'bar'
+);
+cmp_ok( $foo_idf, '>', $bar_idf, 'Rarer terms have higher IDF' );
+
+my $less_coordinated = $sim->coord( overlap => 2, max_overlap => 5 );
+my $more_coordinated = $sim->coord( overlap => 3, max_overlap => 5 );
+cmp_ok( $less_coordinated, '<', $more_coordinated,
+ "greater overlap means bigger coord bonus" );
+
my @bytes = ( 100, 110, 120, 130, 140 );
my @floats = ( 0.015625, 0.09375, 0.5, 3.0, 16.0 );
my @transformed = map { $sim->decode_norm($_) } @bytes;


_______________________________________________
kinosearch-commits mailing list
kinosearch-commits@rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch-commits