Mailing List Archive

my first shot at fuzzy matching module
I could not wait for Perl 6 :-) Outrageous errors and ghastly
inefficiencies possible/evitable but at least it matches pearl, pelt,
and peel for 'perl' (insert, delete, change, one character,
respectively). I have not yet done comparative research against
'agrep' to see how my solution fares (like matching identically...)

This excercise added two items to the letter to Santa Claus:

- overloading m// operator (and I would not mind overloading s///...)
- a way to find out "how many submatches did I
have in the latest regexp match" and a way to get
the submatch n via a function/array _without_ using the $n --
with these the eval I use below could be avoided by directly
length()ing the concatenation of all the submatches

After a little more testing and optimisation and a fuzzysubstitute()
I will naturally submit this to the CPAN/PAUSE.

---
package Text::Fuzzy;

require 5;

%R = (); # cache for the fuzzy match regexps
@L = (); # cache for the length expressions

sub fuzzymatch {
my ($str, $ins, $del, $chg, $mod) = @_;

return undef unless defined $str;
$ins = 1 unless defined $ins;
$del = 1 unless defined $del;
$chg = 1 unless defined $chg;
$mod = defined $mod ? "(?$mod)" : '';

my @str = split(//, $str);

$R{$str}->{"i$ins"} = join('(.*?)', @str)
unless (defined $R{$str}->{"i$ins"});

$R{$str}->{"d$del"} = join('', map { "($_?)" } @str)
unless (defined $R{$str}->{"d$del"});

$R{$str}->{"c$chg"} = join('', map { "(?:$_|(.))" } @str)
unless (defined $R{$str}->{"c$chg"});

$L[$#str] = 'length("'.join('', map { "\$$_" } 1..$#str+1).'")'
if ($L[$#str] eq '');

return 1
if (/$mod$R{$str}->{"i$ins"}/ and eval "$L[$#str] <= $ins");

return 1
if (/$mod$R{$str}->{"d$ins"}/ and eval "$L[$#str] > $#str - $del");

return 1
if (/$mod$R{$str}->{"c$ins"}/ and eval "$L[$#str] <= $chg");

0;
}

package main;

select(STDOUT); $| = 1;
while (<STDIN>) {
print if Text::FuzzyMatch::fuzzymatch(@ARGV);
}
---
++jhi;