
r3790 - in trunk/perl: lib/KinoSearch/Docs/Tutorial sample
Author: creamyg
Date: 2008-08-29 11:18:13 -0700 (Fri, 29 Aug 2008)
New Revision: 3790

Modified:
trunk/perl/lib/KinoSearch/Docs/Tutorial/Simple.pod
trunk/perl/sample/invindexer.pl
Log:
Revise the first Tutorial chapter, using HTML::TreeBuilder to parse files and
freeing us from the need to explain why tag-stripping with regexes is a bad
idea.


Modified: trunk/perl/lib/KinoSearch/Docs/Tutorial/Simple.pod
===================================================================
--- trunk/perl/lib/KinoSearch/Docs/Tutorial/Simple.pod 2008-08-29 18:16:40 UTC (rev 3789)
+++ trunk/perl/lib/KinoSearch/Docs/Tutorial/Simple.pod 2008-08-29 18:18:13 UTC (rev 3790)
@@ -41,6 +41,7 @@

use KSx::Simple;
use File::Spec::Functions qw( catfile );
+ use HTML::TreeBuilder;

... we'll start by creating a KSx::Simple object, telling it where we'd
like the index to be located and the language of the source material.
@@ -50,66 +51,49 @@
language => 'en',
);
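
For context, the hunk above shows only the tail of that constructor call. In full it looks something like this sketch, where the path value is purely illustrative (the sample keeps the real location in its configuration) and the path parameter name is assumed from the KSx::Simple API:

    my $simple = KSx::Simple->new(
        path     => '/path/to/index',   # illustrative; not the tutorial's actual location
        language => 'en',
    );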

-Next, we'll add a subroutine which reads in and extracts plain text from an
-HTML source file. KSx::Simple won't be of any help with this task,
-because it's not equipped to deal with source files directly -- as a matter of
+Next, we'll add a subroutine which extracts plain text from an HTML source
+file.
+
+KSx::Simple won't be of any help with the task of text extraction, because
+it's not equipped to deal with source files directly. As a matter of
principle, KinoSearch remains deliberately ignorant on the vast subject of
file formats, preferring to focus instead on its core competencies of indexing
-and search.
+and search. There are many excellent dedicated parsing modules available on
+CPAN; we'll use HTML::TreeBuilder.

-There are many excellent dedicated parsing modules available on CPAN, and
-ordinarily we'd be calling on HTML::Parser or the like... however, today we're
-going to use quick-and-dirty regular expressions for the sake of simplicity.
-Parsing HTML using regexes is generally an awful idea, but we can guarantee
-that the following fragile-but-easy-to-grok parsing sub will work because the
-source docs are 100% controlled by us and we can ensure that they are
-well-formed.
-
# Parse an HTML file from our US Constitution collection and return a
# hashref with three keys: title, content, and url.
- sub slurp_and_parse_file {
+ sub parse_file {
my $filename = shift;
my $filepath = catfile( $conf->{uscon_source}, $filename );
- open( my $fh, '<', $filepath )
- or die "Can't open '$filepath': $!";
- my $raw = do { local $/; <$fh> }; # slurp!
-
- # Build up a document hash.
- my %doc = ( url => "/us_constitution/$filename" );
- $raw =~ m#<title>(.*?)</title>#s
- or die "couldn't isolate title in '$filepath'";
- $doc{title} = $1;
- $raw =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
- or die "couldn't isolate bodytext in '$filepath'";
- $doc{content} = $1;
- $doc{content} =~ s/<.*?>/ /gsm; # quick and dirty tag stripping
-
- return \%doc;
+ my $tree = HTML::TreeBuilder->new;
+ $tree->parse_file($filepath);
+ my $title_node = $tree->look_down( _tag => 'title' )
+ or die "No title element in $filepath";
+ my $bodytext_node = $tree->look_down( id => 'bodytext' )
+ or die "No div with id 'bodytext' in $filepath";
+ return {
+ title => $title_node->as_trimmed_text,
+ content => $bodytext_node->as_trimmed_text,
+ url => "/us_constitution/$filename"
+ };
}
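
One housekeeping note on the new parsing code: HTML::TreeBuilder trees contain circular references, and the module's documentation recommends destroying a tree explicitly once you're done with it. A minimal sketch of that cleanup, following the same pattern as parse_file() above:

    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file($filepath);
    # ... extract title and bodytext as above ...
    $tree->delete;    # free the tree; circular refs defeat Perl's refcounting

For a short-lived script like invindexer.pl this is mostly a nicety, but it matters in long-running processes.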

Add some elementary directory reading code...

# Collect names of source html files.
- opendir( my $source_dh, $conf->{uscon_source} )
+ opendir( my $dh, $conf->{uscon_source} )
or die "Couldn't opendir '$conf->{uscon_source}': $!";
- my @filenames;
- for my $filename ( readdir $source_dh ) {
- next unless $filename =~ /\.html/;
- next if $filename eq 'index.html';
- push @filenames, $filename;
- }
- closedir $source_dh
- or die "Couldn't closedir '$conf->{uscon_source}': $!";
+ my @filenames = grep { /\.html$/ && $_ ne 'index.html' } readdir $dh;

-... and now we're ready for the meat of invindexer.pl:
+... and now we're ready for the meat of invindexer.pl -- which occupies one line
+of code.

foreach my $filename (@filenames) {
- my $doc = slurp_and_parse_file($filename);
+ my $doc = parse_file($filename);
$simple->add_doc($doc); # ta-da!
}

-That's all there is to it.
-
=head2 Search: search.cgi

As with our indexing app, the bulk of the code in our search script won't be
@@ -154,8 +138,8 @@
break up results into "pages" of manageable size.

Calling search() on our Simple object turns it into an iterator. Invoking
-next() now returns our stored documents (augmented with a score,
-accessible via C<get_score>), starting with the most relevant.
+next() now returns hits one at a time as L<KinoSearch::Doc::HitDoc> objects,
+starting with the most relevant.

# Create result list.
my $report = '';
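
Fleshed out, the results loop looks something like the following sketch. The field names are the ones we indexed; get_score() is carried over from the text this hunk removes, while hash-style access to stored fields is an assumption about the HitDoc API rather than something this diff shows:

    while ( my $hit = $simple->next ) {
        my $score = $hit->get_score;    # score accessor, per the earlier tutorial text
        $report .= qq|<p><a href="$hit->{url}">$hit->{title}</a> |
                 . qq|(score: $score)</p>\n|;
    }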
@@ -312,9 +296,9 @@

=head2 OK... now what?

-KSx::Simple is perfectly adequate for some tasks, but it's not very
-flexible. Many people will find that it doesn't do at least one or two things
-they can't live without.
+KSx::Simple is perfectly adequate for some tasks, but it's not very flexible.
+Many people find that it doesn't do at least one or two things they can't live
+without.

In our next tutorial chapter,
L<BeyondSimple|KinoSearch::Docs::Tutorial::BeyondSimple>, we'll rewrite our

Modified: trunk/perl/sample/invindexer.pl
===================================================================
--- trunk/perl/sample/invindexer.pl 2008-08-29 18:16:40 UTC (rev 3789)
+++ trunk/perl/sample/invindexer.pl 2008-08-29 18:18:13 UTC (rev 3790)
@@ -10,26 +10,21 @@
use File::Spec::Functions qw( catfile );
use USConSchema;
use KinoSearch::InvIndexer;
+use HTML::TreeBuilder;

# Create an InvIndexer object.
my $invindexer = KinoSearch::InvIndexer->new(
invindex => USConSchema->clobber( $conf->{path_to_invindex} ) );

# Collect names of source html files.
-opendir( my $source_dh, $conf->{uscon_source} )
+opendir( my $dh, $conf->{uscon_source} )
or die "Couldn't opendir '$conf->{uscon_source}': $!";
-my @filenames;
-for my $filename ( readdir $source_dh ) {
- next unless $filename =~ /\.html/;
- next if $filename eq 'index.html';
- push @filenames, $filename;
-}
-closedir $source_dh or die "Couldn't closedir '$conf->{uscon_source}': $!";
+my @filenames = grep { /\.html$/ && $_ ne 'index.html' } readdir $dh;

# Iterate over list of source files.
for my $filename (@filenames) {
print "Indexing $filename\n";
- my $doc = slurp_and_parse_file($filename);
+ my $doc = parse_file($filename);
$invindexer->add_doc($doc);
}

@@ -39,23 +34,19 @@

# Parse an HTML file from our US Constitution collection and return a
# hashref with three keys: title, content, and url.
-sub slurp_and_parse_file {
+sub parse_file {
my $filename = shift;
my $filepath = catfile( $conf->{uscon_source}, $filename );
- open( my $fh, '<', $filepath )
- or die "Can't open '$filepath': $!";
- my $raw = do { local $/; <$fh> };
-
- # build up a document hash
- my %doc = ( url => "/us_constitution/$filename" );
- $raw =~ m#<title>(.*?)</title>#s
- or die "couldn't isolate title in '$filepath'";
- $doc{title} = $1;
- $raw =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
- or die "couldn't isolate bodytext in '$filepath'";
- $doc{content} = $1;
- $doc{content} =~ s/<.*?>/ /gsm; # quick and dirty tag stripping
-
- return \%doc;
+ my $tree = HTML::TreeBuilder->new;
+ $tree->parse_file($filepath);
+ my $title_node = $tree->look_down( _tag => 'title' )
+ or die "No title element in $filepath";
+ my $bodytext_node = $tree->look_down( id => 'bodytext' )
+ or die "No div with id 'bodytext' in $filepath";
+ return {
+ title => $title_node->as_trimmed_text,
+ content => $bodytext_node->as_trimmed_text,
+ url => "/us_constitution/$filename"
+ };
}


