Mailing List Archive: HTMLParser

HTMLParser

Feb 15, 2002, 2:42 PM

Post #1 of 9 (1366 views)

Hi,

I was playing with HTMLParser.jj and made some changes you might be interested in. What I did was start handling <META> tags (added
new methods: getAuthor, getKeywords and getMetadata and changed getSummary to check if there's any metadata item with
name=="description"). I'm also filtering out any text inside <STYLE>...</STYLE> (like <SCRIPT> is being handled).
I've performed some tests and I believe I didn't break anything ;-)

The patch is as follows

Best regards,

--Daniel

Index: HTMLParser.jj
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
retrieving revision 1.1
diff -u -r1.1 HTMLParser.jj
--- HTMLParser.jj 26 Jan 2002 15:01:31 -0000 1.1
+++ HTMLParser.jj 15 Feb 2002 20:39:49 -0000
@@ -66,6 +66,8 @@
package org.apache.lucene.demo.html;

import java.io.*;
+import java.util.Map;
+import java.util.HashMap;

public class HTMLParser {
public static int SUMMARY_LENGTH = 200;
@@ -76,11 +78,13 @@
boolean titleComplete = false;
boolean inTitle = false;
boolean inScript = false;
+ boolean inStyle = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
PipedReader pipeIn = null;
PipedWriter pipeOut;
+ HashMap metadata = new HashMap(7);

public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
@@ -109,15 +113,60 @@
wait(10);
}
}
- if (summary.length() > SUMMARY_LENGTH)
- summary.setLength(SUMMARY_LENGTH);
+ // look in metadata
+ String description = (String) metadata.get("description");
+ if (description != null)
+ return description;
+ else {
+ if (summary.length() > SUMMARY_LENGTH)
+ summary.setLength(SUMMARY_LENGTH);
+
+ String sum = summary.toString().trim();
+ String tit = getTitle();
+ if (sum.startsWith(tit))
+ return sum.substring(tit.length());
+ else
+ return sum;
+ }
+ }
+
+ public String getAuthor() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return (String)metadata.get("author");
+ }
+
+ public String getKeywords() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return (String)metadata.get("keywords");
+ }

- String sum = summary.toString().trim();
- String tit = getTitle();
- if (sum.startsWith(tit))
- return sum.substring(tit.length());
- else
- return sum;
+ public Map getMetadata() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return metadata;
}

public Reader getReader() throws IOException {
@@ -144,7 +193,7 @@
}

void addText(String text) throws IOException {
- if (inScript)
+ if (inScript || inStyle)
return;
if (inTitle)
title.append(text);
@@ -165,7 +214,7 @@
}

void addSpace() throws IOException {
- if (inScript)
+ if (inScript || inStyle)
return;
if (!afterSpace) {
if (inTitle)
@@ -216,23 +265,38 @@
{
Token t1, t2;
boolean inImg = false;
+ boolean inMeta = false;
+ String name = null;
+ String content = null;
}
{
t1=<TagName> {
- inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
- inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
- if (inScript) { // keep track if in <SCRIPT>
+ inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
+ inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
+ inMeta = t1.image.equalsIgnoreCase("<meta"); // keep track if in <META>
+ if (inScript) { // keep track if in <SCRIPT>
inScript = !t1.image.equalsIgnoreCase("</script");
} else {
inScript = t1.image.equalsIgnoreCase("<script");
}
+ if (inStyle) { // keep track if in <STYLE>
+ inStyle = !t1.image.equalsIgnoreCase("</style");
+ } else {
+ inStyle = t1.image.equalsIgnoreCase("<style");
+ }
}
(t1=<ArgName>
(<ArgEquals>
- (t2=ArgValue() // save ALT text in IMG tag
+ (t2=ArgValue()
{
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
- addText("[" + t2.image + "]");
+ addText("[" + t2.image + "]"); // save ALT text in IMG tag
+ if (inMeta && t1.image.equalsIgnoreCase("name") && t2 != null)
+ name = t2.image.toLowerCase(); // save name in META tag
+ if (inMeta && t1.image.equalsIgnoreCase("content") && t2 != null)
+ content = t2.image; // save content in META tag
+ if (inMeta && name != null && content != null)
+ metadata.put(name, content); // save metadata
}
)?
)?

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>

Re: HTMLParser [ In reply to ]

otis_gospodnetic at yahoo

Feb 15, 2002, 1:47 PM

Post #2 of 9 (1357 views)

Permalink

While you are at it, perhaps it would be good to add support for
other META tags, such as "robots", especially since people are working
on adding a web crawler component to Lucene.

Thanks,
Otis

--- Daniel Calvo <dcalvo@task.com.br> wrote:
> Hi,
>
> I was playing with HTMLParser.jj and made some changes you might be
> interested in. What I did was start handling <META> tags (added
> new methods: getAuthor, getKeywords and getMetadata and changed
> getSummary to check if there's any metadata item with
> name=="description"). I'm also filtering out any text inside
> <STYLE>...</STYLE> (like <SCRIPT> is being handled).
> I've performed some tests and I belive I didn't break anything ;-)
>
> The patch is as follows
>
> Best regards,
>
> --Daniel
>
> Index: HTMLParser.jj
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
> retrieving revision 1.1
> diff -u -r1.1 HTMLParser.jj
> --- HTMLParser.jj 26 Jan 2002 15:01:31 -0000 1.1
> +++ HTMLParser.jj 15 Feb 2002 20:39:49 -0000
> @@ -66,6 +66,8 @@
> package org.apache.lucene.demo.html;
>
> import java.io.*;
> +import java.util.Map;
> +import java.util.HashMap;
>
> public class HTMLParser {
> public static int SUMMARY_LENGTH = 200;
> @@ -76,11 +78,13 @@
> boolean titleComplete = false;
> boolean inTitle = false;
> boolean inScript = false;
> + boolean inStyle = false;
> boolean afterTag = false;
> boolean afterSpace = false;
> String eol = System.getProperty("line.separator");
> PipedReader pipeIn = null;
> PipedWriter pipeOut;
> + HashMap metadata = new HashMap(7);
>
> public HTMLParser(File file) throws FileNotFoundException {
> this(new FileInputStream(file));
> @@ -109,15 +113,60 @@
> wait(10);
> }
> }
> - if (summary.length() > SUMMARY_LENGTH)
> - summary.setLength(SUMMARY_LENGTH);
> + // look in metadata
> + String description = (String) metadata.get("description");
> + if (description != null)
> + return description;
> + else {
> + if (summary.length() > SUMMARY_LENGTH)
> + summary.setLength(SUMMARY_LENGTH);
> +
> + String sum = summary.toString().trim();
> + String tit = getTitle();
> + if (sum.startsWith(tit))
> + return sum.substring(tit.length());
> + else
> + return sum;
> + }
> + }
> +
> + public String getAuthor() throws IOException, InterruptedException
> {
> + if (pipeIn == null)
> + getReader(); // spawn parsing thread
> + while (true) {
> + synchronized(this) {
> + if (summary.length() > 0) // assume that all metadata
> + break; // has already been collected
> + wait(10);
> + }
> + }
> + return (String)metadata.get("author");
> + }
> +
> + public String getKeywords() throws IOException,
> InterruptedException {
> + if (pipeIn == null)
> + getReader(); // spawn parsing thread
> + while (true) {
> + synchronized(this) {
> + if (summary.length() > 0) // assume that all metadata
> + break; // has already been collected
> + wait(10);
> + }
> + }
> + return (String)metadata.get("keywords");
> + }
>
> - String sum = summary.toString().trim();
> - String tit = getTitle();
> - if (sum.startsWith(tit))
> - return sum.substring(tit.length());
> - else
> - return sum;
> + public Map getMetadata() throws IOException, InterruptedException
> {
> + if (pipeIn == null)
> + getReader(); // spawn parsing thread
> + while (true) {
> + synchronized(this) {
> + if (summary.length() > 0) // assume that all metadata
> + break; // has already been collected
> + wait(10);
> + }
> + }
> + return metadata;
> }
>
> public Reader getReader() throws IOException {
> @@ -144,7 +193,7 @@
> }
>
> void addText(String text) throws IOException {
> - if (inScript)
> + if (inScript || inStyle)
> return;
> if (inTitle)
> title.append(text);
> @@ -165,7 +214,7 @@
> }
>
> void addSpace() throws IOException {
> - if (inScript)
> + if (inScript || inStyle)
> return;
> if (!afterSpace) {
> if (inTitle)
> @@ -216,23 +265,38 @@
> {
> Token t1, t2;
> boolean inImg = false;
> + boolean inMeta = false;
> + String name = null;
> + String content = null;
> }
> {
> t1=<TagName> {
> - inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if
> in <TITLE>
> - inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in
> <IMG>
> - if (inScript) { // keep track if in <SCRIPT>
> + inTitle = t1.image.equalsIgnoreCase("<title"); // keep track
> if in <TITLE>
> + inImg = t1.image.equalsIgnoreCase("<img"); // keep track
> if in <IMG>
> + inMeta = t1.image.equalsIgnoreCase("<meta"); // keep track
> if in <META>
> + if (inScript) { // keep track
> if in <SCRIPT>
> inScript = !t1.image.equalsIgnoreCase("</script");
> } else {
> inScript = t1.image.equalsIgnoreCase("<script");
> }
> + if (inStyle) { // keep track
> if in <STYLE>
> + inStyle = !t1.image.equalsIgnoreCase("</style");
> + } else {
> + inStyle = t1.image.equalsIgnoreCase("<style");
> + }
> }
> (t1=<ArgName>
> (<ArgEquals>
> - (t2=ArgValue() // save ALT text in IMG tag
> + (t2=ArgValue()
> {
> if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
> - addText("[" + t2.image + "]");
> + addText("[" + t2.image + "]"); // save ALT text in
> IMG tag
> + if (inMeta && t1.image.equalsIgnoreCase("name") && t2 !=
> null)
> + name = t2.image.toLowerCase(); // save name in META
> tag
> + if (inMeta && t1.image.equalsIgnoreCase("content") && t2 !=
> null)
> + content = t2.image; // save content in
> META tag
> + if (inMeta && name != null && content != null)
> + metadata.put(name, content); // save metadata
> }
> )?
> )?
>
>
> --
> To unsubscribe, e-mail:
> <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
> For additional commands, e-mail:
> <mailto:lucene-dev-help@jakarta.apache.org>
>

__________________________________________________
Do You Yahoo!?
Got something to say? Say it better with Yahoo! Video Mail
http://mail.yahoo.com

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>

Re: HTMLParser [ In reply to ]

harwoods at ntlworld

Feb 15, 2002, 3:10 PM

Post #3 of 9 (1348 views)

Permalink

>>While you are at it, perhaps it would be good to add support for add
>>other META tags
I posted that a while back.
Here it is again. See the getMetaTags() method.....

Mark Harwood

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

// HTMLParser.jj

// JavaCC generator options for this grammar.
// The two DEBUG_* switches are kept commented out; remove the leading "//"
// to enable them.
options {
  STATIC = false;
  OPTIMIZE_TOKEN_MANAGER = true;
  //DEBUG_LOOKAHEAD = true;
  //DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(HTMLParser)

package org.apache.lucene.HTMLParser;

import java.io.*;
import java.util.Properties;

/**
 * Collects the title, a short summary, the META tags and the plain text of an
 * HTML document.
 *
 * <p>The text is delivered through the piped reader returned by
 * {@link #getReader()}, which also starts a {@code ParserThread} (declared
 * elsewhere). The other getters poll, under this object's monitor, until the
 * state they need has been filled in by {@code addText}/{@code addSpace}.
 */
public class HTMLParser {
  /** Number of characters kept for the summary; also used to size the buffers. */
  public static int SUMMARY_LENGTH = 200;

  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  Properties metaTags = new Properties();   // META name (lower-cased by Tag()) -> content
  String currentMetaTag = "";               // key under which addText() files META content
  int length = 0;                           // characters written to pipeOut so far
  boolean titleComplete = false;
  boolean inTitle = false;
  boolean inMetaTag = false;
  boolean inScript = false;
  boolean afterTag = false;
  boolean afterSpace = false;
  String eol = System.getProperty("line.separator");
  PipedReader pipeIn = null;                // null until getReader() is first called
  PipedWriter pipeOut;

  /**
   * Creates a parser that reads from the given file.
   *
   * <p>Delegates to an {@code InputStream} constructor that is not declared
   * in this class body (presumably generated by JavaCC -- confirm).
   *
   * @param file the HTML file to parse
   * @throws FileNotFoundException if the file cannot be opened
   */
  public HTMLParser(File file) throws FileNotFoundException {
    this(new FileInputStream(file));
  }

  /**
   * Returns the trimmed document title, starting the parser if necessary and
   * waiting until the title is complete or more than SUMMARY_LENGTH
   * characters of text have been seen.
   */
  public String getTitle() throws IOException, InterruptedException {
    awaitTitle();
    return title.toString().trim();
  }

  /**
   * Returns the META tags collected so far, after the same wait as
   * {@link #getTitle()}. The returned object is the live internal table, so
   * META tags parsed later still show up in it.
   */
  public Properties getMetaTags() throws IOException, InterruptedException {
    awaitTitle();
    return metaTags;
  }

  /**
   * Returns up to SUMMARY_LENGTH characters of leading document text, with the
   * title removed from the front when the text starts with it.
   *
   * <p>Waits until the summary buffer holds at least SUMMARY_LENGTH
   * characters. NOTE(review): nothing in this class fills the buffer for
   * shorter documents; presumably ParserThread does so when parsing ends --
   * confirm.
   */
  public String getSummary() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                          // spawn parsing thread
    synchronized (this) {
      while (summary.length() < SUMMARY_LENGTH)
        wait(10);
    }
    if (summary.length() > SUMMARY_LENGTH)
      summary.setLength(SUMMARY_LENGTH);

    String sum = summary.toString().trim();
    String tit = getTitle();
    return sum.startsWith(tit) ? sum.substring(tit.length()) : sum;
  }

  /**
   * Returns the reader that delivers the document's plain text. The first
   * call creates the pipe and starts the parsing thread; later calls return
   * the same reader.
   */
  public Reader getReader() throws IOException {
    if (pipeIn == null) {
      pipeIn = new PipedReader();
      pipeOut = new PipedWriter(pipeIn);

      Thread thread = new ParserThread(this);
      thread.start();                       // start parsing
    }

    return pipeIn;
  }

  /**
   * Starts the parser if necessary, then blocks until the title is complete or
   * more than SUMMARY_LENGTH characters of text have been seen. Shared by
   * getTitle() and getMetaTags().
   */
  private void awaitTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                          // spawn parsing thread
    synchronized (this) {
      // The wait is timed because 'length' is updated without a notify.
      while (!titleComplete && length <= SUMMARY_LENGTH)
        wait(10);
    }
  }

  /** Appends text to the summary until it is full, then wakes any waiting getters. */
  void addToSummary(String text) {
    if (summary.length() < SUMMARY_LENGTH) {
      summary.append(text);
      if (summary.length() >= SUMMARY_LENGTH) {
        synchronized(this) {
          notifyAll();
        }
      }
    }
  }

  /**
   * Routes one piece of text: dropped inside a script, stored as the value of
   * the current META tag while inside one, otherwise appended to the title or
   * the summary and written to the pipe.
   */
  void addText(String text) throws IOException {
    if (inScript)
      return;
    if (inMetaTag) {
      metaTags.setProperty(currentMetaTag, text);
      return;
    }
    if (inTitle)
      title.append(text);
    else {
      addToSummary(text);
      // The first text outside <TITLE> marks the title as finished. The
      // original also tested !title.equals(""), but 'title' is a StringBuffer
      // and never equals a String, so that test was always true. It is
      // dropped here without changing behaviour.
      if (!titleComplete) {
        synchronized(this) {
          titleComplete = true;             // tell waiting threads
          notifyAll();
        }
      }
    }

    length += text.length();
    pipeOut.write(text);

    afterSpace = false;
  }

  /**
   * Emits a single separator for a run of whitespace: a line separator when
   * the whitespace directly follows a tag, a blank otherwise. Ignored inside
   * scripts.
   */
  void addSpace() throws IOException {
    if (inScript)
      return;
    if (!afterSpace) {
      if (inTitle)
        title.append(" ");
      else
        addToSummary(" ");

      String space = afterTag ? eol : " ";
      length += space.length();
      pipeOut.write(space);
      afterSpace = true;
    }
  }

  // Disabled error recovery, kept to match the commented-out try/catch in
  // the HTMLDocument() production.
  // void handleException(Exception e) {
  //   System.out.println(e.toString());  // print the error message
  //   System.out.println("Skipping...");
  //   Token t;
  //   do {
  //     t = getNextToken();
  //   } while (t.kind != TagEnd);
  // }
}

PARSER_END(HTMLParser)

// Top-level production: consumes the whole input up to <EOF>.
// Tags, declarations and comments only set afterTag, which addSpace() reads
// to choose between a line separator and a blank. Words, entities (decoded
// via Entities.decode) and punctuation go to addText(); whitespace goes to
// addSpace().
// The commented-out try/catch belongs with the disabled handleException()
// in the parser class.
void HTMLDocument() throws IOException :
{
Token t;
}
{
// try {
( Tag() { afterTag = true; }
| t=Decl() { afterTag = true; }
| CommentTag() { afterTag = true; }
| t=<Word> { addText(t.image); afterTag = false; }
| t=<Entity> { addText(Entities.decode(t.image)); afterTag = false; }
| t=<Punct> { addText(t.image); afterTag = false; }
| <Space> { addSpace(); afterTag = false; }
)* <EOF>
// } catch (ParseException e) {
// handleException(e);
// }
}

// Parses one start or end tag: the tag name, its attributes and the closing
// <TagEnd>.
// Side effects on parser state:
//  - inTitle, inMetaTag and inScript track which element is being read
//  - the ALT text of an IMG tag is emitted as "[text]"
//  - for a META tag, the NAME (or HTTP-EQUIV) value becomes currentMetaTag
//    and the CONTENT value is stored through addText() once the whole tag
//    has been read, so the attribute order does not matter
//  - inMetaTag is cleared at the end of the tag, so text that follows a META
//    tag is treated as document text rather than as META content
void Tag() throws IOException :
{
  Token t1, t2;
  boolean inImg = false;
  String metaContent = null;   // CONTENT value of the current META tag, if any
}
{
  t1=<TagName> {
    inTitle = t1.image.equalsIgnoreCase("<title");   // keep track if in <TITLE>
    inMetaTag = t1.image.equalsIgnoreCase("<META");  // keep track if in <META>
    inImg = t1.image.equalsIgnoreCase("<img");       // keep track if in <IMG>
    if (inMetaTag)
      currentMetaTag = "";        // never reuse the name of an earlier META tag
    if (inScript) {                                  // keep track if in <SCRIPT>
      inScript = !t1.image.equalsIgnoreCase("</script");
    } else {
      inScript = t1.image.equalsIgnoreCase("<script");
    }
  }
  (t1=<ArgName>
   (<ArgEquals>
    (t2=ArgValue()
     {
       // t2 is null for an empty quoted value ('' or "")
       if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
         addText("[" + t2.image + "]");              // save ALT text in IMG tag

       if (inMetaTag && t2 != null) {
         if (t1.image.equalsIgnoreCase("name")
             || t1.image.equalsIgnoreCase("HTTP-EQUIV"))
           currentMetaTag = t2.image.toLowerCase();
         else if (t1.image.equalsIgnoreCase("content"))
           metaContent = t2.image;   // stored at <TagEnd>, once the name is known
       }
     }
    )?
   )?
  )*
  <TagEnd>
  {
    if (inMetaTag) {
      if (metaContent != null)
        addText(metaContent);     // addText() files it under currentMetaTag
      inMetaTag = false;          // what follows the tag is document text again
    }
  }
}

// Parses one attribute value and returns the token holding its text.
// Accepts an unquoted <ArgValue>, a single-quoted value or a double-quoted
// value. For an empty quoted value ('' or "") no text token exists, so the
// initial null is returned; callers must check for null.
// LOOKAHEAD(2) is needed because the empty and non-empty quoted forms start
// with the same quote token.
Token ArgValue() :
{
Token t = null;
}
{
t=<ArgValue> { return t; }
| LOOKAHEAD(2)
<ArgQuote1> <CloseQuote1> { return t; }
| <ArgQuote1> t=<Quote1Text> <CloseQuote1> { return t; }
| LOOKAHEAD(2)
<ArgQuote2> <CloseQuote2> { return t; }
| <ArgQuote2> t=<Quote2Text> <CloseQuote2> { return t; }
}

// Parses a declaration that starts with a <DeclName> token ("<!" followed
// by a name) and skips everything up to the closing <TagEnd>.
// Returns the <DeclName> token; the arguments are discarded.
Token Decl() :
{
Token t;
}
{
t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
{ return t; }
}

// Parses a comment in either of the two forms the token manager recognises
// (Comment1 ... CommentEnd1 or Comment2 ... CommentEnd2).
// The comment text is consumed and discarded; no action is attached.
void CommentTag() :
{}
{
(<Comment1> ( <CommentText1> )* <CommentEnd1>)
|
(<Comment2> ( <CommentText2> )* <CommentEnd2>)
}

// Tokens recognised in the DEFAULT lexical state: the start of a tag
// (TagName) or of a declaration (DeclName), both switching to WithinTag.
//
// NOTE(review): this block is incomplete as archived. Comment1 is defined as
// the empty string and switches to DEFAULT, and the productions above use
// tokens that are defined nowhere in this listing (Comment2, Word, Entity,
// Space, Punct, ArgName, ArgEquals, ArgValue, TagEnd, the quote tokens,
// CommentText1, CommentEnd1). Presumably everything between a "<!--" and a
// "-->" in the original message was dropped by the archive's HTML rendering.
// Recover those definitions from the original HTMLParser.jj before using
// this grammar.
TOKEN :
{
< TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag

| < Comment1: "" > : DEFAULT
}

// Tokens recognised in the WithinComment2 lexical state: any run of
// characters other than ">" is comment text, and ">" ends the comment and
// returns to DEFAULT.
// NOTE(review): no token visible in this listing switches to WithinComment2;
// the Comment2 definition appears to be among the text lost from the
// archived message.
<WithinComment2> TOKEN :
{
< CommentText2: (~[">"])+ >
| < CommentEnd2: ">" > : DEFAULT
}

RE: HTMLParser [ In reply to ]

paulo.gaspar at krankikom

Feb 15, 2002, 7:13 PM

Post #4 of 9 (1349 views)

Permalink

Can the following Xerces based HTML parser be interesting for
your work?

This is just the initial ANNOUNCE but there are further
developments.

Have fun,
Paulo Gaspar

> -----Original Message-----
> From: Andy Clark [mailto:andyc@apache.org]
> Sent: Saturday, February 09, 2002 4:16 AM
> To: general@xml.apache.org
> Cc: xerces-j-dev@xml.apache.org
> Subject: [ANNOUNCE] Xerces HTML Parser
>
>
> For a long time users have asked if Xerces can parse HTML files.
> But since most HTML documents are not well-formed XML documents,
> it is generally not possible to use a conforming XML parser to
> read HTML documents.
>
> However, the Xerces Native Interface (XNI) that is the foundation
> of the Xerces2 implementation defines a framework that allows
> different kinds of parsers to be constructed by connecting a
> pipeline of parser components. Therefore, as long as a component
> can be written that generates the appropriate XNI "events", then
> it can be used to emit SAX events, build DOM trees, or anything
> else that you can think of.
>
> So, as a fun little exercise, I have written a basic HTML parser
> using XNI. It consists of an HTML scanner component that can scan
> HTML files and generate XNI events and a tag balancing component.
> The tag balancer cleans up the events produced by the scanner,
> balancing mismatched tags and adding tags where necessary. And
> it does all of this in a streaming manner to minimize the amount
> of memory required.
>
> Since I wrote the HTML parser as an example of using XNI and
> because the code is considered alpha quality (but it seems to
> work quite well, actually!), I am posting the code with a very
> limited license. Even though it contains the complete source
> code for the HTML parser, the license only allows the user to
> experiment but gives no right to actually use the code in a
> product.
>
> If the source isn't "free" or "open", why release it at all?
> I want to get an idea of what people think of the code first.
> Then, if there's enough interest, I would like to either donate
> the code to the Xerces-J project or make it available elsewhere
> under a true open source license.
>
> So, if you've been looking for a way to parse HTML documents
> please try out the HTML parser and let me know what you think.
> There should be enough information in the documentation to get
> you started. Check out the "NekoHTML" project listed on my
> Apache web site: http://www.apache.org/~andyc/
>
> Have fun!
>
> --
> Andy Clark * andyc@apache.org
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: xerces-j-dev-unsubscribe@xml.apache.org
> For additional commands, e-mail: xerces-j-dev-help@xml.apache.org
>

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>

Re: HTMLParser [ In reply to ]

lists at ehatchersolutions

Feb 15, 2002, 8:40 PM

Post #5 of 9 (1348 views)

Permalink

I'm wondering how this compares to JTidy. Anyone know?

How does HTMLParser.jj compare to JTidy's capabilities?

Erik

----- Original Message -----
From: "Paulo Gaspar" <paulo.gaspar@krankikom.de>
To: "Lucene Developers List" <lucene-dev@jakarta.apache.org>
Sent: Friday, February 15, 2002 9:13 PM
Subject: RE: HTMLParser

> Can the following Xerces based HTML parser be interesting for
> your work?
>
> This is just the initial ANNOUNCE but there are further
> developments.
>
>
> Have fun,
> Paulo Gaspar
>
> > -----Original Message-----
> > From: Andy Clark [mailto:andyc@apache.org]
> > Sent: Saturday, February 09, 2002 4:16 AM
> > To: general@xml.apache.org
> > Cc: xerces-j-dev@xml.apache.org
> > Subject: [ANNOUNCE] Xerces HTML Parser
> >
> >
> > For a long time users have asked if Xerces can parse HTML files.
> > But since most HTML documents are not well-formed XML documents,
> > it is generally not possible to use a conforming XML parser to
> > read HTML documents.
> >
> > However, the Xerces Native Interface (XNI) that is the foundation
> > of the Xerces2 implementation defines a framework that allows
> > different kinds of parsers to be constructed by connecting a
> > pipeline of parser components. Therefore, as long as a component
> > can be written that generates the appropriate XNI "events", then
> > it can be used to emit SAX events, build DOM trees, or anything
> > else that you can think of.
> >
> > So, as a fun little exercise, I have written a basic HTML parser
> > using XNI. It consists of an HTML scanner component that can scan
> > HTML files and generate XNI events and a tag balancing component.
> > The tag balancer cleans up the events produced by the scanner,
> > balancing mismatched tags and adding tags where necessary. And
> > it does all of this in a streaming manner to minimize the amount
> > of memory required.
> >
> > Since I wrote the HTML parser as an example of using XNI and
> > because the code is considered alpha quality (but it seems to
> > work quite well, actually!), I am posting the code with a very
> > limited license. Even though it contains the complete source
> > code for the HTML parser, the license only allows the user to
> > experiment but gives no right to actually use the code in a
> > product.
> >
> > If the source isn't "free" or "open", why release it at all?
> > I want to get an idea of what people think of the code first.
> > Then, if there's enough interest, I would like to either donate
> > the code to the Xerces-J project or make it available elsewhere
> > under a true open source license.
> >
> > So, if you've been looking for a way to parse HTML documents
> > please try out the HTML parser and let me know what you think.
> > There should be enough information in the documentation to get
> > you started. Check out the "NekoHTML" project listed on my
> > Apache web site: http://www.apache.org/~andyc/
> >
> > Have fun!
> >
> > --
> > Andy Clark * andyc@apache.org
> >
> > ---------------------------------------------------------------------
> > To unsubscribe, e-mail: xerces-j-dev-unsubscribe@xml.apache.org
> > For additional commands, e-mail: xerces-j-dev-help@xml.apache.org
> >
>
> --
> To unsubscribe, e-mail:
<mailto:lucene-dev-unsubscribe@jakarta.apache.org>
> For additional commands, e-mail:
<mailto:lucene-dev-help@jakarta.apache.org>
>
>

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>

RE: HTMLParser [ In reply to ]

dcalvo at task

Feb 15, 2002, 9:11 PM

Post #6 of 9 (1346 views)

Permalink

> From: harwoods [mailto:harwoods@ntlworld.com]
> >>While you are at it, perhaps it would be good to add support for add
> >>other META tags
> I posted that a while back.
> Here it is again. See the getMetaTags() method.....
>
> Mark Harwood

Hi,

I haven't seen this code before (haven't checked the list archives :-(). I've created a getMetadata() method as well but I forgot to
convert character references, which I'm doing now. Besides that, I'm also collecting URIs (href=...) and making them available via
getURIs() (which can be useful for people writing crawlers, as Otis has mentioned). There's a problem with this method, though.
Since the original parser relies on piped readers/writers and the URI set won't be available until the whole file has been parsed, the
output generated during parsing (the file contents) must be consumed before one tries to get the URIs, or else the method will block
forever. It took me some time to figure that out but it does make sense.

BTW, is there any special reason to have implemented this parser with pipes? Wouldn't it be easier if the parser was single
threaded? Of course you can get some metadata (title, summary) before the whole file is parsed but at some point you'll have to wait
till the process ends. OTOH, the contents would have to be stored in a String or any other kind of buffer which isn't good either...

Here's the new patch

Index: HTMLParser.jj
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
retrieving revision 1.1
diff -u -r1.1 HTMLParser.jj
--- HTMLParser.jj 26 Jan 2002 15:01:31 -0000 1.1
+++ HTMLParser.jj 16 Feb 2002 03:08:04 -0000
@@ -66,6 +66,7 @@
package org.apache.lucene.demo.html;

import java.io.*;
+import java.util.*;

public class HTMLParser {
public static int SUMMARY_LENGTH = 200;
@@ -74,13 +75,17 @@
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
int length = 0;
boolean titleComplete = false;
+ boolean parseComplete = false;
boolean inTitle = false;
boolean inScript = false;
+ boolean inStyle = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
PipedReader pipeIn = null;
PipedWriter pipeOut;
+ HashMap metadata = new HashMap(7);
+ ArrayList uri = new ArrayList(10);

public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
@@ -109,17 +114,77 @@
wait(10);
}
}
- if (summary.length() > SUMMARY_LENGTH)
- summary.setLength(SUMMARY_LENGTH);
+ // look in metadata
+ String description = (String) metadata.get("description");
+ if (description != null)
+ return description;
+ else {
+ if (summary.length() > SUMMARY_LENGTH)
+ summary.setLength(SUMMARY_LENGTH);
+
+ String sum = summary.toString().trim();
+ String tit = getTitle();
+ if (sum.startsWith(tit))
+ return sum.substring(tit.length());
+ else
+ return sum;
+ }
+ }
+
+ public String getAuthor() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return (String)metadata.get("author");
+ }
+
+ public String getKeywords() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return (String)metadata.get("keywords");
+ }

- String sum = summary.toString().trim();
- String tit = getTitle();
- if (sum.startsWith(tit))
- return sum.substring(tit.length());
- else
- return sum;
+ public Map getMetadata() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return metadata;
}

+ public String[] getURIs() throws IOException, InterruptedException{
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (parseComplete)
+ break;
+ wait();
+ }
+ }
+ if (uri.size() == 0)
+ return new String[0];
+ else return (String[]) uri.toArray(new String[0]);
+ }
+
public Reader getReader() throws IOException {
if (pipeIn == null) {
pipeIn = new PipedReader();
@@ -144,7 +209,7 @@
}

void addText(String text) throws IOException {
- if (inScript)
+ if (inScript || inStyle)
return;
if (inTitle)
title.append(text);
@@ -165,7 +230,7 @@
}

void addSpace() throws IOException {
- if (inScript)
+ if (inScript || inStyle)
return;
if (!afterSpace) {
if (inTitle)
@@ -180,6 +245,28 @@
}
}

+ String decode(String txt) {
+ if (txt == null)
+ return txt;
+ StringBuffer buf = new StringBuffer(txt);
+ for (int i=0; i<buf.length(); ++i) {
+ if (buf.charAt(i) == '&') {
+ int j;
+ for (j=i+1; j<buf.length() && buf.charAt(j) != ';'; ++j);
+ String decoded = Entities.decode(buf.substring(i, j+1));
+ buf.replace(i, j+1, decoded);
+ }
+ }
+ return buf.toString();
+ }
+
+ void endParse() {
+ synchronized(this) {
+ parseComplete = true;
+ notifyAll();
+ }
+ }
+
// void handleException(Exception e) {
// System.out.println(e.toString()); // print the error message
// System.out.println("Skipping...");
@@ -206,7 +293,7 @@
| t=<Entity> { addText(Entities.decode(t.image)); afterTag = false; }
| t=<Punct> { addText(t.image); afterTag = false; }
| <Space> { addSpace(); afterTag = false; }
- )* <EOF>
+ )* <EOF> { endParse(); }
// } catch (ParseException e) {
// handleException(e);
// }
@@ -216,23 +303,44 @@
{
Token t1, t2;
boolean inImg = false;
+ boolean inMeta = false;
+ boolean inA = false;
+ String name = null;
+ String content = null;
}
{
t1=<TagName> {
- inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
- inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
- if (inScript) { // keep track if in <SCRIPT>
+ inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
+ inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
+ inMeta = t1.image.equalsIgnoreCase("<meta"); // keep track if in <META>
+ inA = t1.image.equalsIgnoreCase("<a"); // keep track if in <A>
+ if (inScript) { // keep track if in <SCRIPT>
inScript = !t1.image.equalsIgnoreCase("</script");
} else {
inScript = t1.image.equalsIgnoreCase("<script");
}
+ if (inStyle) { // keep track if in <STYLE>
+ inStyle = !t1.image.equalsIgnoreCase("</style");
+ } else {
+ inStyle = t1.image.equalsIgnoreCase("<style");
+ }
}
(t1=<ArgName>
(<ArgEquals>
- (t2=ArgValue() // save ALT text in IMG tag
+ (t2=ArgValue()
{
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
- addText("[" + t2.image + "]");
+ addText("[" + t2.image + "]"); // save ALT text in IMG tag
+ if (inMeta) {
+ if (t1.image.equalsIgnoreCase("name") && t2 != null)
+ name = t2.image.toLowerCase(); // save name in META tag
+ if (t1.image.equalsIgnoreCase("content") && t2 != null)
+ content = decode(t2.image); // save content in META tag
+ if (name != null && content != null)
+ metadata.put(name, content.trim()); // save metadata
+ }
+ if (inA && t1.image.equalsIgnoreCase("href") && t2 != null)
+ uri.add(t2.image);
}
)?
)?

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>

RE: HTMLParser [ In reply to ]

dcalvo at task

Feb 15, 2002, 9:32 PM

Post #7 of 9 (1347 views)

Permalink

Maybe...I'll have to give it a try first

Anyway, I was playing with Lucene's HTMLParser in order to understand a little better how JavaCC works. My real interest is in PDF
and RTF parsers. I've tried Websearch PDF parser but it only worked well with the examples provided. I wasn't able to parse
correctly even PDF files distributed by Adobe. I've also had a lot of trouble with files converted to PDF (probably via dvi2pdf or
something like that). Recently I read on this list (or maybe it was on the users list) that someone else was having trouble with
both Websearch and PJ library parsers.

I've just downloaded Adobe's PDF Specification and later I'll try to see if there's any room for improvement in Websearch code. I
know PDF has various features (compression, cryptography, etc.) that complicate the parsing and I'm not willing to spend much time
doing this but I'll probably try something.

--Daniel

> -----Original Message-----
> From: Paulo Gaspar [mailto:paulo.gaspar@krankikom.de]
> Sent: sexta-feira, 15 de fevereiro de 2002 23:14
> To: Lucene Developers List
> Subject: RE: HTMLParser
>
>
> Can the following Xerces based HTML parser be interesting for
> your work?
>
> This is just the initial ANNOUNCE but there are further
> developments.
>
>
> Have fun,
> Paulo Gaspar
>
> > -----Original Message-----
> > From: Andy Clark [mailto:andyc@apache.org]
> > Sent: Saturday, February 09, 2002 4:16 AM
> > To: general@xml.apache.org
> > Cc: xerces-j-dev@xml.apache.org
> > Subject: [ANNOUNCE] Xerces HTML Parser
> >
> >
> > For a long time users have asked if Xerces can parse HTML files.
> > But since most HTML documents are not well-formed XML documents,
> > it is generally not possible to use a conforming XML parser to
> > read HTML documents.
> >
> > However, the Xerces Native Interface (XNI) that is the foundation
> > of the Xerces2 implementation defines a framework that allows
> > different kinds of parsers to be constructed by connecting a
> > pipeline of parser components. Therefore, as long as a component
> > can be written that generates the appropriate XNI "events", then
> > it can be used to emit SAX events, build DOM trees, or anything
> > else that you can think of.
> >
> > So, as a fun little exercise, I have written a basic HTML parser
> > using XNI. It consists of an HTML scanner component that can scan
> > HTML files and generate XNI events and a tag balancing component.
> > The tag balancer cleans up the events produced by the scanner,
> > balancing mismatched tags and adding tags where necessary. And
> > it does all of this in a streaming manner to minimize the amount
> > of memory required.
> >
> > Since I wrote the HTML parser as an example of using XNI and
> > because the code is considered alpha quality (but it seems to
> > work quite well, actually!), I am posting the code with a very
> > limited license. Even though it contains the complete source
> > code for the HTML parser, the license only allows the user to
> > experiment but gives no right to actually use the code in a
> > product.
> >
> > If the source isn't "free" or "open", why release it at all?
> > I want to get an idea of what people think of the code first.
> > Then, if there's enough interest, I would like to either donate
> > the code to the Xerces-J project or make it available elsewhere
> > under a true open source license.
> >
> > So, if you've been looking for a way to parse HTML documents
> > please try out the HTML parser and let me know what you think.
> > There should be enough information in the documentation to get
> > you started. Check out the "NekoHTML" project listed on my
> > Apache web site: http://www.apache.org/~andyc/
> >
> > Have fun!
> >
> > --
> > Andy Clark * andyc@apache.org
> >
> > ---------------------------------------------------------------------
> > To unsubscribe, e-mail: xerces-j-dev-unsubscribe@xml.apache.org
> > For additional commands, e-mail: xerces-j-dev-help@xml.apache.org
> >
>
> --
> To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
> For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>
>

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>

RE: HTMLParser [ In reply to ]

otis_gospodnetic at yahoo

Feb 16, 2002, 8:14 AM

Post #8 of 9 (1345 views)

Permalink

Hm, I thought this place would have a PDF parser, but it does not.
It does seem to have an RTF parser:
http://cobase-www.cs.ucla.edu/pub/javacc/

Perhaps some of these things can be adopted by Lucene, people could
contribute Java classes for interacting with specific parsers, and all
that could then be included in Lucene to work together with those
DocumentHandlers mentioned a few days ago.

Otis

--- Daniel Calvo <dcalvo@task.com.br> wrote:
> Maybe...I'll have to give it a try first
>
> Anyway, I was playing with Lucene's HTMLParser in order to understand
> a little better how JavaCC works. My real interest is in PDF
> and RTF parsers. I've tried Websearch PDF parser but it only worked
> well with the examples provided. I wasn't able to parse
> correctly even PDF files distributed by Adobe. I've also had a lot of
> trouble with files converted to PDF (probably via dvi2pdf or
> something like that). Recently I read on this list (or maybe it was
> on the users list) that someone else was having trouble with
> both Websearch and PJ library parsers.
>
> I've just downloaded Adobe's PDF Specification and later I'll try to
> see if there's any room for improvement in Websearch code. I
> know PDF has various features (compression, cryptography, etc.) that
> complicate the parsing and I'm not willing to spend much time
> doing this but I'll probably try something.
>
> --Daniel
>
> > -----Original Message-----
> > From: Paulo Gaspar [mailto:paulo.gaspar@krankikom.de]
> > Sent: sexta-feira, 15 de fevereiro de 2002 23:14
> > To: Lucene Developers List
> > Subject: RE: HTMLParser
> >
> >
> > Can the following Xerces based HTML parser be interesting for
> > your work?
> >
> > This is just the initial ANNOUNCE but there are further
> > developments.
> >
> >
> > Have fun,
> > Paulo Gaspar
> >
> > > -----Original Message-----
> > > From: Andy Clark [mailto:andyc@apache.org]
> > > Sent: Saturday, February 09, 2002 4:16 AM
> > > To: general@xml.apache.org
> > > Cc: xerces-j-dev@xml.apache.org
> > > Subject: [ANNOUNCE] Xerces HTML Parser
> > >
> > >
> > > For a long time users have asked if Xerces can parse HTML files.
> > > But since most HTML documents are not well-formed XML documents,
> > > it is generally not possible to use a conforming XML parser to
> > > read HTML documents.
> > >
> > > However, the Xerces Native Interface (XNI) that is the foundation
> > > of the Xerces2 implementation defines a framework that allows
> > > different kinds of parsers to be constructed by connecting a
> > > pipeline of parser components. Therefore, as long as a component
> > > can be written that generates the appropriate XNI "events", then
> > > it can be used to emit SAX events, build DOM trees, or anything
> > > else that you can think of.
> > >
> > > So, as a fun little exercise, I have written a basic HTML parser
> > > using XNI. It consists of an HTML scanner component that can scan
> > > HTML files and generate XNI events and a tag balancing component.
> > > The tag balancer cleans up the events produced by the scanner,
> > > balancing mismatched tags and adding tags where necessary. And
> > > it does all of this in a streaming manner to minimize the amount
> > > of memory required.
> > >
> > > Since I wrote the HTML parser as an example of using XNI and
> > > because the code is considered alpha quality (but it seems to
> > > work quite well, actually!), I am posting the code with a very
> > > limited license. Even though it contains the complete source
> > > code for the HTML parser, the license only allows the user to
> > > experiment but gives no right to actually use the code in a
> > > product.
> > >
> > > If the source isn't "free" or "open", why release it at all?
> > > I want to get an idea of what people think of the code first.
> > > Then, if there's enough interest, I would like to either donate
> > > the code to the Xerces-J project or make it available elsewhere
> > > under a true open source license.
> > >
> > > So, if you've been looking for a way to parse HTML documents
> > > please try out the HTML parser and let me know what you think.
> > > There should be enough information in the documentation to get
> > > you started. Check out the "NekoHTML" project listed on my
> > > Apache web site: http://www.apache.org/~andyc/
> > >
> > > Have fun!
> > >
> > > --
> > > Andy Clark * andyc@apache.org
> > >
> > >
> ---------------------------------------------------------------------
> > > To unsubscribe, e-mail: xerces-j-dev-unsubscribe@xml.apache.org
> > > For additional commands, e-mail: xerces-j-dev-help@xml.apache.org
> > >
> >
> > --
> > To unsubscribe, e-mail:
> <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
> > For additional commands, e-mail:
> <mailto:lucene-dev-help@jakarta.apache.org>
> >
>
>
> --
> To unsubscribe, e-mail:
> <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
> For additional commands, e-mail:
> <mailto:lucene-dev-help@jakarta.apache.org>
>

__________________________________________________
Do You Yahoo!?
Yahoo! Sports - Coverage of the 2002 Olympic Games
http://sports.yahoo.com

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>

RE: HTMLParser [ In reply to ]

Stephan.Strittmatter.ext at kst

Feb 19, 2002, 1:05 AM

Post #9 of 9 (1346 views)

Permalink

Hi,

I have some JSP pages that are indexed locally and contain "<% ... %>" tags.
I am not familiar with JavaCC. How do I filter these
JSP parts out of the page? Could you add this feature as well? This would
be great for me!

Regards,

Stephan

> -----Original Message-----
> From: Daniel Calvo [mailto:dcalvo@task.com.br]
> Sent: Friday, February 15, 2002 10:42 PM
> To: Lucene Developers List
> Subject: HTMLParser
>
>
> Hi,
>
> I was playing with HTMLParser.jj and made some changes you
> might be interested in. What I did was start handling <META>
> tags (added
> new methods: getAuthor, getKeywords and getMetadata and
> changed getSummary to check if there's any metadata item with
> name=="description"). I'm also filtering out any text inside
> <STYLE>...</STYLE> (like <SCRIPT> is being handled).
> I've performed some tests and I believe I didn't break anything ;-)
>
> The patch is as follows
>
> Best regards,
>
> --Daniel
>
> Index: HTMLParser.jj
> ===================================================================
> RCS file:
> /home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo
/html/HTMLParser.jj,v
> retrieving revision 1.1
> diff -u -r1.1 HTMLParser.jj
> --- HTMLParser.jj 26 Jan 2002 15:01:31 -0000 1.1
> +++ HTMLParser.jj 15 Feb 2002 20:39:49 -0000
> @@ -66,6 +66,8 @@
> package org.apache.lucene.demo.html;
>
> import java.io.*;
> +import java.util.Map;
> +import java.util.HashMap;
>
> public class HTMLParser {
> public static int SUMMARY_LENGTH = 200;
> @@ -76,11 +78,13 @@
> boolean titleComplete = false;
> boolean inTitle = false;
> boolean inScript = false;
> + boolean inStyle = false;
> boolean afterTag = false;
> boolean afterSpace = false;
> String eol = System.getProperty("line.separator");
> PipedReader pipeIn = null;
> PipedWriter pipeOut;
> + HashMap metadata = new HashMap(7);
>
> public HTMLParser(File file) throws FileNotFoundException {
> this(new FileInputStream(file));
> @@ -109,15 +113,60 @@
> wait(10);
> }
> }
> - if (summary.length() > SUMMARY_LENGTH)
> - summary.setLength(SUMMARY_LENGTH);
> + // look in metadata
> + String description = (String) metadata.get("description");
> + if (description != null)
> + return description;
> + else {
> + if (summary.length() > SUMMARY_LENGTH)
> + summary.setLength(SUMMARY_LENGTH);
> +
> + String sum = summary.toString().trim();
> + String tit = getTitle();
> + if (sum.startsWith(tit))
> + return sum.substring(tit.length());
> + else
> + return sum;
> + }
> + }
> +
> + public String getAuthor() throws IOException,
> InterruptedException {
> + if (pipeIn == null)
> + getReader(); // spawn parsing thread
> + while (true) {
> + synchronized(this) {
> + if (summary.length() > 0) // assume that all metadata
> + break; // has already been collected
> + wait(10);
> + }
> + }
> + return (String)metadata.get("author");
> + }
> +
> + public String getKeywords() throws IOException,
> InterruptedException {
> + if (pipeIn == null)
> + getReader(); // spawn parsing thread
> + while (true) {
> + synchronized(this) {
> + if (summary.length() > 0) // assume that all metadata
> + break; // has already been collected
> + wait(10);
> + }
> + }
> + return (String)metadata.get("keywords");
> + }
>
> - String sum = summary.toString().trim();
> - String tit = getTitle();
> - if (sum.startsWith(tit))
> - return sum.substring(tit.length());
> - else
> - return sum;
> + public Map getMetadata() throws IOException, InterruptedException {
> + if (pipeIn == null)
> + getReader(); // spawn parsing thread
> + while (true) {
> + synchronized(this) {
> + if (summary.length() > 0) // assume that all metadata
> + break; // has already been collected
> + wait(10);
> + }
> + }
> + return metadata;
> }
>
> public Reader getReader() throws IOException {
> @@ -144,7 +193,7 @@
> }
>
> void addText(String text) throws IOException {
> - if (inScript)
> + if (inScript || inStyle)
> return;
> if (inTitle)
> title.append(text);
> @@ -165,7 +214,7 @@
> }
>
> void addSpace() throws IOException {
> - if (inScript)
> + if (inScript || inStyle)
> return;
> if (!afterSpace) {
> if (inTitle)
> @@ -216,23 +265,38 @@
> {
> Token t1, t2;
> boolean inImg = false;
> + boolean inMeta = false;
> + String name = null;
> + String content = null;
> }
> {
> t1=<TagName> {
> - inTitle = t1.image.equalsIgnoreCase("<title"); // keep
> track if in <TITLE>
> - inImg = t1.image.equalsIgnoreCase("<img"); //
> keep track if in <IMG>
> - if (inScript) { // keep track
> if in <SCRIPT>
> + inTitle = t1.image.equalsIgnoreCase("<title"); //
> keep track if in <TITLE>
> + inImg = t1.image.equalsIgnoreCase("<img"); //
> keep track if in <IMG>
> + inMeta = t1.image.equalsIgnoreCase("<meta"); //
> keep track if in <META>
> + if (inScript) { //
> keep track if in <SCRIPT>
> inScript = !t1.image.equalsIgnoreCase("</script");
> } else {
> inScript = t1.image.equalsIgnoreCase("<script");
> }
> + if (inStyle) { //
> keep track if in <STYLE>
> + inStyle = !t1.image.equalsIgnoreCase("</style");
> + } else {
> + inStyle = t1.image.equalsIgnoreCase("<style");
> + }
> }
> (t1=<ArgName>
> (<ArgEquals>
> - (t2=ArgValue() // save ALT
> text in IMG tag
> + (t2=ArgValue()
> {
> if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
> - addText("[" + t2.image + "]");
> + addText("[" + t2.image + "]"); // save ALT
> text in IMG tag
> + if (inMeta && t1.image.equalsIgnoreCase("name") && t2 != null)
> + name = t2.image.toLowerCase(); // save name
> in META tag
> + if (inMeta && t1.image.equalsIgnoreCase("content") &&
> t2 != null)
> + content = t2.image; // save
> content in META tag
> + if (inMeta && name != null && content != null)
> + metadata.put(name, content); // save metadata
> }
> )?
> )?
>
>
> --
> To unsubscribe, e-mail:
<mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>