Mailing List Archive

cvs commit: jakarta-lucene/src/demo/org/apache/lucene/demo/html HTMLParser.jj
otis 2002/06/29 15:08:27

Modified: src/demo/org/apache/lucene/demo/html HTMLParser.jj
Log:
- Improved HTML parser that allows one to get the values of an HTML document's meta tags.
Submitted by: Mark Harwood
Reviewed by: otis

Revision Changes Path
1.2 +48 -5 jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj

Index: HTMLParser.jj
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- HTMLParser.jj 26 Jan 2002 15:01:31 -0000 1.1
+++ HTMLParser.jj 29 Jun 2002 22:08:26 -0000 1.2
@@ -66,15 +66,20 @@
package org.apache.lucene.demo.html;

import java.io.*;
+import java.util.Properties;

public class HTMLParser {
public static int SUMMARY_LENGTH = 200;
-
+
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
+ Properties metaTags=new Properties();
+ String currentMetaTag="";
int length = 0;
boolean titleComplete = false;
boolean inTitle = false;
+ boolean inMetaTag = false;
+ boolean inStyle = false;
boolean inScript = false;
boolean afterTag = false;
boolean afterSpace = false;
@@ -99,6 +104,21 @@
return title.toString().trim();
}

+ public Properties getMetaTags() throws IOException, // returns the Properties that addText() fills with <META> content values
+InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) { // poll until the parser has progressed far enough
+ synchronized(this) {
+ if (titleComplete || (length > SUMMARY_LENGTH)) // stop once the title is complete or length has passed SUMMARY_LENGTH
+ break;
+ wait(10); // wait up to 10 ms on this object's monitor, then re-check the condition
+ }
+ }
+ return metaTags; // NOTE(review): hands back the parser's own Properties instance, not a copy; META tags parsed after this wait ends would presumably still be added to it -- confirm
+ }
+
+
public String getSummary() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
@@ -124,7 +144,7 @@
if (pipeIn == null) {
pipeIn = new PipedReader();
pipeOut = new PipedWriter(pipeIn);
-
+
Thread thread = new ParserThread(this);
thread.start(); // start parsing
}
@@ -146,6 +166,13 @@
void addText(String text) throws IOException {
if (inScript)
return;
+ if (inStyle)
+ return;
+ if (inMetaTag)
+ {
+ metaTags.setProperty(currentMetaTag, text);
+ return;
+ }
if (inTitle)
title.append(text);
else {
@@ -163,7 +190,7 @@

afterSpace = false;
}
-
+
void addSpace() throws IOException {
if (inScript)
return;
@@ -172,7 +199,7 @@
title.append(" ");
else
addToSummary(" ");
-
+
String space = afterTag ? eol : " ";
length += space.length();
pipeOut.write(space);
@@ -220,6 +247,8 @@
{
t1=<TagName> {
inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
+ inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META>
+ inStyle = t1.image.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
if (inScript) { // keep track if in <SCRIPT>
inScript = !t1.image.equalsIgnoreCase("</script");
@@ -233,6 +262,20 @@
{
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
addText("[" + t2.image + "]");
+
+ if(inMetaTag &&
+ ( t1.image.equalsIgnoreCase("name") ||
+ t1.image.equalsIgnoreCase("HTTP-EQUIV")
+ )
+ && t2 != null)
+ {
+ currentMetaTag=t2.image.toLowerCase();
+ }
+ if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
+null)
+ {
+ addText(t2.image);
+ }
}
)?
)?
@@ -272,7 +315,7 @@
|
(<Comment2> ( <CommentText2> )* <CommentEnd2>)
}
-
+

TOKEN :
{




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>