Hi,
I was playing with HTMLParser.jj and made some changes you might be interested in. I added handling of <META> tags: there are three
new methods (getAuthor, getKeywords and getMetadata), and getSummary now returns the metadata item with name=="description" when
one is present. I'm also filtering out any text inside <STYLE>...</STYLE> (the same way <SCRIPT> is already handled).
I've performed some tests and I believe I didn't break anything ;-)
The patch is as follows
Best regards,
--Daniel
Index: HTMLParser.jj
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
retrieving revision 1.1
diff -u -r1.1 HTMLParser.jj
--- HTMLParser.jj 26 Jan 2002 15:01:31 -0000 1.1
+++ HTMLParser.jj 15 Feb 2002 20:39:49 -0000
@@ -66,6 +66,8 @@
package org.apache.lucene.demo.html;
import java.io.*;
+import java.util.Map;
+import java.util.HashMap;
public class HTMLParser {
public static int SUMMARY_LENGTH = 200;
@@ -76,11 +78,13 @@
boolean titleComplete = false;
boolean inTitle = false;
boolean inScript = false;
+ boolean inStyle = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
PipedReader pipeIn = null;
PipedWriter pipeOut;
+ HashMap metadata = new HashMap(7);
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
@@ -109,15 +113,60 @@
wait(10);
}
}
- if (summary.length() > SUMMARY_LENGTH)
- summary.setLength(SUMMARY_LENGTH);
+ // look in metadata
+ String description = (String) metadata.get("description");
+ if (description != null)
+ return description;
+ else {
+ if (summary.length() > SUMMARY_LENGTH)
+ summary.setLength(SUMMARY_LENGTH);
+
+ String sum = summary.toString().trim();
+ String tit = getTitle();
+ if (sum.startsWith(tit))
+ return sum.substring(tit.length());
+ else
+ return sum;
+ }
+ }
+
+ public String getAuthor() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return (String)metadata.get("author");
+ }
+
+ public String getKeywords() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return (String)metadata.get("keywords");
+ }
- String sum = summary.toString().trim();
- String tit = getTitle();
- if (sum.startsWith(tit))
- return sum.substring(tit.length());
- else
- return sum;
+ public Map getMetadata() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return metadata;
}
public Reader getReader() throws IOException {
@@ -144,7 +193,7 @@
}
void addText(String text) throws IOException {
- if (inScript)
+ if (inScript || inStyle)
return;
if (inTitle)
title.append(text);
@@ -165,7 +214,7 @@
}
void addSpace() throws IOException {
- if (inScript)
+ if (inScript || inStyle)
return;
if (!afterSpace) {
if (inTitle)
@@ -216,23 +265,38 @@
{
Token t1, t2;
boolean inImg = false;
+ boolean inMeta = false;
+ String name = null;
+ String content = null;
}
{
t1=<TagName> {
- inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
- inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
- if (inScript) { // keep track if in <SCRIPT>
+ inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
+ inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
+ inMeta = t1.image.equalsIgnoreCase("<meta"); // keep track if in <META>
+ if (inScript) { // keep track if in <SCRIPT>
inScript = !t1.image.equalsIgnoreCase("</script");
} else {
inScript = t1.image.equalsIgnoreCase("<script");
}
+ if (inStyle) { // keep track if in <STYLE>
+ inStyle = !t1.image.equalsIgnoreCase("</style");
+ } else {
+ inStyle = t1.image.equalsIgnoreCase("<style");
+ }
}
(t1=<ArgName>
(<ArgEquals>
- (t2=ArgValue() // save ALT text in IMG tag
+ (t2=ArgValue()
{
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
- addText("[" + t2.image + "]");
+ addText("[" + t2.image + "]"); // save ALT text in IMG tag
+ if (inMeta && t1.image.equalsIgnoreCase("name") && t2 != null)
+ name = t2.image.toLowerCase(); // save name in META tag
+ if (inMeta && t1.image.equalsIgnoreCase("content") && t2 != null)
+ content = t2.image; // save content in META tag
+ if (inMeta && name != null && content != null)
+ metadata.put(name, content); // save metadata
}
)?
)?
--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>
I was playing with HTMLParser.jj and made some changes you might be interested in. I added handling of <META> tags: there are three
new methods (getAuthor, getKeywords and getMetadata), and getSummary now returns the metadata item with name=="description" when
one is present. I'm also filtering out any text inside <STYLE>...</STYLE> (the same way <SCRIPT> is already handled).
I've performed some tests and I believe I didn't break anything ;-)
The patch is as follows
Best regards,
--Daniel
Index: HTMLParser.jj
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
retrieving revision 1.1
diff -u -r1.1 HTMLParser.jj
--- HTMLParser.jj 26 Jan 2002 15:01:31 -0000 1.1
+++ HTMLParser.jj 15 Feb 2002 20:39:49 -0000
@@ -66,6 +66,8 @@
package org.apache.lucene.demo.html;
import java.io.*;
+import java.util.Map;
+import java.util.HashMap;
public class HTMLParser {
public static int SUMMARY_LENGTH = 200;
@@ -76,11 +78,13 @@
boolean titleComplete = false;
boolean inTitle = false;
boolean inScript = false;
+ boolean inStyle = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
PipedReader pipeIn = null;
PipedWriter pipeOut;
+ HashMap metadata = new HashMap(7);
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
@@ -109,15 +113,60 @@
wait(10);
}
}
- if (summary.length() > SUMMARY_LENGTH)
- summary.setLength(SUMMARY_LENGTH);
+ // look in metadata
+ String description = (String) metadata.get("description");
+ if (description != null)
+ return description;
+ else {
+ if (summary.length() > SUMMARY_LENGTH)
+ summary.setLength(SUMMARY_LENGTH);
+
+ String sum = summary.toString().trim();
+ String tit = getTitle();
+ if (sum.startsWith(tit))
+ return sum.substring(tit.length());
+ else
+ return sum;
+ }
+ }
+
+ public String getAuthor() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return (String)metadata.get("author");
+ }
+
+ public String getKeywords() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return (String)metadata.get("keywords");
+ }
- String sum = summary.toString().trim();
- String tit = getTitle();
- if (sum.startsWith(tit))
- return sum.substring(tit.length());
- else
- return sum;
+ public Map getMetadata() throws IOException, InterruptedException {
+ if (pipeIn == null)
+ getReader(); // spawn parsing thread
+ while (true) {
+ synchronized(this) {
+ if (summary.length() > 0) // assume that all metadata
+ break; // has already been collected
+ wait(10);
+ }
+ }
+ return metadata;
}
public Reader getReader() throws IOException {
@@ -144,7 +193,7 @@
}
void addText(String text) throws IOException {
- if (inScript)
+ if (inScript || inStyle)
return;
if (inTitle)
title.append(text);
@@ -165,7 +214,7 @@
}
void addSpace() throws IOException {
- if (inScript)
+ if (inScript || inStyle)
return;
if (!afterSpace) {
if (inTitle)
@@ -216,23 +265,38 @@
{
Token t1, t2;
boolean inImg = false;
+ boolean inMeta = false;
+ String name = null;
+ String content = null;
}
{
t1=<TagName> {
- inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
- inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
- if (inScript) { // keep track if in <SCRIPT>
+ inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
+ inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
+ inMeta = t1.image.equalsIgnoreCase("<meta"); // keep track if in <META>
+ if (inScript) { // keep track if in <SCRIPT>
inScript = !t1.image.equalsIgnoreCase("</script");
} else {
inScript = t1.image.equalsIgnoreCase("<script");
}
+ if (inStyle) { // keep track if in <STYLE>
+ inStyle = !t1.image.equalsIgnoreCase("</style");
+ } else {
+ inStyle = t1.image.equalsIgnoreCase("<style");
+ }
}
(t1=<ArgName>
(<ArgEquals>
- (t2=ArgValue() // save ALT text in IMG tag
+ (t2=ArgValue()
{
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
- addText("[" + t2.image + "]");
+ addText("[" + t2.image + "]"); // save ALT text in IMG tag
+ if (inMeta && t1.image.equalsIgnoreCase("name") && t2 != null)
+ name = t2.image.toLowerCase(); // save name in META tag
+ if (inMeta && t1.image.equalsIgnoreCase("content") && t2 != null)
+ content = t2.image; // save content in META tag
+ if (inMeta && name != null && content != null)
+ metadata.put(name, content); // save metadata
}
)?
)?
--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>