Mailing List Archive

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util WebDocument.java
cmarschner 2002/06/17 07:16:13

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/util
WebDocument.java
Log:
added URLNormalizer

Revision Changes Path
1.4 +3 -2 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java

Index: WebDocument.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- WebDocument.java 1 Jun 2002 18:55:16 -0000 1.3
+++ WebDocument.java 17 Jun 2002 14:16:12 -0000 1.4
@@ -57,6 +57,7 @@

import java.net.URL;
import de.lanlab.larm.fetcher.URLMessage;
+import de.lanlab.larm.net.HostManager;

/**
* a web document of whatever type. generated by a fetcher task
@@ -69,9 +70,9 @@
protected int size;
protected String title;

- public WebDocument(URL url, String mimeType, byte[] document, int resultCode, URL referer, int size, String title)
+ public WebDocument(URL url, String mimeType, byte[] document, int resultCode, URL referer, int size, String title, HostManager hm)
{
- super(url, referer, false, null);
+ super(url, referer, false, null, hm);
this.url = url;
this.mimeType = mimeType;
this.document = document;




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>
cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util WebDocument.java [ In reply to ]
cmarschner 2002/06/17 17:46:35

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/util
WebDocument.java
Log:
changed web doc. to field/value pairs

Revision Changes Path
1.5 +47 -5 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java

Index: WebDocument.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- WebDocument.java 17 Jun 2002 14:16:12 -0000 1.4
+++ WebDocument.java 18 Jun 2002 00:46:35 -0000 1.5
@@ -56,6 +56,9 @@


import java.net.URL;
+import java.util.HashMap;
+import java.util.Date;
+import java.util.Set;
import de.lanlab.larm.fetcher.URLMessage;
import de.lanlab.larm.net.HostManager;

@@ -65,20 +68,55 @@
public class WebDocument extends URLMessage
{
protected String mimeType;
- protected byte[] document;
+ // protected byte[] document;
protected int resultCode;
protected int size;
protected String title;
+ protected Date lastModified;
+ HashMap fields;

- public WebDocument(URL url, String mimeType, byte[] document, int resultCode, URL referer, int size, String title, HostManager hm)
+ public WebDocument(URL url, String mimeType, int resultCode, URL referer, int size, String title, Date lastModified, HostManager hm)
{
super(url, referer, false, null, hm);
this.url = url;
this.mimeType = mimeType;
- this.document = document;
+ //this.document = document;
this.resultCode = resultCode;
this.size = size;
this.title = title;
+ this.lastModified = lastModified;
+ this.fields = new HashMap(7); // expect ~4 fields
+ }
+
+ public Set getFieldNames()
+ {
+ return fields.keySet();
+ }
+
+ public Object getField(String name)
+ {
+ return fields.get(name);
+ }
+
+ public void addField(String name, Object value)
+ {
+ fields.put(name, value);
+ }
+
+ public void removeField(String name)
+ {
+ fields.remove(name);
+ }
+
+ public int getNumFields()
+ {
+ return fields.size();
+ }
+
+
+ public Date getLastModified()
+ {
+ return lastModified;
}

public String getTitle()
@@ -101,11 +139,13 @@
this.size = size;
}

-
+/*
public void setDocument(byte[] document)
{
this.document = document;
}
+*/
+
public int getResultCode()
{
return resultCode;
@@ -116,10 +156,12 @@
this.resultCode = resultCode;
}

+/*
public byte[] getDocumentBytes()
{
return this.document;
}
+*/

public void setUrl(URL url)
{
@@ -142,7 +184,7 @@
this.resultCode + "\t" +
this.mimeType + "\t" +
this.size + "\t" +
- "\"" + this.title.replace('\t',' ').replace('\"', (char)0xff ).replace('\n',' ').replace('\r',' ') + "\"";
+ "\"" + this.title.replace('\t',' ').replace('\"', (char)0xff ).replace('\n',' ').replace('\r',' ') + "\"\t" + (this.lastModified != null ? java.text.DateFormat.getDateTimeInstance(java.text.DateFormat.SHORT, java.text.DateFormat.SHORT).format(this.lastModified) : "");
}






--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>