Mailing List Archive

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher FetcherTask.java
cmarschner 2002/06/17 06:58:33

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
FetcherTask.java
Log:
fixed bug: doc is now saved under the new URL if a 301/302 redirect occurred

Revision Changes Path
1.4 +24 -10 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java

Index: FetcherTask.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- FetcherTask.java 1 Jun 2002 18:55:15 -0000 1.3
+++ FetcherTask.java 17 Jun 2002 13:58:33 -0000 1.4
@@ -65,7 +65,6 @@

import de.lanlab.larm.util.State;
import de.lanlab.larm.util.SimpleLogger;
-import de.lanlab.larm.net.HttpTimeoutFactory;
import HTTPClient.*;
import java.net.*;
import java.io.*;
@@ -73,6 +72,7 @@
import java.text.*;
import de.lanlab.larm.parser.Tokenizer;
import de.lanlab.larm.parser.LinkHandler;
+import de.lanlab.larm.net.*;

/**
* this class gets the documents from the web. It connects to the server given
@@ -266,8 +266,11 @@
return actURLMessage.getUrl();
}

- SimpleLogger log;
- SimpleLogger errorLog;
+ volatile SimpleLogger log;
+
+ volatile SimpleLogger errorLog;
+
+ volatile HostManager hostManager;
//private long startTime;

/**
@@ -282,7 +285,9 @@
taskState.setState(FT_STARTED); // state information is always set to make the thread monitor happy

log = thread.getLog();
- HostManager hm = ((FetcherThread)thread).getHostManager();
+ hostManager = ((FetcherThread)thread).getHostManager();
+
+ //HostManager hm = ((FetcherThread)thread).getHostManager();

errorLog = thread.getErrorLog();

@@ -292,11 +297,11 @@
log.log("start");
base = contextUrl = actURLMessage.getUrl();
String urlString = actURLMessage.getURLString();
- String host = contextUrl.getHost();
+ String host = contextUrl.getHost().toLowerCase();
int hostPos = urlString.indexOf(host);
int hostLen = host.length();

- HostInfo hi = hm.getHostInfo(host); // get and create
+ HostInfo hi = hostManager.getHostInfo(host); // get and create

if(!hi.isHealthy())
{
@@ -344,6 +349,7 @@
byte[] fullBuffer = null;
String contentType = "";
int contentLength = 0;
+ Date date = null;

if (statusCode != 404 && statusCode != 403)
{
@@ -351,6 +357,8 @@
taskState.setState(FT_READING, ipURL);
contentType = response.getHeader("Content-Type");
String length = response.getHeader("Content-Length");
+ date = response.getHeaderAsDate("Last-Modified");
+
if (length != null)
{
contentLength = Integer.parseInt(length);
@@ -358,6 +366,12 @@
log.log("reading");

fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE); // max. 2 MB
+ base = contextUrl = response.getEffectiveURI().toURL();
+ // may have changed after a 30x result code
+ // to do: record the link between original and effective URL
+ // like this the effectiveURL may be crawled twice
+
+
if (fullBuffer != null)
{
contentLength = fullBuffer.length;
@@ -403,7 +417,7 @@
taskState.setState(FT_STORING, ipURL);
linkStorage.storeLinks(foundUrls);
//messageHandler.putMessages(foundUrls);
- docStorage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title));
+ docStorage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title, hostManager));
log.log("stored");
}
}
@@ -576,9 +590,9 @@
url = new URL(base, link);
}

- URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame, anchor);
+ URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame, anchor, hostManager);

- String urlString = urlMessage.getURLString();
+ //String urlString = urlMessage.getURLString();

foundUrls.add(urlMessage);
//messageHandler.putMessage(new actURLMessage(url)); // put them in the very end




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>