cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher FetcherTask.java
cmarschner 2002/10/22 08:02:43

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
FetcherTask.java
Log:
Better handling of HTTP status codes
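
In outline, FetcherTask.run() now switches on the HTTP response code instead of
the old single "statusCode != 404 && statusCode != 403" check. A condensed
sketch of the new control flow, paraphrased from the diff below (not the
committed code; locking, logging and exception handling are left out):

    switch (statusCode)
    {
        case 404:                                // file not found
        case 403:                                // access forbidden
            if (isIncremental)
            {
                // pass the status code on so the repository drops the stale doc
                WebDocument d = (WebDocument) actURLMessage;
                d.setResultCode(statusCode);
                docStorage.store(d);
            }
            break;
        case 304:                                // not modified (incremental crawl)
            // keep the stored copy; the repository re-attaches its links
            WebDocument doc = (WebDocument) actURLMessage;
            doc.setModified(false);
            docStorage.store(doc);
            break;
        case 301:                                // moved permanently
        case 302:                                // moved temporarily
        case 303:                                // see other
        case 307:                                // temporary redirect
            // record the target as a LINKTYPE_REDIRECT link and stop, so the
            // target has to pass the filters before it is fetched itself
            URL realURL = response.getEffectiveURI().toURL();
            foundUrls.add(new URLMessage(realURL, contextUrl,
                    URLMessage.LINKTYPE_REDIRECT, "", hostResolver));
            linkStorage.storeLinks(foundUrls);
            break;
        default:
            // usually 200, but may also be a 30x that HTTPClient followed
            // itself; in that case the effective URL is recorded as a
            // LINKTYPE_REDIRECT link so it is not crawled twice, then the
            // body is read, parsed and stored as before
            break;
    }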

Revision Changes Path
1.6 +330 -157 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java
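
The revision also contains a first cut of incremental recrawling, spread over
several hunks of the diff: when the incoming message is already a stored
WebDocument, an If-Modified-Since header is sent, and a 304 answer keeps the
old copy. Put together, roughly (a sketch stitched from the patch; headers and
numHeaders come from FetcherThread.getDefaultHeaders()/getUsedDefaultHeaders(),
and the NVPair/Util calls are the HTTPClient ones the patch already uses):

    // request side: announce the timestamp of the stored copy
    boolean isIncremental = false;
    if (actURLMessage instanceof WebDocument)
    {
        isIncremental = true;
        headers[numHeaders] = new NVPair("If-Modified-Since",
                HTTPClient.Util.httpDate(((WebDocument) actURLMessage).getLastModified()));
    }

    // response side, when the document did change (status code 200):
    // the existing WebDocument is refreshed in place instead of re-created
    if (isIncremental)
    {
        WebDocument d = (WebDocument) actURLMessage;
        d.setModified(true);                 // file is new or newer
        d.setResultCode(statusCode);
        d.setLastModified(date);
        // ... mime type, size, title and content fields are filled exactly
        //     as in the full-crawl path
    }

A 304 or a 404/403 is simply stored as well; the repository decides from
setModified()/setResultCode() whether to keep or remove the document.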

Index: FetcherTask.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- FetcherTask.java 18 Jun 2002 00:45:10 -0000 1.5
+++ FetcherTask.java 22 Oct 2002 15:02:43 -0000 1.6
@@ -1,57 +1,57 @@
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
+/*
+ * ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
*/
-
package de.lanlab.larm.fetcher;

import java.net.URL;
@@ -78,17 +78,20 @@
* this class gets the documents from the web. It connects to the server given
* by the IP address in the URLMessage, gets the document, and forwards it to
* the storage. If it's an HTML document, it will be parsed and all links will
- * be put into the message handler again.
- *
- * stores contents of the files in field "contents"
+ * be put into the message handler again. stores contents of the files in field
+ * "contents"
*
* @author Clemens Marschner
- * @version $Id$
+ * @created 28 June 2002
+ * @version $Id$
*/
public class FetcherTask
implements InterruptableTask, LinkHandler, Serializable
{

+ /**
+ * Description of the Field
+ */
protected volatile boolean isInterrupted = false;

/**
@@ -109,8 +112,7 @@
private volatile URL base;

/**
- * the URL of the docuzment
- * only valid within a doTask call
+ * the URL of the document; only valid within a doTask call
*/
private volatile URL contextUrl;

@@ -120,8 +122,7 @@
protected static volatile MessageHandler messageHandler;

/**
- * actual number of bytes read
- * only valid within a doTask call
+ * actual number of bytes read only valid within a doTask call
*/
private volatile long bytesRead = 0;

@@ -135,30 +136,61 @@
*/
private static volatile LinkStorage linkStorage;

-
-
/**
* task state IDs. comparisons will be done by their references, so always
* use the IDs
*/
public final static String FT_IDLE = "idle";
+ /**
+ * Description of the Field
+ */
public final static String FT_STARTED = "started";
+ /**
+ * Description of the Field
+ */
public final static String FT_OPENCONNECTION = "opening connection";
+ /**
+ * Description of the Field
+ */
public final static String FT_CONNECTING = "connecting";
+ /**
+ * Description of the Field
+ */
public final static String FT_GETTING = "getting";
+ /**
+ * Description of the Field
+ */
public final static String FT_READING = "reading";
+ /**
+ * Description of the Field
+ */
public final static String FT_SCANNING = "scanning";
+ /**
+ * Description of the Field
+ */
public final static String FT_STORING = "storing";
+ /**
+ * Description of the Field
+ */
public final static String FT_READY = "ready";
+ /**
+ * Description of the Field
+ */
public final static String FT_CLOSING = "closing";
+ /**
+ * Description of the Field
+ */
public final static String FT_EXCEPTION = "exception";
+ /**
+ * Description of the Field
+ */
public final static String FT_INTERRUPTED = "interrupted";

private volatile State taskState = new State(FT_IDLE);

/**
- * the URLs found will be stored and only added to the message handler in the very
- * end, to avoid too many synchronizations
+ * the URLs found will be stored and only added to the message handler in
+ * the very end, to avoid too many synchronizations
*/
private volatile LinkedList foundUrls;

@@ -172,17 +204,6 @@
*/
private volatile String title;

- /**
- * headers for HTTPClient
- */
- private static volatile NVPair headers[] = new NVPair[1];
-
- static
- {
- headers[0] = new HTTPClient.NVPair("User-Agent", Constants.CRAWLER_AGENT);
-
- }
-

/**
* Gets a copy of the current taskState
@@ -198,7 +219,7 @@
/**
* Constructor for the FetcherTask object
*
- * @param urlMessage Description of the Parameter
+ * @param urlMessage Description of the Parameter
*/
public FetcherTask(URLMessage urlMessage)
{
@@ -227,6 +248,7 @@
FetcherTask.docStorage = docStorage;
}

+
/**
* Sets the document linkStorage
*
@@ -268,27 +290,54 @@
return actURLMessage.getUrl();
}

+
volatile SimpleLogger log;

volatile SimpleLogger errorLog;

volatile HostManager hostManager;
+ volatile HostResolver hostResolver;
+
//private long startTime;

/**
* this will be called by the fetcher thread and will do all the work
*
- * @TODO probably split this up into different processing steps
* @param thread Description of the Parameter
+ * @TODO probably split this up into different processing steps
*/
public void run(ServerThread thread)
{

- taskState.setState(FT_STARTED); // state information is always set to make the thread monitor happy
+
+ taskState.setState(FT_STARTED);
+ // state information is always set to make the thread monitor happy

log = thread.getLog();
- hostManager = ((FetcherThread)thread).getHostManager();
+ hostManager = ((FetcherThread) thread).getHostManager();
+ hostResolver = hostManager.getHostResolver();
+ base = contextUrl = actURLMessage.getUrl();
+ String urlString = actURLMessage.getURLString();
+ String host = contextUrl.getHost().toLowerCase();
+ HostInfo hi = hostManager.getHostInfo(host);
+// System.out.println("FetcherTask with " + urlString + " started");
+ if(actURLMessage.linkType == URLMessage.LINKTYPE_REDIRECT)
+ {
+ taskState.setState(FT_READY, null);
+ hi.releaseLock();
+ return; // we've already crawled that (see below)
+ }

+ NVPair[] headers = ((FetcherThread) thread).getDefaultHeaders();
+ int numHeaders = ((FetcherThread) thread).getUsedDefaultHeaders();
+ boolean isIncremental = false;
+ if (actURLMessage instanceof WebDocument)
+ {
+ // this is an incremental crawl where we only have to check whether the doc crawled
+ // is newer
+ isIncremental = true;
+ headers[numHeaders] = new NVPair("If-Modified-Since", HTTPClient.Util.httpDate(((WebDocument) actURLMessage).getLastModified()));
+ }
//HostManager hm = ((FetcherThread)thread).getHostManager();

errorLog = thread.getErrorLog();
@@ -297,21 +346,19 @@
int threadNr = ((FetcherThread) thread).getThreadNumber();

log.log("start");
- base = contextUrl = actURLMessage.getUrl();
- String urlString = actURLMessage.getURLString();
- String host = contextUrl.getHost().toLowerCase();
int hostPos = urlString.indexOf(host);
int hostLen = host.length();

- HostInfo hi = hostManager.getHostInfo(host); // get and create
+ // get and create

- if(!hi.isHealthy())
+ if (!hi.isHealthy())
{
// we make this check as late as possible to get the most current information
log.log("Bad Host: " + contextUrl + "; returning");
- System.out.println("[" + threadNr + "] bad host: " + this.actURLMessage.getUrl());
+// System.out.println("[" + threadNr + "] bad host: " + this.actURLMessage.getUrl());

taskState.setState(FT_READY, null);
+ hi.releaseLock();
return;
}

@@ -319,14 +366,13 @@

HTTPConnection conn = null;

- title = "*untitled*";
+ title = "";

int size = 1;

InputStream in = null;
bytesRead = 0;

-
try
{

@@ -339,6 +385,7 @@
conn = new HTTPConnection(host);

conn.setDefaultTimeout(75000);
+
// 75 s
conn.setDefaultAllowUserInteraction(false);

@@ -353,67 +400,176 @@
int contentLength = 0;
Date date = null;

- if (statusCode != 404 && statusCode != 403)
+ if (isIncremental)
{
- // read up to Constants.FETCHERTASK_MAXFILESIZE bytes into a byte array
- taskState.setState(FT_READING, ipURL);
- contentType = response.getHeader("Content-Type");
- String length = response.getHeader("Content-Length");
- date = response.getHeaderAsDate("Last-Modified");
-
- if (length != null)
- {
- contentLength = Integer.parseInt(length);
- }
- log.log("reading");
-
- fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE); // max. 2 MB
- base = contextUrl = response.getEffectiveURI().toURL();
- // may have changed after a 30x result code
- // to do: record the link between original and effective URL
- // like this the effectiveURL may be crawled twice
-
-
- if (fullBuffer != null)
- {
- contentLength = fullBuffer.length;
- this.bytesRead += contentLength;
- }
+ // experimental
+ System.out.println("ftask: if modified since: " + HTTPClient.Util.httpDate(((WebDocument) actURLMessage).getLastModified()));
}
- //conn.stop(); // close connection. todo: Do some caching...

+ URL realURL;

- /*
- * conn.disconnect();
- */
- if (isInterrupted)
+ switch (statusCode)
{
- System.out.println("FetcherTask: interrupted while reading. File truncated");
- log.log("interrupted while reading. File truncated");
- }
- else
- {
- if (fullBuffer != null)
- {
- taskState.setState(FT_SCANNING, ipURL);
+ case 404: // file not found
+ case 403: // access forbidden

- log.log("read file (" + fullBuffer.length + " bytes). Now scanning.");
+ // if this is an incremental crawl, remove the doc from the repository
+ if (isIncremental)
+ {
+ WebDocument d = (WebDocument) actURLMessage;
+ d.setResultCode(statusCode);
+ // the repository will remove the doc if this statuscode is matched
+ docStorage.store(d);
+ }
+ // otherwise, do nothing
+ // TODO: we could add an error marker to the referring link
+ break;
+ case 304:
+ // not modified
+ System.out.println("ftask: -> not modified");
+ // "not modified since"
+ taskState.setState(FT_STORING, ipURL);
+ // let the repository take care of the links
+ // it will determine that this is the old document (because it already
+ // has a docId), and will put back the links associated with it
+ try
+ {
+ WebDocument doc = (WebDocument) this.actURLMessage;
+ doc.setModified(false);
+ docStorage.store(doc);
+ this.bytesRead += doc.getSize();
+ }
+ catch (ClassCastException e)
+ {
+ System.out.println("error while casting to WebDoc: " + actURLMessage.getInfo());
+ }
+ break;
+ case 301: // moved permanently
+ case 302: // moved temporarily
+ case 303: // see other
+ case 307: // temporary redirect
+ /*
+ * this is a redirect. save it as a link and return.
+ * note that we could read the doc from the open connection here, but this could mean
+ * the filters were useless
+ */
+ realURL = response.getEffectiveURI().toURL();
+ foundUrls.add(new URLMessage(realURL, contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostResolver));
+ linkStorage.storeLinks(foundUrls);
+ break;
+ default:
+ // this can be a 30x code that was resolved by the HTTPClient and is passed to us as 200
+ // we could turn this off and do it ourselves. But then we'd have to take care that
+ // we don't get into an endless redirection loop -> i.e. extend URLMessage by a counter
+ // at the moment we add the real URL to the message queue and mark it as a REDIRECT link
+ // that way it is added to the visited filter. Then we take care that we don't crawl it again
+
+ // the other possibility is that we receive a "Location:" header along with a 200 status code
+ // I have experienced that HTTPClient has an error with parsing this, so we do it ourselves
+ //String location = response.getHeader("Location");
+ realURL = response.getEffectiveURI().toURL();

- if (contentType.startsWith("text/html"))
+ /*if(location != null)
+ {
+ //System.out.println("interesting: location header with url " + location);
+ foundUrls.add(new URLMessage(new URL(location), contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostManager));
+ this.base = this.contextUrl = location;
+ }
+ else*/
+ if(!(realURL.equals(contextUrl)))
{
+ //System.out.println("interesting: redirect with url " + realURL + " -context: " + contextUrl);
+ foundUrls.add(new URLMessage(realURL, contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostResolver));
+ this.base = this.contextUrl = realURL;
+ //System.out.println(response);

- // ouch. I haven't found a better solution yet. just slower ones.
- char[] fullCharBuffer = new char[contentLength];
- new InputStreamReader(new ByteArrayInputStream(fullBuffer)).read(fullCharBuffer);
- Tokenizer tok = new Tokenizer();
- tok.setLinkHandler(this);
- tok.parse(new SimpleCharArrayReader(fullCharBuffer));
+ }

- taskState.setState(FT_STORING, ipURL);
- linkStorage.storeLinks(foundUrls);
- WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
- d.addField("content", fullCharBuffer);
- docStorage.store(d);
+
+
+
+ if (isIncremental)
+ {
+ // experimental
+ System.out.println("ftask: -> was modified at " + response.getHeaderAsDate("Last-Modified"));
+ }
+ // read up to Constants.FETCHERTASK_MAXFILESIZE bytes into a byte array
+ taskState.setState(FT_READING, ipURL);
+ contentType = response.getHeader("Content-Type");
+ String length = response.getHeader("Content-Length");
+ date = response.getHeaderAsDate("Last-Modified");
+
+ if (length != null)
+ {
+ contentLength = Integer.parseInt(length);
+ }
+ log.log("reading");
+ realURL = response.getEffectiveURI().toURL();
+ if (contentType != null && contentType.startsWith("text/html"))
+ {
+ fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE);
+ hi.releaseLock();
+ // max. 2 MB
+ if (fullBuffer != null)
+ {
+ contentLength = fullBuffer.length;
+ this.bytesRead += contentLength;
+ }
+
+ /*
+ * conn.disconnect();
+ */
+ if (isInterrupted)
+ {
+ System.out.println("FetcherTask: interrupted while reading. File truncated");
+ log.log("interrupted while reading. File truncated");
+ }
+ else
+ {
+ if (fullBuffer != null)
+ {
+ taskState.setState(FT_SCANNING, ipURL);
+
+ log.log("read file (" + fullBuffer.length + " bytes). Now scanning.");
+
+ // convert the bytes to Java characters
+ // ouch. I haven't found a better solution yet. just slower ones.
+ // remember: for better runtime performance avoid decorators, since they
+ // multiply function calls
+ char[] fullCharBuffer = new char[contentLength];
+ new InputStreamReader(new ByteArrayInputStream(fullBuffer)).read(fullCharBuffer);
+ Tokenizer tok = new Tokenizer();
+ tok.setLinkHandler(this);
+ tok.parse(new SimpleCharArrayReader(fullCharBuffer));
+
+ taskState.setState(FT_STORING, ipURL);
+ linkStorage.storeLinks(foundUrls);
+ WebDocument d;
+ if (isIncremental)
+ {
+ d = ((WebDocument) this.actURLMessage);
+ d.setModified(true);
+ // file is new or newer
+ d.setUrl(contextUrl);
+ d.setMimeType(contentType);
+ d.setResultCode(statusCode);
+ d.setSize(contentLength);
+ d.setTitle(title);
+ d.setLastModified(date);
+ }
+ else
+ {
+ d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostResolver);
+ }
+ d.addField("content", fullCharBuffer);
+ d.addField("contentBytes", fullBuffer);
+ docStorage.store(d);
+ }
+
+ log.log("scanned");
+ }
+
+ log.log("stored");
}
else
{
@@ -421,15 +577,22 @@
//errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
taskState.setState(FT_STORING, ipURL);
linkStorage.storeLinks(foundUrls);
- WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
- d.addField("content", fullBuffer);
+ WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(),
+ /*
+ * contentLength
+ */
+ 0, title, date, hostResolver);
+ //d.addField("content", fullBuffer);
+ //d.addField("content", null);
docStorage.store(d);
}
- log.log("scanned");
- }
-
- log.log("stored");
+ break;
}
+ /*
+ * switch
+ */
+ //conn.stop(); // close connection. todo: Do some caching...
+
}
catch (InterruptedIOException e)
{
@@ -444,7 +607,7 @@
//System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl());
errorLog.log("error: File not Found: " + this.actURLMessage.getUrl());
}
- catch(NoRouteToHostException e)
+ catch (NoRouteToHostException e)
{
// router is down or firewall prevents to connect
hi.setReachable(false);
@@ -453,7 +616,7 @@
// e.printStackTrace();
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
}
- catch(ConnectException e)
+ catch (ConnectException e)
{
// no server is listening at this port
hi.setReachable(false);
@@ -461,6 +624,7 @@
//System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace();
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
+
}
catch (SocketException e)
{
@@ -469,7 +633,7 @@
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());

}
- catch(UnknownHostException e)
+ catch (UnknownHostException e)
{
// IP Address not to be determined
hi.setReachable(false);
@@ -500,10 +664,10 @@
e.printStackTrace();
System.out.println("[" + threadNr + "]: stopping");
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage() + "; stopping");
-
}
finally
{
+ hi.releaseLock();

if (isInterrupted)
{
@@ -521,7 +685,6 @@
*/
taskState.setState(FT_CLOSING);
conn.stop();
-
taskState.setState(FT_READY);
foundUrls = null;
}
@@ -529,7 +692,8 @@

/**
* the interrupt method. not in use since the change to HTTPClient
- * @TODO decide if we need this anymore
+ *
+ * @TODO decide if we need this anymore
*/
public void interrupt()
{
@@ -563,11 +727,12 @@


/**
- * this is called whenever a link was found in the current document,
- * Don't create too many objects here, as this will be called
- * millions of times
+ * this is called whenever a link was found in the current document, Don't
+ * create too many objects here, as this will be called millions of times
*
- * @param link Description of the Parameter
+ * @param link Description of the Parameter
+ * @param anchor Description of the Parameter
+ * @param isFrame Description of the Parameter
*/
public void handleLink(String link, String anchor, boolean isFrame)
{
@@ -599,8 +764,11 @@
// relative url
url = new URL(base, link);
}
-
- URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame, anchor, hostManager);
+ if(url.getPath() == null || url.getPath().length() == 0)
+ {
+ url = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/" + url.getFile());
+ }
+ URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame ? URLMessage.LINKTYPE_FRAME : URLMessage.LINKTYPE_ANCHOR, anchor, hostResolver);

//String urlString = urlMessage.getURLString();

@@ -669,6 +837,11 @@
* {
* /System.out.println("Task " + this.taskNr + " finished (" + totalRead + " bytes in " + timeElapsed + " ms with " + totalRead / (timeElapsed / 1000.0) + " bytes/s)");
* }
+ */
+ /**
+ * Gets the bytesRead attribute of the FetcherTask object
+ *
+ * @return The bytesRead value
*/
public long getBytesRead()
{
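
A smaller change that is easy to miss sits in handleLink() (hunk
@@ -599,8 +764,11 @@ above): links whose URL has no path component now get a
"/" appended, presumably so the visited filter sees one canonical form per
host. A self-contained illustration with plain java.net.URL (class and
variable names are made up for the example):

    import java.net.URL;

    public class PathNormalizeExample
    {
        public static void main(String[] args) throws Exception
        {
            URL url = new URL("http://www.example.org");          // no path
            if (url.getPath() == null || url.getPath().length() == 0)
            {
                // same normalization as in handleLink()
                url = new URL(url.getProtocol(), url.getHost(), url.getPort(),
                        "/" + url.getFile());
            }
            System.out.println(url);   // prints http://www.example.org/
        }
    }

Without this, "http://www.example.org" and "http://www.example.org/" could be
treated as two different URLs and end up being fetched twice.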




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>