Mailing List Archive

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher RobotExclusionFilter.java
cmarschner 2002/10/22 08:15:07

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
RobotExclusionFilter.java
Log:
improved logging

Revision Changes Path
1.4 +34 -20 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java

Index: RobotExclusionFilter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- RobotExclusionFilter.java 17 Jun 2002 13:59:28 -0000 1.3
+++ RobotExclusionFilter.java 22 Oct 2002 15:15:07 -0000 1.4
@@ -121,9 +121,9 @@
*/
public RobotExclusionFilter(HostManager hm)
{
- log = new SimpleLogger("RobotExclusionFilter");
+ log = new SimpleLogger("RobotExclusionFilter", true);
hostManager = hm;
- rePool = new ThreadPool(2, new REFThreadFactory());
+ rePool = new ThreadPool(5, new REFThreadFactory());
rePool.init();
log.setFlushAtOnce(true);
log.log("refilter: initialized");
@@ -164,19 +164,21 @@
// assert message instanceof URLMessage;
URLMessage urlMsg = ((URLMessage) message);
URL url = urlMsg.getUrl();
+// String urlString = urlMsg.getNormalizedURLString();
+// URL nUrl = new URL(urlString);
//assert url != null;
- HostInfo h = hostManager.getHostInfo(url.getHost().toLowerCase());
- if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
+ HostInfo h = hostManager.getHostInfo(url.getHost());
+ synchronized (h)
{
- log.logThreadSafe("handleRequest: starting to get robots.txt");
- // probably this results in Race Conditions here
+ if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
+ {
+ log.logThreadSafe("handleRequest: starting to get robots.txt");
+ // probably this results in Race Conditions here

- rePool.doTask(new RobotExclusionTask(h), new Integer(h.getId()));
- h.setLoadingRobotsTxt(true);
- }
+ rePool.doTask(new RobotExclusionTask(h), new Integer(h.getId()));
+ h.setLoadingRobotsTxt(true);
+ }

- synchronized (h)
- {
// isLoading...() and queuedRequest.insert() must be atomic
if (h.isLoadingRobotsTxt())
{
@@ -271,8 +273,16 @@
*/
public void run(ServerThread thread)
{
- // assert hostInfo != null;
String threadName = Thread.currentThread().getName();
+ synchronized(hostInfo)
+ {
+ if(hostInfo.isRobotTxtChecked())
+ {
+ log.logThreadSafe("task " + threadName + ": already loaded " + hostInfo.getHostName());
+ return; // may happen 'cause check is not synchronized
+ }
+ }
+ // assert hostInfo != null;

log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.getHostName());
//hostInfo.setLoadingRobotsTxt(true);
@@ -290,6 +300,7 @@
if (res.getStatusCode() != 200)
{
errorOccured = true;
+ log.log("task " + threadName + ": return code was " + res.getStatusCode());
}
else
{
@@ -309,26 +320,26 @@
catch (java.net.UnknownHostException e)
{
hostInfo.setReachable(false);
- log.logThreadSafe("task " + threadName + ": unknown host. setting to unreachable");
+ log.logThreadSafe("task " + threadName + ": unknown host '" + hostInfo.getHostName() + "'. setting to unreachable");
errorOccured = true;
}
catch (java.net.NoRouteToHostException e)
{
hostInfo.setReachable(false);
- log.logThreadSafe("task " + threadName + ": no route to. setting to unreachable");
+ log.logThreadSafe("task " + threadName + ": no route to '"+hostInfo.getHostName()+"'. setting to unreachable");
errorOccured = true;
}
catch (java.net.ConnectException e)
{
hostInfo.setReachable(false);
- log.logThreadSafe("task " + threadName + ": connect exception. setting to unreachable");
+ log.logThreadSafe("task " + threadName + ": connect exception while connecting to '"+hostInfo.getHostName()+"'. setting to unreachable");
errorOccured = true;
}
catch (java.io.InterruptedIOException e)
{
// time out. fatal in this case
hostInfo.setReachable(false);
- log.logThreadSafe("task " + threadName + ": time out. setting to unreachable");
+ log.logThreadSafe("task " + threadName + ": time out while connecting to '" +hostInfo.getHostName() + "'. setting to unreachable");
errorOccured = true;
}

@@ -343,19 +354,20 @@
{
if (errorOccured)
{
+ log.logThreadSafe("task " + threadName + ": error occured. putback...");
synchronized (hostInfo)
{
hostInfo.setRobotsChecked(true, null);
// crawl everything
hostInfo.setLoadingRobotsTxt(false);
- log.logThreadSafe("task " + threadName + ": error occured");
log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back");
- hostInfo.setLoadingRobotsTxt(false);
+ //hostInfo.setLoadingRobotsTxt(false);
putBackURLs();
}
}
else
{
+ log.logThreadSafe("task " + threadName + ": finished. putback...");
synchronized (hostInfo)
{
hostInfo.setRobotsChecked(true, disallows);
@@ -374,11 +386,13 @@
*/
private void putBackURLs()
{
+
+ int qSize = hostInfo.getQueueSize();
while (hostInfo.getQueueSize() > 0)
{
messageHandler.putMessage((Message) hostInfo.removeFromQueue());
}
- log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished");
+ log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished. put " + qSize + " URLs back");
hostInfo.removeQueue();
}





--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>