Mailing List Archive

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher Fetcher.java FetcherMain.java FetcherTaskQueue.java FetcherThread.java FetcherThreadFactory.java RobotExclusionFilter.java ThreadMonitor.java URLMessage.java URLVisitedFilter.java
cmarschner 2002/06/17 06:59:29

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
Fetcher.java FetcherMain.java FetcherTaskQueue.java
FetcherThread.java FetcherThreadFactory.java
RobotExclusionFilter.java ThreadMonitor.java
URLMessage.java URLVisitedFilter.java
Removed: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
HostInfo.java HostManager.java
Log:
added URLNormalizer. Changed filters to use normalized URLs if possible

Revision Changes Path
1.4 +2 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java

Index: Fetcher.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- Fetcher.java 1 Jun 2002 18:55:15 -0000 1.3
+++ Fetcher.java 17 Jun 2002 13:59:28 -0000 1.4
@@ -65,6 +65,7 @@
import java.util.LinkedList;

import de.lanlab.larm.fetcher.FetcherTask;
+import de.lanlab.larm.net.*;

/**
* filter class; the Fetcher is the main class which keeps the ThreadPool that



1.4 +3 -2 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java

Index: FetcherMain.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- FetcherMain.java 1 Jun 2002 18:55:15 -0000 1.3
+++ FetcherMain.java 17 Jun 2002 13:59:28 -0000 1.4
@@ -62,6 +62,7 @@
import de.lanlab.larm.gui.*;
import de.lanlab.larm.util.*;
import de.lanlab.larm.storage.*;
+import de.lanlab.larm.net.*;
import javax.swing.UIManager;
import HTTPClient.*;
import org.apache.oro.text.regex.MalformedPatternException;
@@ -278,7 +279,7 @@
{
try
{
- messageHandler.putMessage(new URLMessage(url, null, isFrame, null));
+ messageHandler.putMessage(new URLMessage(url, null, isFrame, null, this.hostManager));
}
catch (Exception e)
{



1.3 +16 -15 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java

Index: FetcherTaskQueue.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- FetcherTaskQueue.java 22 May 2002 23:09:17 -0000 1.2
+++ FetcherTaskQueue.java 17 Jun 2002 13:59:28 -0000 1.3
@@ -186,16 +186,17 @@
public static void main(String args[])
{
FetcherTaskQueue q = new FetcherTaskQueue();
+ de.lanlab.larm.net.HostManager hm = new de.lanlab.larm.net.HostManager(10);
System.out.println("Test 1. put in 4 yahoos and 3 lmus. pull out LMU/Yahoo/LMU/Yahoo/LMU/Yahoo/Yahoo");
try
{
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null)));
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null)));
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null)));
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null)));
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null)));
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null)));
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm)));
}
catch (Throwable t)
{
@@ -217,9 +218,9 @@
try
{
System.out.println("put 3 lmus.");
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null)));
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null)));
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm)));
System.out.print("pull out 1st element [lmu/1]: ");
System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println("size now [2]: " + q.size());
@@ -227,9 +228,9 @@
System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println("size now [1]: " + q.size());
System.out.println("put in 3 yahoos");
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null)));
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null)));
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm)));
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [3]: " + q.size());
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
@@ -237,7 +238,7 @@
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [1]: " + q.size());
System.out.println("put in another Yahoo");
- q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null)));
+ q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm)));
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [1]: " + q.size());
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());



1.3 +2 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java

Index: FetcherThread.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- FetcherThread.java 22 May 2002 23:09:17 -0000 1.2
+++ FetcherThread.java 17 Jun 2002 13:59:28 -0000 1.3
@@ -56,6 +56,7 @@

import de.lanlab.larm.threads.ServerThread;
import de.lanlab.larm.util.State;
+import de.lanlab.larm.net.HostManager;

/**
* a server thread for the thread pool that records the number



1.3 +75 -58 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java

Index: FetcherThreadFactory.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- FetcherThreadFactory.java 22 May 2002 23:09:17 -0000 1.2
+++ FetcherThreadFactory.java 17 Jun 2002 13:59:28 -0000 1.3
@@ -1,64 +1,69 @@
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
+/*
+ * ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
*/
-
package de.lanlab.larm.fetcher;
import de.lanlab.larm.threads.*;
+import de.lanlab.larm.net.*;

/**
- * this factory simply creates fetcher threads. It's passed
- * to the ThreadPool because the pool is creating the threads on its own
- * @version $Id$
+ * this factory simply creates fetcher threads. It's passed to the ThreadPool
+ * because the pool is creating the threads on its own
+ *
+ * @author Administrator
+ * @created 14. Juni 2002
+ * @version $Id: FetcherThreadFactory.java,v 1.2 2002/05/22 23:09:17
+ * cmarschner Exp $
*/
public class FetcherThreadFactory extends ThreadFactory
{
@@ -69,16 +74,28 @@

HostManager hostManager;

+
+ /**
+ * Constructor for the FetcherThreadFactory object
+ *
+ * @param hostManager Description of the Parameter
+ */
public FetcherThreadFactory(HostManager hostManager)
{
this.hostManager = hostManager;
}


- public ServerThread createServerThread(int count)
+ /**
+ * Description of the Method
+ *
+ * @param count Description of the Parameter
+ * @return Description of the Return Value
+ */
+ public ServerThread createServerThread(int count)
{
ServerThread newThread = new FetcherThread(count, threadGroup, hostManager);
newThread.setPriority(4);
return newThread;
}
-}
\ No newline at end of file
+}



1.3 +14 -13 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java

Index: RobotExclusionFilter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- RobotExclusionFilter.java 22 May 2002 23:09:17 -0000 1.2
+++ RobotExclusionFilter.java 17 Jun 2002 13:59:28 -0000 1.3
@@ -63,6 +63,7 @@
import de.lanlab.larm.util.*;
import de.lanlab.larm.threads.*;
import HTTPClient.*;
+import de.lanlab.larm.net.*;

/**
* this factory simply creates fetcher threads. It's gonna be passed to the
@@ -164,13 +165,13 @@
URLMessage urlMsg = ((URLMessage) message);
URL url = urlMsg.getUrl();
//assert url != null;
- HostInfo h = hostManager.getHostInfo(url.getHost());
+ HostInfo h = hostManager.getHostInfo(url.getHost().toLowerCase());
if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
{
log.logThreadSafe("handleRequest: starting to get robots.txt");
// probably this results in Race Conditions here

- rePool.doTask(new RobotExclusionTask(h), new Integer(h.id));
+ rePool.doTask(new RobotExclusionTask(h), new Integer(h.getId()));
h.setLoadingRobotsTxt(true);
}

@@ -182,7 +183,7 @@

//log.logThreadSafe("handleRequest: other thread is loading");
// assert h.queuedRequests != null
- h.queuedRequests.insert(message);
+ h.insertIntoQueue(message);
// not thread safe
log.logThreadSafe("handleRequest: queued file " + url);
return null;
@@ -273,14 +274,14 @@
// assert hostInfo != null;
String threadName = Thread.currentThread().getName();

- log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.hostName);
+ log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.getHostName());
//hostInfo.setLoadingRobotsTxt(true);
String[] disallows = null;
boolean errorOccured = false;
try
{
log.logThreadSafe("task " + threadName + ": getting connection");
- HTTPConnection conn = new HTTPConnection(hostInfo.hostName);
+ HTTPConnection conn = new HTTPConnection(hostInfo.getHostName());
conn.setTimeout(30000);
// wait at most 20 secs

@@ -348,8 +349,8 @@
// crawl everything
hostInfo.setLoadingRobotsTxt(false);
log.logThreadSafe("task " + threadName + ": error occured");
- log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back");
- hostInfo.isLoadingRobotsTxt = false;
+ log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back");
+ hostInfo.setLoadingRobotsTxt(false);
putBackURLs();
}
}
@@ -359,8 +360,8 @@
{
hostInfo.setRobotsChecked(true, disallows);
log.logThreadSafe("task " + threadName + ": done");
- log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back");
- hostInfo.isLoadingRobotsTxt = false;
+ log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back");
+ hostInfo.setLoadingRobotsTxt(false);
putBackURLs();
}
}
@@ -373,12 +374,12 @@
*/
private void putBackURLs()
{
- while (hostInfo.queuedRequests.size() > 0)
+ while (hostInfo.getQueueSize() > 0)
{
- messageHandler.putMessage((Message) hostInfo.queuedRequests.remove());
+ messageHandler.putMessage((Message) hostInfo.removeFromQueue());
}
log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished");
- hostInfo.queuedRequests = null;
+ hostInfo.removeQueue();
}





1.3 +2 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java

Index: ThreadMonitor.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- ThreadMonitor.java 22 May 2002 23:09:17 -0000 1.2
+++ ThreadMonitor.java 17 Jun 2002 13:59:28 -0000 1.3
@@ -61,6 +61,7 @@
import java.io.*;
import de.lanlab.larm.util.State;
import de.lanlab.larm.util.SimpleLoggerManager;
+import de.lanlab.larm.net.*;

/**
* this monitor takes a sample of every thread every x milliseconds,



1.3 +177 -60 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java

Index: URLMessage.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- URLMessage.java 22 May 2002 23:09:17 -0000 1.2
+++ URLMessage.java 17 Jun 2002 13:59:28 -0000 1.3
@@ -1,66 +1,71 @@
-/* ====================================================================
- * The Apache Software License, Version 1.1
+/*
+ * ====================================================================
+ * The Apache Software License, Version 1.1
*
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
*/
-
package de.lanlab.larm.fetcher;

import java.net.*;
import java.io.*;
import de.lanlab.larm.util.URLUtils;
+import de.lanlab.larm.net.URLNormalizer;
+import de.lanlab.larm.net.HostManager;

/**
* represents a URL which is passed around in the messageHandler
- * @version $Id$
+ *
+ * @author Administrator
+ * @created 14. Juni 2002
+ * @version $Id$
*/
public class URLMessage implements Message, Serializable
{
@@ -68,14 +73,51 @@
* the URL
*/
protected URL url;
- protected String urlString;

+ /**
+ * Description of the Field
+ */
+ protected volatile String urlString;
+
+ /**
+ * referer or null
+ */
protected URL referer;
- protected String refererString;
+
+ /**
+ * externalized referer URL, to prevent multiple calls to url.toExternalForm()
+ */
+ protected volatile String refererString;
+
+ /**
+ * externalized referer URL, to prevent multiple calls to url.toExternalForm()
+ */
+ protected volatile String refererNormalizedString;
+
+ /**
+ * normalized URL, as defined by {@link de.lanlab.larm.net.URLNormalizer}
+ * (lower case, index.* removed, all characters except alphanumeric ones escaped)
+ */
+ protected String normalizedURLString;
+
+
boolean isFrame;
+
+ /**
+ * anchor text, as in &lt;a href="..."&gt;Anchor&lt;/a&gt;
+ */
protected String anchor;

- public URLMessage(URL url, URL referer, boolean isFrame, String anchor)
+
+ /**
+ * Constructor for the URLMessage object
+ *
+ * @param url Description of the Parameter
+ * @param referer Description of the Parameter
+ * @param isFrame Description of the Parameter
+ * @param anchor Description of the Parameter
+ */
+ public URLMessage(URL url, URL referer, boolean isFrame, String anchor, HostManager hostManager)
{
//super();
this.url = url;
@@ -83,69 +125,144 @@

this.referer = referer;
this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null;
+ this.refererNormalizedString = referer != null ? URLUtils.toExternalFormNoRef(URLNormalizer.normalize(referer, hostManager)) : null;
this.isFrame = isFrame;
this.anchor = anchor != null ? anchor : "";
+ this.normalizedURLString = URLUtils.toExternalFormNoRef(URLNormalizer.normalize(url, hostManager));
+ //this.normalizedURLString = URLNormalizer.
//System.out.println("" + refererString + " -> " + urlString);
}

+ public String getNormalizedURLString()
+ {
+ return this.normalizedURLString;
+ }
+
+ /**
+ * Gets the url attribute of the URLMessage object
+ *
+ * @return The url value
+ */
public URL getUrl()
{
return this.url;
}

+
+ /**
+ * Gets the referer attribute of the URLMessage object
+ *
+ * @return The referer value
+ */
public URL getReferer()
{
return this.referer;
}


+ /**
+ * Description of the Method
+ *
+ * @return Description of the Return Value
+ */
public String toString()
{
return urlString;
}

+
+ /**
+ * Gets the uRLString attribute of the URLMessage object
+ *
+ * @return The uRLString value
+ */
public String getURLString()
{
return urlString;
}

+
+ /**
+ * Gets the refererString attribute of the URLMessage object
+ *
+ * @return The refererString value
+ */
public String getRefererString()
{
return refererString;
}

+
+ /**
+ * Gets the anchor attribute of the URLMessage object
+ *
+ * @return The anchor value
+ */
public String getAnchor()
{
return anchor;
}


+ /**
+ * Description of the Method
+ *
+ * @return Description of the Return Value
+ */
public int hashCode()
{
return url.hashCode();
}

- private void writeObject(java.io.ObjectOutputStream out) throws IOException
+
+ /**
+ * Description of the Method
+ *
+ * @param out Description of the Parameter
+ * @exception IOException Description of the Exception
+ */
+ private void writeObject(java.io.ObjectOutputStream out)
+ throws IOException
{
out.writeObject(url);
out.writeObject(referer);
out.writeBoolean(isFrame);
out.writeUTF(anchor);
+ out.writeUTF(refererNormalizedString);
+ out.writeUTF(normalizedURLString);
+
}

- private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException
+
+ /**
+ * Description of the Method
+ *
+ * @param in Description of the Parameter
+ * @exception IOException Description of the Exception
+ * @exception ClassNotFoundException Description of the Exception
+ */
+ private void readObject(java.io.ObjectInputStream in)
+ throws IOException, ClassNotFoundException
{
- url = (URL)in.readObject();
- referer = (URL)in.readObject();
+ url = (URL) in.readObject();
+ referer = (URL) in.readObject();
urlString = url.toExternalForm();
refererString = referer.toExternalForm();
isFrame = in.readBoolean();
anchor = in.readUTF();
+ refererNormalizedString = in.readUTF();
+ normalizedURLString = in.readUTF();
}

+
+ /**
+ * Gets the info attribute of the URLMessage object
+ *
+ * @return The info value
+ */
public String getInfo()
{
- return (referer != null ? refererString : "<start>") + "\t" + urlString + "\t" + (isFrame ? "1" : "0") + "\t" + anchor;
+ return (referer != null ? refererString : "<start>") + "\t" + urlString + "\t" + this.getNormalizedURLString() + "\t" + (isFrame ? "1" : "0") + "\t" + anchor;
}

}



1.4 +2 -2 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java

Index: URLVisitedFilter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- URLVisitedFilter.java 1 Jun 2002 18:55:15 -0000 1.3
+++ URLVisitedFilter.java 17 Jun 2002 13:59:28 -0000 1.4
@@ -123,7 +123,7 @@
{
URLMessage urlMessage = ((URLMessage) message);
URL url = urlMessage.getUrl();
- String urlString = urlMessage.getURLString();
+ String urlString = urlMessage.getNormalizedURLString();
if (urlHash.contains(urlString))
{
//System.out.println("URLVisitedFilter: " + urlString + " already present.");




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>