
cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher FetcherMain.java
cmarschner 2002/10/22 08:05:07

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
FetcherMain.java
Log:
more than one start URL; hostResolver

Revision Changes Path
1.7 +186 -46 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java

Index: FetcherMain.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- FetcherMain.java 15 Sep 2002 00:38:14 -0000 1.6
+++ FetcherMain.java 22 Oct 2002 15:05:07 -0000 1.7
@@ -56,12 +56,12 @@

import de.lanlab.larm.threads.ThreadPoolObserver;
import de.lanlab.larm.threads.ThreadPool;
-import de.lanlab.larm.gui.*;
import de.lanlab.larm.util.*;
import de.lanlab.larm.storage.*;
import de.lanlab.larm.net.*;
import HTTPClient.*;
import org.apache.oro.text.regex.MalformedPatternException;
+import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
@@ -110,6 +110,17 @@
protected RobotExclusionFilter reFilter;

/**
+ * the host manager keeps track of all hosts and is used by the filters.
+ */
+ protected HostManager hostManager;
+
+ /**
+ * the host resolver can change a host that occurs within a URL to a different
+ * host, depending on the rules specified in a configuration file
+ */
+ protected HostResolver hostResolver;
+
+ /**
* this rather flaky filter just filters out some URLs, e.g. different views
* generated by the Apache DirIndex module. Has to be made
* configurable in the near future
@@ -122,10 +133,6 @@
*/
protected URLLengthFilter urlLengthFilter;

- /**
- * the host manager keeps track of all hosts and is used by the filters.
- */
- protected HostManager hostManager;

/**
* this is the main document fetcher. It contains a thread pool that fetches the
@@ -152,7 +159,7 @@
*
* @param nrThreads number of fetcher threads to be created
*/
- private FetcherMain(int nrThreads)
+ public FetcherMain(int nrThreads, String hostResolverFile) throws Exception
{
// to make things clear, this method is commented a bit better than
// the rest of the program...
@@ -172,44 +179,73 @@

// the LogStorage used here does extensive logging. It logs all links and
// document information.
- // it also saves all documents to page files. Probably this single storage
- // could also be replaced by a pipeline; or even incorporated into the
- // existing message pipeline
- SimpleLogger storeLog = new SimpleLogger("store", false);
- SimpleLogger linksLog = new SimpleLogger("links", false);
-
+ // it also saves all documents to page files.
+ File logsDir = new File("logs");
+ logsDir.mkdir(); // ensure log directory exists
+
+ // in this experimental implementation, the crawler is pretty verbose.
+ // the SimpleLogger, however, is a flyweight logger which is buffered and
+ // not thread-safe by default
+ SimpleLogger storeLog = new SimpleLogger("store", /* add date/time? */ false);
+ SimpleLogger visitedLog = new SimpleLogger("URLVisitedFilter", /* add date/time? */ false);
+ SimpleLogger scopeLog = new SimpleLogger("URLScopeFilter", /* add date/time? */ false);
+ SimpleLogger pathsLog = new SimpleLogger("KnownPathsFilter", /* add date/time? */ false);
+ SimpleLogger linksLog = new SimpleLogger("links", /* add date/time? */ false);
+ SimpleLogger lengthLog = new SimpleLogger("length", /* add date/time? */ false);

StoragePipeline storage = new StoragePipeline();
- //storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile"));
+
+
+ // in the default configuration, the crawler will only save the document
+ // information to store.log and the link information to links.log.
+ // The contents of the documents are _not_ saved. If you set
+ // "save in page files" to "true", they will be saved in "page files",
+ // binary files each containing a set of documents. Here, the
+ // maximum file size is ~50 MB (a crawled document won't be split across
+ // two files). The logs/store.log file contains pointers into these files: a page
+ // file number, the offset within that file, and the document's length
+
+ // FIXME: default constructor for all storages + bean access methods
+ storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false,
+ /* page file prefix */ "logs/pagefile"));
storage.addLinkStorage(new LinkLogStorage(linksLog));
storage.addLinkStorage(messageHandler);
-
+ /*
+ // experimental Lucene storage. will slow the crawler down *a lot*
LuceneStorage luceneStorage = new LuceneStorage();
luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
luceneStorage.setCreate(true);
// FIXME: index name and path need to be configurable
luceneStorage.setIndexName("luceneIndex");
+ // the field names come from URLMessage.java and WebDocument.java. See
+ // LuceneStorage source for details
luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
storage.addDocStorage(luceneStorage);
+ */
+
storage.open();

//storage.addStorage(new JMSStorage(...));

- // a third example would be the NullStorage, which converts the documents into
- // heat, which evaporates above the processor
- // NullStorage();
+ // create the filters and add them to the message queue
+ urlScopeFilter = new URLScopeFilter(scopeLog);

+ // dnsResolver = new DNSResolver();
hostManager = new HostManager(1000);
+ hostResolver = new HostResolver();
+ hostResolver.initFromFile(hostResolverFile);
+ hostManager.setHostResolver(hostResolver);
+
+// hostManager.addSynonym("www.fachsprachen.uni-muenchen.de", "www.fremdsprachen.uni-muenchen.de");
+// hostManager.addSynonym("www.uni-muenchen.de", "www.lmu.de");
+// hostManager.addSynonym("www.uni-muenchen.de", "uni-muenchen.de");
+// hostManager.addSynonym("webinfo.uni-muenchen.de", "www.webinfo.uni-muenchen.de");
+// hostManager.addSynonym("webinfo.uni-muenchen.de", "webinfo.campus.lmu.de");
+// hostManager.addSynonym("www.s-a.uni-muenchen.de", "s-a.uni-muenchen.de");

- // create the filters and add them to the message queue
reFilter = new RobotExclusionFilter(hostManager);
- urlScopeFilter = new URLScopeFilter();
- urlVisitedFilter = new URLVisitedFilter(100000);
- knownPathsFilter = new KnownPathsFilter();
- urlLengthFilter = new URLLengthFilter(255);

- // dnsResolver = new DNSResolver();
fetcher = new Fetcher(nrThreads, storage, storage, hostManager);

// prevent message box popups
@@ -218,6 +254,8 @@
// prevent GZipped files from being decoded
HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);

+ urlVisitedFilter = new URLVisitedFilter(visitedLog, 100000);
+
// initialize the threads
fetcher.init();

@@ -241,12 +279,15 @@
messageHandler.addListener(reFilter);
messageHandler.addListener(urlVisitedFilter);
messageHandler.addListener(knownPathsFilter);
+
messageHandler.addListener(fetcher);

- /* uncomment this to enable HTTPClient logging
+ // uncomment this to enable HTTPClient logging
+ /*
try
{
- HTTPClient.Log.setLogWriter(new java.io.FileWriter("logs/HttpClient.log"),false);
+ HTTPClient.Log.setLogWriter(new java.io.OutputStreamWriter(System.out) //new java.io.FileWriter("logs/HttpClient.log")
+ ,false);
HTTPClient.Log.setLogging(HTTPClient.Log.ALL, true);
}
catch (Exception e)
@@ -254,6 +295,7 @@
e.printStackTrace();
}
*/
+
}


@@ -276,11 +318,11 @@
* @exception java.net.MalformedURLException Description of Exception
*/
public void putURL(URL url, boolean isFrame)
- throws java.net.MalformedURLException
+ // throws java.net.MalformedURLException
{
try
{
- messageHandler.putMessage(new URLMessage(url, null, isFrame, null, this.hostManager));
+ messageHandler.putMessage(new URLMessage(url, null, isFrame ? URLMessage.LINKTYPE_FRAME : URLMessage.LINKTYPE_ANCHOR, null, this.hostResolver));
}
catch (Exception e)
{
@@ -288,7 +330,6 @@
System.out.println("Exception: " + e.getMessage());
e.printStackTrace();
}
- //System.out.println("URLs geschrieben");
}


@@ -341,24 +382,69 @@
*
* @param args The command line arguments
*/
- public static void main(String[] args)
+ public static void main(String[] args) throws Exception
{
int nrThreads = 10;

- String startURL = "";
- String restrictTo = "http://141.84.120.82/ll/cmarschn/.*";
+ ArrayList startURLs = new ArrayList();
+ String restrictTo = ".*";
boolean gui = false;
boolean showInfo = false;
- System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
-
+ String hostResolverFile = "";
+ System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - B.20020914");
// FIXME: consider using Jakarta Commons' CLI package for command line parameters
+
for (int i = 0; i < args.length; i++)
{
if (args[i].equals("-start"))
{
i++;
- startURL = args[i];
- System.out.println("Start-URL set to: " + startURL);
+ String arg = args[i];
+ if(arg.startsWith("@"))
+ {
+ // input is a file with one URL per line
+ String fileName = arg.substring(1);
+ System.out.println("reading URL file " + fileName);
+ try
+ {
+ BufferedReader r = new BufferedReader(new FileReader(fileName));
+ String line;
+ int count = 0;
+ int lineNo = 0;
+ while ((line = r.readLine()) != null)
+ {
+ lineNo++; // track the input line for error messages
+ try
+ {
+ startURLs.add(new URL(line));
+ count++;
+ }
+ catch (MalformedURLException e)
+ {
+ System.out.println("Malformed URL '" + line + "' in line " + lineNo + " of file " + fileName);
+ }
+ }
+ r.close();
+ System.out.println("added " + count + " URLs from " + fileName);
+ }
+ catch(IOException e)
+ {
+ System.out.println("Couldn't read '" + fileName + "': " + e);
+ }
+ }
+ else
+ {
+ System.out.println("got URL " + arg);
+ try
+ {
+ startURLs.add(new URL(arg));
+ System.out.println("Start-URL added: " + arg);
+ }
+ catch (MalformedURLException e)
+ {
+ System.out.println("Malformed URL '" + arg + "'");
+
+ }
+ }
}
else if (args[i].equals("-restrictto"))
{
@@ -372,6 +458,13 @@
nrThreads = Integer.parseInt(args[i]);
System.out.println("Threads set to " + nrThreads);
}
+ else if (args[i].equals("-hostresolver"))
+ {
+ i++;
+ hostResolverFile = args[i];
+ System.out.println("reading host resolver props from '" + hostResolverFile + "'");
+
+ }
else if (args[i].equals("-gui"))
{
gui = true;
@@ -390,10 +483,60 @@
//URL.setURLStreamHandlerFactory(new HttpTimeoutFactory(500));
// replaced by HTTPClient

- FetcherMain f = new FetcherMain(nrThreads);
- if (showInfo || (startURL.equals("") && gui == false))
+ FetcherMain f = new FetcherMain(nrThreads, hostResolverFile);
+ if (showInfo || "".equals(hostResolverFile) || (startURLs.isEmpty() && gui == false))
{
- System.out.println("Usage: FetcherMain -start <URL> -restrictto <RegEx> [-threads <nr=10>]"); // [-gui]
+ System.out.println("The LARM crawler\n" +
+ "\n" +
+ "The LARM crawler is a fast parallel crawler, currently designed for\n" +
+ "large intranets (up to a couple hundred hosts with some hundred thousand\n" +
+ "documents). It is limited by a relatively high memory overhead\n" +
+ "per crawled host, and by a HashMap of already crawled URLs which is also held\n" +
+ "in memory.\n" +
+ "\n" +
+ "Usage: FetcherMain <-start <URL>|@<filename>>+ -restrictto <RegEx>\n" +
+ " [-threads <nr=10>] [-hostresolver <filename>]\n" +
+ "\n" +
+ "Commands:\n" +
+ " -start specify one or more URLs to start with. You can also specify a file\n" +
+ " that contains URLs, one per line\n" +
+ " -restrictto a Perl 5 regular expression that each URL must match. It is run against the\n" +
+ " _complete_ URL, including the http:// part\n" +
+ " -threads the number of crawling threads. Defaults to 10\n" +
+ " -hostresolver specify a file that contains rules for changing the host part of\n" +
+ " a URL during the normalization process (experimental).\n" +
+ "Caution: The <RegEx> is applied to the _normalized_ form of a URL.\n" +
+ " See URLNormalizer for details\n" +
+ "Example:\n" +
+ " -start @urls1.txt -start @urls2.txt -start http://localhost/ " +
+ " -restrictto http://[^/]*\\.localhost/.* -threads 25\n" +
+ "\n" +
+ "The host resolver file may contain the following commands: \n" +
+ " startsWith(part1) = part2\n" +
+ " if host starts with part1, this part will be replaced by part2\n" +
+ " endsWith(part1) = part2\n" +
+ " if host ends with part1, this part will be replaced by part2. This is done after\n" +
+ " startsWith was processed\n" +
+ " synonym(host1) = host2\n" +
+ " host1 will be replaced by host2. This is done _after_ startsWith and endsWith were\n" +
+ " processed. The keywords startsWith, endsWith and synonym are case sensitive.\n" +
+ " Due to a bug in BeanUtils, dots are not allowed in the keys (in parentheses)\n" +
+ " and have to be escaped with commas. To simplify, commas are also replaced in property\n" +
+ " values. So just use commas instead of dots. The resulting host names are only used for\n" +
+ " comparisons and do not have to be existing URLs (although the syntax has to be valid).\n" +
+ " However, the names will often be passed to java.net.URL, which will try a DNS name\n" +
+ " resolution that will time out if the server can't be found.\n" +
+ " Example:\n" +
+ " synonym(www1,host,com) = host,com\n" +
+ " startsWith(www,) = ,\n" +
+ " endsWith(host1,com) = host,com\n" +
+ "The crawler will show a status message every 5 seconds, printed by ThreadMonitor.java.\n" +
+ "It will stop after the ThreadMonitor has found the message queue and the crawling threads idle a\n" +
+ "couple of times.\n" +
+ "The crawled data will be saved within a logs/ directory. A cachingqueue/ directory is used for\n" +
+ "temporary queues.\n" +
+ "Note that this implementation is experimental, and that the command line options cover only a part \n" +
+ "of the parameters. Much of the configuration can only be done by modifying FetcherMain.java\n");
System.exit(0);
}
try
@@ -403,17 +546,14 @@
if (gui)
{
// f.initGui(f, startURL);
+ // the GUI is no longer supported
}
else
{
- try
- {
- f.startMonitor();
- f.putURL(new URL(startURL), false);
- }
- catch (MalformedURLException e)
+ f.startMonitor();
+ for(Iterator it = startURLs.iterator(); it.hasNext(); )
{
- System.out.println("Malformed URL");
+ f.putURL((URL)it.next(), false);
}
}
}

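A note on the page file format described in the comments above: logs/store.log
keeps, per document, a page file number, an offset, and a length. Assuming
those three values have been parsed from a store.log entry, a document can be
read back with a plain seek-and-read. The sketch below is illustrative only;
the PageFileReader class and the "<prefix><number>.pfl" naming scheme are
assumptions, not part of this commit (the prefix corresponds to the
"logs/pagefile" argument passed to LogStorage above):

    import java.io.IOException;
    import java.io.RandomAccessFile;

    public class PageFileReader
    {
        /**
         * reads one crawled document back from a page file, given the
         * pointer (page file number, offset, length) taken from a
         * logs/store.log entry. The file naming is an assumption made
         * for illustration.
         */
        public static byte[] readDocument(String prefix, int pageFileNo,
                                          long offset, int length)
            throws IOException
        {
            RandomAccessFile pageFile =
                new RandomAccessFile(prefix + pageFileNo + ".pfl", "r");
            try
            {
                byte[] buf = new byte[length];
                pageFile.seek(offset);   // jump to the document's start
                pageFile.readFully(buf); // read exactly 'length' bytes
                return buf;
            }
            finally
            {
                pageFile.close();
            }
        }
    }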

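For completeness: if the commented-out LuceneStorage block above were enabled,
the resulting "luceneIndex" could be queried with the Lucene API of this era
roughly as follows. This is a sketch under the assumptions of the field setup
shown above ("url" stored, "content" indexed and tokenized with the
GermanAnalyzer); it is not part of this commit:

    import org.apache.lucene.analysis.de.GermanAnalyzer;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;

    public class SearchDemo
    {
        public static void main(String[] args) throws Exception
        {
            IndexSearcher searcher = new IndexSearcher("luceneIndex");
            // parse the query against the tokenized "content" field,
            // using the same analyzer that was used for indexing
            Query query = QueryParser.parse(args[0], "content", new GermanAnalyzer());
            Hits hits = searcher.search(query);
            for (int i = 0; i < hits.length(); i++)
            {
                // "url" was stored in the index, so it can be read back
                System.out.println(hits.score(i) + " " + hits.doc(i).get("url"));
            }
            searcher.close();
        }
    }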

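To make the host resolver rules more concrete, here is a hypothetical rule
file together with the rewrites it would produce. The host names are made up
(loosely following the commented-out addSynonym() calls above), and commas
stand in for dots as explained in the usage text:

    # hostresolver.properties (hypothetical example)
    startsWith(www1,) = www,
    endsWith(,uni-muenchen,de) = ,lmu,de
    synonym(uni-muenchen,de) = www,lmu,de

Since the rule types are applied in the order startsWith, endsWith, synonym,
the host "www1.informatik.uni-muenchen.de" would first be rewritten to
"www.informatik.uni-muenchen.de" and then to "www.informatik.lmu.de", while
the bare "uni-muenchen.de" would be mapped to "www.lmu.de" by the synonym
rule.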

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>