Mailing List Archive

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher Fetcher.java FetcherMain.java FetcherTask.java
cmarschner 2002/06/17 17:45:10

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
Fetcher.java FetcherMain.java FetcherTask.java
Log:
added experimental version of LuceneStorage

Revision Changes Path
1.5 +7 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java

Index: Fetcher.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- Fetcher.java 17 Jun 2002 13:59:28 -0000 1.4
+++ Fetcher.java 18 Jun 2002 00:45:10 -0000 1.5
@@ -93,6 +93,11 @@
*/
DocumentStorage storage;

+ /**
+ * the storage where the links are saved to
+ */
+ LinkStorage linkStorage;
+
/**
* the host manager keeps track of host information
*/
@@ -110,6 +115,7 @@
public Fetcher(int maxThreads, DocumentStorage docStorage, LinkStorage linkStorage, HostManager hostManager)
{
this.storage = storage;
+ this.linkStorage = linkStorage;
FetcherTask.setDocStorage(docStorage);
FetcherTask.setLinkStorage(linkStorage);
fetcherPool = new ThreadPool(maxThreads, new FetcherThreadFactory(hostManager));



1.5 +12 -3 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java

Index: FetcherMain.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- FetcherMain.java 17 Jun 2002 13:59:28 -0000 1.4
+++ FetcherMain.java 18 Jun 2002 00:45:10 -0000 1.5
@@ -183,10 +183,19 @@


StoragePipeline storage = new StoragePipeline();
- storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile"));
+ //storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile"));
storage.addLinkStorage(new LinkLogStorage(linksLog));
storage.addLinkStorage(messageHandler);
- //storage.addStorage(new LuceneStorage(...));
+
+ LuceneStorage luceneStorage = new LuceneStorage();
+ luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
+ luceneStorage.setCreate(true);
+ luceneStorage.setIndexName("luceneIndex");
+ luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
+ luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
+ storage.addDocStorage(luceneStorage);
+ storage.open();
+
//storage.addStorage(new JMSStorage(...));

// a third example would be the NullStorage, which converts the documents into



1.5 +23 -13 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java

Index: FetcherTask.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- FetcherTask.java 17 Jun 2002 13:58:33 -0000 1.4
+++ FetcherTask.java 18 Jun 2002 00:45:10 -0000 1.5
@@ -80,6 +80,8 @@
* the storage. If it's an HTML document, it will be parsed and all links will
* be put into the message handler again.
*
+ * stores the contents of the files in field "content"
+ *
* @author Clemens Marschner
* @version $Id$
*/
@@ -406,32 +408,40 @@
Tokenizer tok = new Tokenizer();
tok.setLinkHandler(this);
tok.parse(new SimpleCharArrayReader(fullCharBuffer));
+
+ taskState.setState(FT_STORING, ipURL);
+ linkStorage.storeLinks(foundUrls);
+ WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
+ d.addField("content", fullCharBuffer);
+ docStorage.store(d);
}
else
{
// System.out.println("Discovered unknown content type: " + contentType + " at " + urlString);
- errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
+ //errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
+ taskState.setState(FT_STORING, ipURL);
+ linkStorage.storeLinks(foundUrls);
+ WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
+ d.addField("content", fullBuffer);
+ docStorage.store(d);
}
log.log("scanned");
}
- taskState.setState(FT_STORING, ipURL);
- linkStorage.storeLinks(foundUrls);
- //messageHandler.putMessages(foundUrls);
- docStorage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title, hostManager));
+
log.log("stored");
}
}
catch (InterruptedIOException e)
{
// timeout while reading this file
- System.out.println("[" + threadNr + "] FetcherTask: Timeout while opening: " + this.actURLMessage.getUrl());
+ //System.out.println("[" + threadNr + "] FetcherTask: Timeout while opening: " + this.actURLMessage.getUrl());
errorLog.log("error: Timeout: " + this.actURLMessage.getUrl());
hi.badRequest();
}
catch (FileNotFoundException e)
{
taskState.setState(FT_EXCEPTION);
- System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl());
+ //System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl());
errorLog.log("error: File not Found: " + this.actURLMessage.getUrl());
}
catch(NoRouteToHostException e)
@@ -439,7 +449,7 @@
// router is down or a firewall prevents the connection
hi.setReachable(false);
taskState.setState(FT_EXCEPTION);
- System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
+ //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace();
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
}
@@ -448,14 +458,14 @@
// no server is listening at this port
hi.setReachable(false);
taskState.setState(FT_EXCEPTION);
- System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
+ //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace();
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
}
catch (SocketException e)
{
taskState.setState(FT_EXCEPTION);
- System.out.println("[" + threadNr + "]: SocketException:" + e.getMessage());
+ //System.out.println("[" + threadNr + "]: SocketException:" + e.getMessage());
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());

}
@@ -464,7 +474,7 @@
// IP address could not be determined
hi.setReachable(false);
taskState.setState(FT_EXCEPTION);
- System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
+ //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace();
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());

@@ -472,7 +482,7 @@
catch (IOException e)
{
taskState.setState(FT_EXCEPTION);
- System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
+ //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace();
errorLog.log("error: IOException: " + e.getClass().getName() + ": " + e.getMessage());





--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>