Mailing List Archive

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util SimpleStringTokenizer.java StoreLogFile.java
cmarschner 2002/10/22 08:40:48

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/net
HostManager.java
Added: contributions/webcrawler-LARM/src/de/lanlab/larm/net
HostResolver.java
contributions/webcrawler-LARM/src/de/lanlab/larm/util
SimpleStringTokenizer.java StoreLogFile.java
Log:


Revision Changes Path
1.2 +61 -27 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java

Index: HostManager.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- HostManager.java 17 Jun 2002 14:00:13 -0000 1.1
+++ HostManager.java 22 Oct 2002 15:40:48 -0000 1.2
@@ -55,6 +55,11 @@
package de.lanlab.larm.net;

import java.util.HashMap;
+import java.util.*;
+import org.apache.oro.text.perl.*;
+import org.apache.oro.text.regex.*;
+import org.apache.oro.text.*;
+import org.apache.oro.util.*;

/**
* Description of the Class
@@ -67,8 +72,12 @@
{
HashMap hosts;
static int hostCount = 0;
+ HostResolver resolver;


+
+// ArrayList rewriteRules = new ArrayList();
+
/**
* Constructor for the HostInfo object
*
@@ -79,6 +88,20 @@
hosts = new HashMap(initialCapacity);
}

+ public void setHostResolver(HostResolver resolver)
+ {
+ this.resolver = resolver;
+ }
+
+ /**
+ * returns the hostResolver
+ * @return
+ */
+ public HostResolver getHostResolver()
+ {
+ return this.resolver;
+ }
+

/**
* Description of the Method
@@ -88,7 +111,20 @@
*/
public HostInfo put(String hostName)
{
- if (!hosts.containsKey(hostName))
+ if(resolver != null)
+ {
+ return putResolved(hostName, resolver.resolveHost(hostName));
+ }
+ else
+ {
+ return putResolved(hostName, hostName);
+ }
+ }
+
+
+ public HostInfo putResolved(String hostName, String resolvedHostName)
+ {
+ if (!hosts.containsKey(resolvedHostName))
{
int hostID;
synchronized (this)
@@ -96,44 +132,43 @@
hostID = hostCount++;
}
HostInfo hi = new HostInfo(hostName,hostID);
- hosts.put(hostName, hi);
+ hosts.put(resolvedHostName, hi);
//System.out.println("hostManager: + " + hostName);
- if(!hostName.equals(hostName.toLowerCase()))
- {
- try
- {
- throw new Exception();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
+// if(!hostName.equals(hostName.toLowerCase()))
+// {
+// try
+// {
+// throw new Exception();
+// }
+// catch(Exception e)
+// {
+// e.printStackTrace();
+// }
+// }
return hi;
}
return (HostInfo)hosts.get(hostName);
- /*else
- {
- hostID = hosts.get()
- }
- // assert hostID != -1;
- return hostID;*/
-
}


+ public HostInfo getHostInfo(String hostName)
+ {
+ return getHostInfoNormalized(hostName, resolver.resolveHost(hostName));
+ }
+
/**
* Gets the hostID attribute of the HostInfo object
*
* @param hostName Description of the Parameter
* @return The hostID value
*/
- public HostInfo getHostInfo(String hostName)
+ public HostInfo getHostInfoNormalized(String hostName, String normalizedHostName)
{
- HostInfo hi = (HostInfo)hosts.get(hostName);
+ HostInfo hi = (HostInfo)hosts.get(normalizedHostName);
if(hi == null)
{
- return put(hostName);
+// System.out.println("new host: " + normalizedHostName);
+ return putResolved(hostName, normalizedHostName);
}
return hi;
}
@@ -145,9 +180,8 @@

public HostInfo addSynonym(String hostName, String synonym)
{
- HostInfo info = getHostInfo(hostName);
- hosts.put(synonym, info);
- return info;
+ resolver.addSynonym(hostName, synonym);
+ return getHostInfo(hostName);
}





1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostResolver.java

Index: HostResolver.java
===================================================================
package de.lanlab.larm.net;

import java.util.*;
import xxl.collections.*;
import java.io.*;
import org.apache.commons.beanutils.*;
import java.lang.reflect.*;
import org.apache.commons.logging.*;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/




//class LRUCache
//{
// HashMap cache = null;
// LinkedList order = null;
// int max;
//
// public LRUCache(int max)
// {
//
// this.max = max;
// cache = new HashMap((int)(max/0.6));
// order = new LinkedList();
// }
//
// public Object get(Object key)
// {
// return cache.get(key);
// }
//
//
//
// public void put(Object key, Object value)
// {
// if(!cache.containsKey(key))
// {
// if(order.size() > max)
// {
// cache.remove(order.removeLast());
// }
// }
// else
// {
// //assert order.contains(key);
// order.remove(key);
// // quite expensive, probably need a hashed list
// // or something even simpler
// }
// order.addFirst(key);
// cache.put(key, value);
// }
//}

/**
 * Rewrites host names according to configurable rules; see
 * {@link #resolveHost(String)}. The rules are applied in this order:
 * <ul>
 * <li>if the host starts with (startsWith), that prefix is replaced with (replacement)
 * <li>if the host ends with (endsWith), that suffix is replaced with (replacement)
 * <li>if the resulting host is (synonym), it is replaced with (replacement)
 * </ul>
 * Only the first matching startsWith rule and the first matching endsWith
 * rule are applied, in the order in which the rules were added.
 * <p>
 * The resolver can be configured through a property file, which is loaded
 * and handed to Apache BeanUtils (<code>BeanUtils.populate()</code>).<p>
 * The resolver doesn't do any network calls, so this class can be used
 * with any string, if you really need to.
 *
 * @author Clemens Marschner
 * @version 1.0
 */
public class HostResolver
{

    /** maps a (String) host name to its (String) replacement */
    HashMap synonym;

    public HostResolver()
    {
        synonym = new HashMap();
    }

    /**
     * Convenience method that loads the configuration from a properties file
     * and passes it to {@link #initFromProperties(Properties)}.
     *
     * @param fileName a property file
     * @throws IOException thrown if fileName cannot be opened or something went wrong while reading
     * @throws InvocationTargetException propagated from initFromProperties
     * @throws IllegalAccessException propagated from initFromProperties
     */
    public void initFromFile(String fileName) throws IOException, InvocationTargetException, IllegalAccessException
    {
        InputStream in = new FileInputStream(fileName);
        Properties p = new Properties();
        try
        {
            p.load(in);
        }
        finally
        {
            // release the file handle even if load() fails
            in.close();
        }
        initFromProperties(p);
    }

    /**
     * Populates the synonym, startsWith and endsWith properties with a BeanUtils.populate()
     *
     * @param props the configuration to apply to this resolver
     * @throws InvocationTargetException declared by BeanUtils.populate()
     * @throws IllegalAccessException declared by BeanUtils.populate()
     */
    public void initFromProperties(Properties props) throws InvocationTargetException, IllegalAccessException
    {
        BeanUtils.populate(this, props);
    }

    /** startsWith rules; each element is a String[] { prefix, replacement } */
    ArrayList startsWithArray = new ArrayList();
    /** number of rules in startsWithArray */
    int startsWithSize = 0;
    /** endsWith rules; each element is a String[] { suffix, replacement } */
    ArrayList endsWithArray = new ArrayList();
    /** number of rules in endsWithArray */
    int endsWithSize = 0;

    // NOTE(review): the get* methods below always throw; presumably they only
    // exist so that BeanUtils recognizes startsWith/endsWith/synonym as
    // mapped properties - confirm against the BeanUtils documentation.

    /**
     * Not supported.
     *
     * @param name ignored
     * @return never returns normally
     * @throws IllegalAccessException always
     */
    public String getStartsWith(String name) throws IllegalAccessException
    {
        throw new IllegalAccessException("brrffz");
    }

    /**
     * Adds a startsWith rule. Every ',' in both arguments is replaced by '.'
     * before the rule is stored (presumably because '.' cannot be used inside
     * a mapped property key - TODO confirm).
     *
     * @param name the prefix to look for
     * @param rep the replacement for the prefix
     */
    public void setStartsWith(String name, String rep)
    {
        addHostStartsWithReplace(name.replace(',','.'), rep.replace(',','.'));
    }

    /**
     * Not supported.
     *
     * @param name ignored
     * @return never returns normally
     * @throws IllegalAccessException always
     */
    public String getEndsWith(String name) throws IllegalAccessException
    {
        throw new IllegalAccessException("brrffz");
    }

    /**
     * Adds an endsWith rule. Every ',' in both arguments is replaced by '.'
     * before the rule is stored.
     *
     * @param name the suffix to look for
     * @param rep the replacement for the suffix
     */
    public void setEndsWith(String name, String rep)
    {
        this.addHostEndsWithReplace(name.replace(',','.'), rep.replace(',','.'));
    }

    /**
     * Adds a synonym rule. Every ',' in both arguments is replaced by '.'
     * before the rule is stored.
     *
     * @param name the host name to replace
     * @param syn the name it is replaced with
     */
    public void setSynonym(String name, String syn)
    {
        addSynonym(name.replace(',','.'), syn.replace(',','.'));
    }

    /**
     * Not supported.
     *
     * @param name ignored
     * @return never returns normally
     * @throws IllegalAccessException always
     */
    public String getSynonym(String name) throws IllegalAccessException
    {
        throw new IllegalAccessException("brrffz");
    }

    /**
     * Registers syn as the replacement for the host name <code>name</code>.
     * An existing entry for the same name is overwritten.
     *
     * @param name the host name to replace
     * @param syn the name it is replaced with
     */
    public void addSynonym(String name, String syn)
    {
        System.out.println("adding synonym " + name + " -> " + syn);
        synonym.put(name, syn);
    }

    /**
     * Transforms a host name if a rule is found.
     *
     * @param hostName the host name to transform; may be null
     * @return the possibly changed host name, or null if hostName was null
     */
    public String resolveHost(String hostName)
    {
        if(hostName == null)
        {
            return null;
        }
        for(int i=0; i<startsWithSize; i++)
        {
            String[] test = (String[])startsWithArray.get(i);
            if(hostName.startsWith(test[0]))
            {
                hostName = test[1] + hostName.substring(test[0].length());
                break; // only the first matching prefix rule is applied
            }
        }
        for(int i=0; i<endsWithSize; i++)
        {
            String[] test = (String[])endsWithArray.get(i);
            if(hostName.endsWith(test[0]))
            {
                hostName = hostName.substring(0, hostName.length() - test[0].length()) + test[1];
                break; // only the first matching suffix rule is applied
            }
        }
        // the synonym lookup sees the name after prefix/suffix rewriting
        String syn = (String)synonym.get(hostName);
        return syn != null ? syn : hostName;
    }

    /**
     * Appends a rule that replaces a leading <code>startsWith</code> with
     * <code>replace</code>.
     *
     * @param startsWith the prefix to look for
     * @param replace the replacement for the prefix
     */
    public void addHostStartsWithReplace(String startsWith, String replace)
    {
        System.out.println("adding sw replace " + startsWith + " -> " + replace);
        startsWithArray.add(new String[] { startsWith, replace });
        startsWithSize++;
    }

    /**
     * Appends a rule that replaces a trailing <code>endsWith</code> with
     * <code>replace</code>.
     *
     * @param endsWith the suffix to look for
     * @param replace the replacement for the suffix
     */
    public void addHostEndsWithReplace(String endsWith, String replace)
    {
        System.out.println("adding ew replace " + endsWith + " -> " + replace);
        endsWithArray.add(new String[] { endsWith, replace });
        endsWithSize++;
    }

    // /** The pattern cache to compile and store patterns */
    // private PatternCache __patternCache;
    // /** The hashtable to cache higher-level expressions */
    // private Cache __expressionCache;
    // /** The pattern matcher to perform matching operations. */
    // private Perl5Matcher __matcher = new Perl5Matcher();
    //
    // public void addReplaceRegEx(String findRegEx, String replaceRegEx, boolean greedy)
    // {
    // int compileOptions = Perl5Compiler.CASE_INSENSITIVE_MASK;
    // int numSubstitutions = 1;
    // if(greedy)
    // {
    // numSubstitutions = Util.SUBSTITUTE_ALL;
    // }
    //
    // Pattern compiledPattern = __patternCache.getPattern(findRegEx, compileOptions);
    // Perl5Substitution substitution = new Perl5Substitution(replaceRegEx, numInterpolations);
    // ParsedSubstitutionEntry entry = new ParsedSubstitutionEntry(compiledPattern, substitution, numSubstitutions);
    // __expressionCache.addElement(expression, entry);
    //
    // result = Util.substitute(__matcher, compiledPattern, substitution,
    // input, numSubstitutions);
    //
    // __lastMatch = __matcher.getMatch();
    //
    // return result;
    // }

}


1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleStringTokenizer.java

Index: SimpleStringTokenizer.java
===================================================================
package de.lanlab.larm.util;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/


/**
 * Splits a string into tokens at every occurrence of <b>one</b> delimiter
 * character. In contrast to Sun's StringTokenizer, two delimiters that follow
 * each other directly produce an empty token instead of being merged.
 *
 * @author Clemens Marschner
 * @created 24 March 2002
 */
public class SimpleStringTokenizer
{

    /** the string being tokenized */
    String string;

    /** index at which the next token starts */
    int currPos;
    /** index of the last character of the string (length - 1) */
    int maxPos;
    /** the delimiter character */
    char delim;


    /**
     * Creates a tokenizer positioned at the start of the given string.
     *
     * @param string the string to be tokenized
     * @param delim the delimiter that splits the string
     */
    public SimpleStringTokenizer(String string, char delim)
    {
        setString(string);
        setDelim(delim);
    }


    /**
     * Changes the delimiter. The current position is left untouched.
     *
     * @param delim the new delimiter
     */
    public void setDelim(char delim)
    {
        this.delim = delim;
    }


    /**
     * Replaces the string and starts over at its first character, so that one
     * tokenizer object can be reused for several strings.
     *
     * @param string string to be tokenized
     */
    public void setString(String string)
    {
        this.string = string;
        reset();

        final int length = string.length();
        maxPos = length - 1;
    }


    /**
     * Moves the tokenizer back to the beginning of the current string.
     */
    public void reset()
    {
        currPos = 0;
    }


    /**
     * Tells whether the current position is still inside the string.
     *
     * @return false if the end is reached, true otherwise
     */
    public boolean hasMore()
    {
        return maxPos >= currPos;
    }


    /**
     * Returns the characters between the current position and the next
     * delimiter (or the end of the string) and advances past that delimiter.
     * Returns an empty string if the end is already reached.
     *
     * @return the next token, possibly empty, never null
     * @see java.util.StringTokenizer#nextToken
     */
    public String nextToken()
    {
        final int delimIndex = string.indexOf(delim, currPos);
        // without a further delimiter the token runs to the end of the string
        final int tokenEnd = (delimIndex == -1) ? maxPos + 1 : delimIndex;

        final String token = (tokenEnd > currPos) ? string.substring(currPos, tokenEnd) : "";

        // continue behind the delimiter
        currPos = tokenEnd + 1;
        return token;
    }
}




1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/StoreLogFile.java

Index: StoreLogFile.java
===================================================================
package de.lanlab.larm.util;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

import java.io.*;
import java.util.*;
import de.lanlab.larm.parser.*;
import java.net.*;
import de.lanlab.larm.fetcher.*;
import de.lanlab.larm.net.*;

/**
 * Utility class for accessing page files through the store.log file.
 * Works like an iterator: every call to {@link #next()} consumes one line of
 * store.log and returns it as a {@link PageFileEntry}.
 */
public class StoreLogFile implements Iterator
{

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove()
    {
        throw new UnsupportedOperationException();
    }


    /**
     * One parsed line of store.log. If the line describes a page with a
     * size greater than zero and all numeric columns could be parsed,
     * hasPageFileEntry is true and the page contents can be read
     * through {@link #getInputStream()}.
     *
     * @author Clemens Marschner
     * @version 1.0
     */
    public class PageFileEntry
    {
        String url;
        int pageFileNo;
        int resultCode;
        String mimeType;
        int size;
        String title;
        int pageFileOffset;
        File pageFileDirectory;
        boolean hasPageFileEntry;
        int isFrame;

        /**
         * Stream over the <code>size</code> bytes of this entry, which start
         * at pageFileOffset inside the file pagefile_(pageFileNo).pfl.
         * Reads never go beyond the end of the entry.
         */
        class PageFileInputStream extends InputStream
        {
            InputStream pageFileIS;
            /** number of bytes of this entry consumed so far */
            long offset;

            /**
             * Opens the page file and positions it at the start of the entry.
             *
             * @throws IOException if the page file cannot be opened or positioned
             */
            public PageFileInputStream() throws IOException
            {
                pageFileIS = new FileInputStream(new File(pageFileDirectory, "pagefile_" + pageFileNo + ".pfl"));
                offset = 0;
                try
                {
                    // skip() may skip fewer bytes than requested, so repeat
                    // until the start of the entry is reached
                    long toSkip = pageFileOffset;
                    while(toSkip > 0)
                    {
                        long skipped = pageFileIS.skip(toSkip);
                        if(skipped <= 0)
                        {
                            break;
                        }
                        toSkip -= skipped;
                    }
                }
                catch(IOException e)
                {
                    // don't leak the file handle if positioning fails
                    pageFileIS.close();
                    throw e;
                }
            }

            public int available() throws IOException
            {
                return Math.min(pageFileIS.available(), (int)(size - offset));
            }

            public void close() throws IOException
            {
                pageFileIS.close();
            }

            /**
             * Marking is not supported.
             *
             * @throws UnsupportedOperationException always
             */
            public void mark(int readLimit)
            {
                throw new UnsupportedOperationException();
            }

            public boolean markSupported()
            {
                return false;
            }

            public int read() throws IOException
            {
                if(offset >= size)
                {
                    return -1;
                }
                int c = pageFileIS.read();
                if(c != -1)
                {
                    offset ++;
                }
                return c;
            }

            public int read(byte[] b) throws IOException
            {
                return read(b, 0, b.length);
            }

            /**
             * Reads up to maxLen bytes, but never more than what is left of
             * this entry.
             *
             * @return the number of bytes read, 0 if maxLen is 0, or -1 if
             *         the end of the entry (or of the page file) is reached
             */
            public int read(byte[] b, int off, int maxLen) throws IOException
            {
                if(maxLen == 0)
                {
                    // InputStream contract: a zero-length read returns 0
                    return 0;
                }
                long remaining = size - offset;
                if(remaining <= 0)
                {
                    return -1;
                }
                // bound the request by the end of the entry; the underlying
                // stream validates off/len against the array
                int len = (int)Math.min(remaining, (long)maxLen);
                len = pageFileIS.read(b, off, len);
                if(len != -1)
                {
                    offset += len;
                }
                return len;
            }

            /**
             * Skips up to n bytes, but never beyond the end of this entry.
             *
             * @return the number of bytes actually skipped; 0 if n is not
             *         positive or the end of the entry is reached
             */
            public long skip(long n) throws IOException
            {
                long remaining = size - offset;
                if(n <= 0 || remaining <= 0)
                {
                    // never move the underlying stream backwards
                    return 0;
                }
                n = pageFileIS.skip(Math.min(n, remaining));
                if(n > 0)
                {
                    offset+=n;
                }
                return n;
            }



        }

        /**
         * Parses one tab separated line of store.log. The columns are read in
         * this order: an ignored first column, url, isFrame, anchor (ignored),
         * resultCode, mimeType, size, title and - only if size is greater
         * than zero - pageFileNo and pageFileOffset.
         * <p>
         * If a numeric column cannot be parsed, parsing stops silently: the
         * fields read so far keep their values and hasPageFileEntry stays
         * false.
         *
         * @param storeLogLine the line to parse
         * @param pageFileDirectory the directory that contains the page files
         */
        public PageFileEntry(String storeLogLine, File pageFileDirectory)
        {
            String column=null;
            SimpleStringTokenizer t = new SimpleStringTokenizer(storeLogLine, '\t');
            try
            {

                hasPageFileEntry = false;
                t.nextToken();
                url = t.nextToken();
                column = "isFrame";
                isFrame = Integer.parseInt(t.nextToken());
                t.nextToken(); // anchor
                column = "resultCode";
                resultCode = Integer.parseInt(t.nextToken());
                mimeType = t.nextToken();
                column = "size";
                size = Integer.parseInt(t.nextToken());
                title = t.nextToken();
                if(size > 0)
                {
                    column = "pageFileNo";
                    pageFileNo = Integer.parseInt(t.nextToken());
                    column = "pageFileOffset";
                    pageFileOffset = Integer.parseInt(t.nextToken());
                    this.pageFileDirectory = pageFileDirectory;
                    hasPageFileEntry = true;
                }
            }
            catch(NumberFormatException e) // possibly tab characters in title. ignore
            {
                //System.out.println(e + " at " + url + " in column " + column);
            }
        }

        /**
         * @return a stream over the page contents, or null if this entry has
         *         no page file entry
         * @throws IOException if the page file cannot be opened
         */
        public InputStream getInputStream() throws IOException
        {
            if(hasPageFileEntry)
            {
                return new PageFileInputStream();
            }
            else return null;
        }

    }

    BufferedReader reader;
    boolean isOpen = false;
    File storeLog;

    /**
     * line that hasNext() has already read from the reader but next() has not
     * handed out yet; null if no line is buffered
     */
    private String bufferedLine = null;

    /**
     *
     * @param storeLog location of store.log from LogStorage. pagefile_xy.pfl
     * must be in the same directory
     * @throws IOException if store.log cannot be opened
     */
    public StoreLogFile(File storeLog) throws IOException
    {
        this.storeLog = storeLog;
        reader = new BufferedReader(new FileReader(storeLog));
        isOpen = true; // unless exception

    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if closing fails
     */
    public void close() throws IOException
    {
        isOpen = false;
        bufferedLine = null;
        reader.close();
    }

    /**
     * Returns the next unread line without consuming it. The line is kept in
     * bufferedLine instead of using mark()/reset() on the reader, because
     * reset() can fail for lines longer than the mark's read-ahead limit.
     *
     * @return the next line, or null if the reader has no further line
     */
    private String peekLine() throws IOException
    {
        if(bufferedLine == null)
        {
            bufferedLine = reader.readLine();
        }
        return bufferedLine;
    }

    /**
     * @return true if store.log contains another line
     * @throws RuntimeException if reading store.log fails
     */
    public boolean hasNext()
    {
        try
        {
            return peekLine() != null;
        }
        catch(IOException e)
        {
            throw new RuntimeException("IOException occured", e);
        }
    }

    /**
     * @return a StoreLogFile.PageFileEntry with the current file
     * @throws NoSuchElementException if store.log has no further line
     * @throws RuntimeException if reading store.log fails
     */
    public Object next()
    {
        String line;
        try
        {
            line = peekLine();
        }
        catch(IOException e)
        {
            throw new RuntimeException("IOException occured", e);
        }
        if(line == null)
        {
            throw new NoSuchElementException("no more entries in " + storeLog);
        }
        bufferedLine = null; // the line is consumed now
        return new PageFileEntry(line, storeLog.getParentFile());
    }




    // static SimpleLogger log;
    // static PageFileEntry entry;
    // static ArrayList foundURLs;
    // static URL base;
    // static URL contextUrl;
    //
    // static void test1(StoreLogFile store) throws IOException
    // {
    // while(store.hasNext())
    // {
    // PageFileEntry entry = store.next();
    // if(entry.mimeType.equals("text/plain") && entry.hasPageFileEntry)
    // {
    // BufferedReader r = new BufferedReader(new InputStreamReader(entry.getInputStream()));
    // String l;
    // while((l = r.readLine()) != null)
    // {
    // System.out.println(entry.url + " >> " + l);
    // }
    // r.close();
    // }
    // //System.out.println(entry.title);
    // }
    // }
    // static void test2(StoreLogFile store) throws Exception
    // {
    // MessageHandler msgH = new MessageHandler();
    // log = new SimpleLogger("errors.log");
    // msgH.addListener(new URLVisitedFilter(log, 100000));
    // final de.lanlab.larm.net.HostManager hm = new de.lanlab.larm.net.HostManager(1000);
    // hm.setHostResolver(new HostResolver());
    //
    // while(store.hasNext())
    // {
    // entry = store.next();
    // foundURLs = new ArrayList();
    // if(entry.mimeType.startsWith("text/html") && entry.hasPageFileEntry)
    // {
    // Tokenizer t = new Tokenizer();
    // base = new URL(entry.url);
    // contextUrl = new URL(entry.url);
    //
    // t.setLinkHandler(new LinkHandler()
    // {
    //
    // public void handleLink(String link, String anchor, boolean isFrame)
    // {
    // try
    // {
    // // cut out Ref part
    //
    //
    // int refPart = link.indexOf("#");
    // //System.out.println(link);
    // if (refPart == 0)
    // {
    // return;
    // }
    // else if (refPart > 0)
    // {
    // link = link.substring(0, refPart);
    // }
    //
    // URL url = null;
    // if (link.startsWith("http:"))
    // {
    // // distinguish between absolute and relative URLs
    //
    // url = new URL(link);
    // }
    // else
    // {
    // // relative url
    // url = new URL(base, link);
    // }
    //
    // URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame ? URLMessage.LINKTYPE_FRAME : URLMessage.LINKTYPE_ANCHOR, anchor, hm.getHostResolver());
    //
    // String urlString = urlMessage.getURLString();
    //
    // foundURLs.add(urlMessage);
    // //messageHandler.putMessage(new actURLMessage(url)); // put them in the very end
    // }
    // catch (MalformedURLException e)
    // {
    // //log.log("malformed url: base:" + base + " -+- link:" + link);
    // log.log("warning: " + e.getClass().getName() + ": " + e.getMessage());
    // }
    // catch (Exception e)
    // {
    // log.log("warning: " + e.getClass().getName() + ": " + e.getMessage());
    // // e.printStackTrace();
    // }
    //
    // }
    //
    //
    // /**
    // * called when a BASE tag was found
    // *
    // * @param base the HREF attribute
    // */
    // public void handleBase(String baseString)
    // {
    // try
    // {
    // base = new URL(baseString);
    // }
    // catch (MalformedURLException e)
    // {
    // log.log("warning: " + e.getClass().getName() + ": " + e.getMessage() + " while converting '" + base + "' to URL in document " + contextUrl);
    // }
    // }
    //
    // public void handleTitle(String value)
    // {}
    //
    //
    // });
    // t.parse(new BufferedReader(new InputStreamReader(entry.getInputStream())));
    // msgH.putMessages(foundURLs);
    // }
    //
    // }
    //
    // }
    //
    // public static void main(String[] args) throws Exception
    // {
    // StoreLogFile store = new StoreLogFile(new File("c:/java/jakarta-lucene-sandbox/contributions/webcrawler-LARM/logs/store.log"));
    // test2(store);
    // }

}





--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>