Mailing List Archive

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net HostInfo.java HostManager.java URLNormalizer.java
cmarschner 2002/06/17 07:00:14

Added: contributions/webcrawler-LARM/src/de/lanlab/larm/net
HostInfo.java HostManager.java URLNormalizer.java
Log:
moved HostInfo/HostManager to larm.net package; added URLNormalizer

Revision Changes Path
1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostInfo.java

Index: HostInfo.java
===================================================================
/*
* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package de.lanlab.larm.net;

import java.util.HashMap;
import java.net.*;
import de.lanlab.larm.util.CachingQueue;
import de.lanlab.larm.util.Queue;
import java.util.LinkedList;
import de.lanlab.larm.fetcher.Message;

/**
 * Per-host bookkeeping record used by the HostManager. It tracks how often a
 * host has answered badly (a host that fails too often, or is flagged as
 * unreachable, is no longer considered healthy), the state of the robots.txt
 * handling for the host, and the requests that were queued for it while
 * robots.txt is being loaded.
 *
 * @author    Clemens Marschner
 * @created   16. Februar 2002
 * @version   $Id: HostInfo.java,v 1.1 2002/06/17 14:00:13 cmarschner Exp $
 */
public class HostInfo
{
    /** shared placeholder used whenever no disallowed path prefixes are known */
    final static String[] emptyKeepOutDirectories = new String[0];

    /** numeric id handed in by the creator of this record */
    int id;

    /** remaining "strikes"; decremented by {@link #badRequest()} */
    int healthyCount = 5;

    /** false once the host has been flagged as unreachable */
    boolean isReachable = true;

    /** value last passed to {@link #setRobotsChecked(boolean, String[])} */
    boolean robotTxtChecked = false;

    /** path prefixes that must not be fetched (robot exclusion); never null */
    String[] disallows;

    /** true while robots.txt is being loaded for this host */
    boolean isLoadingRobotsTxt = false;

    /** requests held back for this host; null until loading starts or after removeQueue() */
    Queue queuedRequests = null;

    /** the name this record was created with */
    String hostName;


    /**
     * Creates a record for the given host with an empty disallow list.
     *
     * @param hostName  name of the host this record describes
     * @param id        numeric id of the host
     */
    public HostInfo(String hostName, int id)
    {
        this.id = id;
        this.disallows = emptyKeepOutDirectories;
        this.hostName = hostName;
    }


    /**
     * Drops the reference to the request queue. The queue accessors below
     * fail with a NullPointerException until a new queue has been created by
     * {@link #setLoadingRobotsTxt(boolean) setLoadingRobotsTxt(true)}.
     */
    public void removeQueue()
    {
        this.queuedRequests = null;
    }


    /**
     * Returns the numeric id of this host.
     *
     * @return   the id given to the constructor
     */
    public int getId()
    {
        return this.id;
    }


    /**
     * Adds a message to the request queue. No check is made whether a queue
     * currently exists.
     *
     * @param message  the message to be queued
     */
    public void insertIntoQueue(Message message)
    {
        this.queuedRequests.insert(message);
    }


    /**
     * Returns the name of this host.
     *
     * @return   the host name given to the constructor
     */
    public String getHostName()
    {
        return this.hostName;
    }


    /**
     * Returns the number of queued requests. No check is made whether a
     * queue currently exists.
     *
     * @return   the size reported by the request queue
     */
    public int getQueueSize()
    {
        return this.queuedRequests.size();
    }


    /**
     * Removes one entry from the request queue and returns it. No check is
     * made whether a queue currently exists.
     *
     * @return   the entry handed out by the queue's remove() method
     */
    public Message removeFromQueue()
    {
        Object removed = this.queuedRequests.remove();
        return (Message) removed;
    }


    /**
     * Tells whether this host is still worth contacting.
     *
     * @return   true as long as the host is flagged reachable and has
     *           strikes left
     */
    public boolean isHealthy()
    {
        if (!this.isReachable)
        {
            return false;
        }
        return this.healthyCount > 0;
    }


    /**
     * Records that a request to this host went wrong, for whatever reason;
     * uses up one strike.
     */
    public void badRequest()
    {
        this.healthyCount -= 1;
    }


    /**
     * Flags the host as reachable or unreachable.
     *
     * @param reachable  the new reachable flag
     */
    public void setReachable(boolean reachable)
    {
        this.isReachable = reachable;
    }


    /**
     * Returns the reachable flag.
     *
     * @return   the value last set with {@link #setReachable(boolean)}
     *           (initially true)
     */
    public boolean isReachable()
    {
        return this.isReachable;
    }


    /**
     * Returns the robots.txt "checked" flag.
     *
     * @return   the flag last set with
     *           {@link #setRobotsChecked(boolean, String[])} (initially false)
     */
    public boolean isRobotTxtChecked()
    {
        return this.robotTxtChecked;
    }


    /**
     * Tells whether robots.txt is currently being loaded. Must be
     * synchronized externally.
     *
     * @return   the value last set with {@link #setLoadingRobotsTxt(boolean)}
     */
    public boolean isLoadingRobotsTxt()
    {
        return isLoadingRobotsTxt;
    }


    /**
     * Sets the "robots.txt is loading" flag. Switching the flag on always
     * installs a fresh request queue, replacing whatever queue was
     * referenced before; switching it off leaves the queue untouched.
     *
     * @param isLoading  the new value of the flag
     */
    public void setLoadingRobotsTxt(boolean isLoading)
    {
        isLoadingRobotsTxt = isLoading;
        if (!isLoading)
        {
            return;
        }
        // NOTE(review): the meaning of the second argument (100) is defined by CachingQueue
        queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100);
    }


    /**
     * Stores the outcome of the robots.txt check.
     *
     * @param isChecked  the new value of the "checked" flag
     * @param disallows  path prefixes that must not be fetched; null is
     *                   treated as "nothing is disallowed"
     */
    public void setRobotsChecked(boolean isChecked, String[] disallows)
    {
        robotTxtChecked = isChecked;
        this.disallows = (disallows == null) ? emptyKeepOutDirectories : disallows;
    }


    /**
     * Checks a path against the disallow list.
     *
     * @param path  the path to be tested
     * @return      false if the path starts with one of the disallowed
     *              prefixes, true otherwise
     */
    public synchronized boolean isAllowed(String path)
    {
        // the list is assumed to be short, so a linear scan is good enough;
        // disallows is never null (see constructor and setRobotsChecked)
        final int count = disallows.length;
        int index = 0;
        while (index < count)
        {
            if (path.startsWith(disallows[index]))
            {
                return false;
            }
            index++;
        }
        return true;
    }

}



1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java

Index: HostManager.java
===================================================================
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

package de.lanlab.larm.net;

import java.util.HashMap;

/**
 * Registry that maps host names to their {@link HostInfo} records. Several
 * names (synonyms) may refer to the same record. Every newly created record
 * receives the next value of a counter that is shared by all HostManager
 * instances.
 * <p>
 * Only the id counter is protected by a lock; access to the underlying map
 * is not synchronized by this class.
 *
 * @author    Administrator
 * @created   16. Februar 2002
 * @version   $Id: HostManager.java,v 1.1 2002/06/17 14:00:13 cmarschner Exp $
 */
public class HostManager
{
    /** host name (or synonym) -> HostInfo */
    HashMap hosts;

    /** next id to hand out; shared by all instances, guarded by HostManager.class */
    static int hostCount = 0;


    /**
     * Constructor for the HostManager object
     *
     * @param initialCapacity  initial capacity of the underlying hash map
     */
    public HostManager(int initialCapacity)
    {
        hosts = new HashMap(initialCapacity);
    }


    /**
     * Registers a host name. If the name is already known, the existing
     * record is returned; otherwise a new record with a fresh id is created
     * and stored. Names are expected to be lower case already; a name that
     * is not is still registered, but a stack trace is printed to help
     * locate the caller.
     *
     * @param hostName  the (lower case) host name
     * @return          the record registered under that name
     */
    public HostInfo put(String hostName)
    {
        HostInfo known = (HostInfo) hosts.get(hostName);
        if (known != null)
        {
            return known;
        }

        int hostID;
        // hostCount is static, so it has to be guarded by a lock that is
        // shared by all instances, not by the monitor of this instance
        synchronized (HostManager.class)
        {
            hostID = hostCount++;
        }
        HostInfo hi = new HostInfo(hostName, hostID);
        hosts.put(hostName, hi);

        // Locale.ENGLISH keeps the check independent of the default locale
        // (e.g. the Turkish dotless i)
        if (!hostName.equals(hostName.toLowerCase(java.util.Locale.ENGLISH)))
        {
            // diagnostic only: shows who passed a host name that was not normalized
            new Exception("HostManager.put: host name is not lower case: " + hostName).printStackTrace();
        }
        return hi;
    }


    /**
     * Looks up the record for a host name, registering the name first if it
     * is not known yet.
     *
     * @param hostName  the host name
     * @return          the record for that name, never null
     */
    public HostInfo getHostInfo(String hostName)
    {
        HostInfo hi = (HostInfo) hosts.get(hostName);
        if (hi == null)
        {
            return put(hostName);
        }
        return hi;
    }


    /**
     * Returns the number of registered names, synonyms included.
     *
     * @return   the size of the underlying map
     */
    public int getSize()
    {
        return hosts.size();
    }


    /**
     * Makes a second name refer to the record of an existing host. The host
     * itself is registered if it was unknown; a record previously stored
     * under the synonym is replaced.
     *
     * @param hostName  the name of the host
     * @param synonym   the additional name for that host
     * @return          the record now shared by both names
     */
    public HostInfo addSynonym(String hostName, String synonym)
    {
        HostInfo info = getHostInfo(hostName);
        hosts.put(synonym, info);
        return info;
    }

}



1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/URLNormalizer.java

Index: URLNormalizer.java
===================================================================
package de.lanlab.larm.net;
/*
* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.*;
import java.net.*;


/**
 * Brings http URLs into a canonical form so that different spellings of the
 * same address can be recognized as equal: the host name is lower-cased and
 * replaced by the name registered in the HostManager, the default port is
 * removed, and the path/query part is lower-cased, uniformly escaped and
 * stripped of redundant parts ("//", "/./", empty query, index.* and
 * default.* pages).
 *
 * @author    Administrator
 * @created   14. Juni 2002
 */
public class URLNormalizer
{
    // states of the finite state machine in normalizePath()
    /** the previous character was a "/" that ended a path segment */
    final static int NP_SLASH = 1;
    /** ordinary character expected */
    final static int NP_CHAR = 2;
    /** a "%" was read, the first hex digit is expected */
    final static int NP_PERCENT = 3;
    /** a "." was read directly after a "/" */
    final static int NP_POINT = 4;
    /** "%" and one hex digit were read, the second hex digit is expected */
    final static int NP_HEX = 5;

    /** digits used when a byte is escaped verbatim */
    private final static char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    /**
     * Output form for every character code from 0 to 255; char arrays are
     * used instead of strings for faster processing. Digits, "$", "-", "."
     * and "_" are written literally, ASCII letters are written in lower
     * case, everything else is written as "%XX" with upper case hex digits.
     * The entries for 0xC0 to 0xDE (except 0xD7) contain the escape of the
     * code 0x20 higher, which is the corresponding lower case letter in
     * ISO-8859-1.
     */
    protected static char[][] charMap = {
        // 0x00 - 0x1F
        {'%', '0', '0'}, {'%', '0', '1'}, {'%', '0', '2'}, {'%', '0', '3'}, {'%', '0', '4'}, {'%', '0', '5'}, {'%', '0', '6'}, {'%', '0', '7'}, {'%', '0', '8'}, {'%', '0', '9'}, {'%', '0', 'A'}, {'%', '0', 'B'}, {'%', '0', 'C'}, {'%', '0', 'D'}, {'%', '0', 'E'}, {'%', '0', 'F'},
        {'%', '1', '0'}, {'%', '1', '1'}, {'%', '1', '2'}, {'%', '1', '3'}, {'%', '1', '4'}, {'%', '1', '5'}, {'%', '1', '6'}, {'%', '1', '7'}, {'%', '1', '8'}, {'%', '1', '9'}, {'%', '1', 'A'}, {'%', '1', 'B'}, {'%', '1', 'C'}, {'%', '1', 'D'}, {'%', '1', 'E'}, {'%', '1', 'F'},
        // 0x20 - 0x3F
        {'%', '2', '0'}, {'%', '2', '1'}, {'%', '2', '2'}, {'%', '2', '3'}, {'$'}, {'%', '2', '5'}, {'%', '2', '6'}, {'%', '2', '7'}, {'%', '2', '8'}, {'%', '2', '9'}, {'%', '2', 'A'}, {'%', '2', 'B'}, {'%', '2', 'C'}, {'-'}, {'.'}, {'%', '2', 'F'},
        {'0'}, {'1'}, {'2'}, {'3'}, {'4'}, {'5'}, {'6'}, {'7'}, {'8'}, {'9'}, {'%', '3', 'A'}, {'%', '3', 'B'}, {'%', '3', 'C'}, {'%', '3', 'D'}, {'%', '3', 'E'}, {'%', '3', 'F'},
        // 0x40 - 0x5F (upper case letters are folded to lower case)
        {'%', '4', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'}, {'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'},
        {'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'}, {'%', '5', 'B'}, {'%', '5', 'C'}, {'%', '5', 'D'}, {'%', '5', 'E'}, {'_'},
        // 0x60 - 0x7F
        {'%', '6', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'}, {'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'},
        {'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'}, {'%', '7', 'B'}, {'%', '7', 'C'}, {'%', '7', 'D'}, {'%', '7', 'E'}, {'%', '7', 'F'},
        // 0x80 - 0xBF
        {'%', '8', '0'}, {'%', '8', '1'}, {'%', '8', '2'}, {'%', '8', '3'}, {'%', '8', '4'}, {'%', '8', '5'}, {'%', '8', '6'}, {'%', '8', '7'}, {'%', '8', '8'}, {'%', '8', '9'}, {'%', '8', 'A'}, {'%', '8', 'B'}, {'%', '8', 'C'}, {'%', '8', 'D'}, {'%', '8', 'E'}, {'%', '8', 'F'},
        {'%', '9', '0'}, {'%', '9', '1'}, {'%', '9', '2'}, {'%', '9', '3'}, {'%', '9', '4'}, {'%', '9', '5'}, {'%', '9', '6'}, {'%', '9', '7'}, {'%', '9', '8'}, {'%', '9', '9'}, {'%', '9', 'A'}, {'%', '9', 'B'}, {'%', '9', 'C'}, {'%', '9', 'D'}, {'%', '9', 'E'}, {'%', '9', 'F'},
        {'%', 'A', '0'}, {'%', 'A', '1'}, {'%', 'A', '2'}, {'%', 'A', '3'}, {'%', 'A', '4'}, {'%', 'A', '5'}, {'%', 'A', '6'}, {'%', 'A', '7'}, {'%', 'A', '8'}, {'%', 'A', '9'}, {'%', 'A', 'A'}, {'%', 'A', 'B'}, {'%', 'A', 'C'}, {'%', 'A', 'D'}, {'%', 'A', 'E'}, {'%', 'A', 'F'},
        {'%', 'B', '0'}, {'%', 'B', '1'}, {'%', 'B', '2'}, {'%', 'B', '3'}, {'%', 'B', '4'}, {'%', 'B', '5'}, {'%', 'B', '6'}, {'%', 'B', '7'}, {'%', 'B', '8'}, {'%', 'B', '9'}, {'%', 'B', 'A'}, {'%', 'B', 'B'}, {'%', 'B', 'C'}, {'%', 'B', 'D'}, {'%', 'B', 'E'}, {'%', 'B', 'F'},
        // 0xC0 - 0xDF (folded onto 0xE0 - 0xFF, except 0xD7 and 0xDF)
        {'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E', '4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'}, {'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%', 'E', 'F'},
        {'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F', '4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'D', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'}, {'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%', 'D', 'F'},
        // 0xE0 - 0xFF
        {'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E', '4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'}, {'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%', 'E', 'F'},
        {'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F', '4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'F', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'}, {'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%', 'F', 'F'},
    };


    /**
     * Returns the value of a hexadecimal digit.
     *
     * @param c  the character to be examined
     * @return   0..15 for '0'-'9', 'a'-'f' and 'A'-'F', -1 for any other
     *           character
     */
    private static int hexValue(char c)
    {
        if (c >= '0' && c <= '9')
        {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f')
        {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F')
        {
            return c - 'A' + 10;
        }
        return -1;
    }


    /**
     * Appends a byte as "%XX" exactly as given, without the case folding
     * that charMap applies.
     *
     * @param w  the buffer to append to
     * @param b  the byte value (only the lowest 8 bits are used)
     */
    private static void appendEscapedByte(StringBuffer w, int b)
    {
        w.append('%');
        w.append(HEX_DIGITS[(b >> 4) & 0x0F]);
        w.append(HEX_DIGITS[b & 0x0F]);
    }


    /**
     * Appends a character above 0xFF as escaped UTF-8 bytes (two bytes up to
     * 0x7FF, three bytes above). The bytes must not go through charMap: its
     * lower-casing of 0xC0-0xDE would turn two-byte lead bytes into
     * three-byte lead bytes. Note that the overall result is still not a
     * pure UTF-8 URL, since characters from 0x80 to 0xFF are written as one
     * byte. See http://www.w3.org/International/O-URL-code.html
     *
     * @param w  the buffer to append to
     * @param c  a character with a code above 0xFF
     */
    private static void appendUtf8(StringBuffer w, char c)
    {
        if (c <= 0x07FF)
        {
            appendEscapedByte(w, 0xC0 | (c >> 6));
            appendEscapedByte(w, 0x80 | (c & 0x3F));
        }
        else
        {
            appendEscapedByte(w, 0xE0 | (c >> 12));
            appendEscapedByte(w, 0x80 | ((c >> 6) & 0x3F));
            appendEscapedByte(w, 0x80 | (c & 0x3F));
        }
    }


    /**
     * Normalizes the file part (path plus optional query) of a URL:
     * <ul>
     * <li>an empty path becomes "/"</li>
     * <li>characters are lower-cased and escaped in a uniform way, i.e.
     *     " ", "+" and "%20" all become "%20"</li>
     * <li>"//" and "/./" in the path are reduced to "/"</li>
     * <li>"%" that does not start a valid two-digit escape is written as
     *     "%25" and the following characters are treated normally</li>
     * <li>within the query, "/" and further "?" are escaped</li>
     * <li>an empty query ("?" at the very end) is removed</li>
     * <li>a file name starting with "index." or "default." is removed</li>
     * </ul>
     * ".." segments are not resolved.
     *
     * @param path             the file part of the URL
     * @return                 the normalized file part
     * @exception IOException  never thrown by the current implementation;
     *                         kept for compatibility with existing callers
     */
    protected static String normalizePath(String path)
        throws IOException
    {
        // rule 1: if the path is empty, return "/"
        if (path.length() == 0)
        {
            return "/";
        }

        // rule 2: run the finite state machine over all characters
        StringBuffer w = new StringBuffer((int) (path.length() * 1.5));

        int status = NP_CHAR;

        int pos = 0;
        int length = path.length();
        char savedChar = '?';     // first hex digit of a pending escape
        int hexChar = '?';        // value of the pending escape (high nibble)
        int pathPos = -1;         // position after the last "/" of the path
        int questionPos = -1;     // position of the "?" that starts the query
        boolean isInQuery = false;    // question mark reached?

        while (pos < length)
        {
            char c = path.charAt(pos++);
            switch (status)
            {
                case NP_SLASH:
                    if (c == '/')
                    {
                        // ignore subsequent slashes
                    }
                    else if (c == '.')
                    {
                        status = NP_POINT;
                    }
                    else if (c == '%')
                    {
                        status = NP_PERCENT;
                    }
                    else
                    {
                        // ordinary character: look at it again in NP_CHAR
                        pos--;
                        status = NP_CHAR;
                    }
                    break;
                case NP_POINT:
                    if (c == '/')
                    {
                        // "/./": drop the segment and keep collapsing slashes
                        status = NP_SLASH;
                    }
                    else if (c == '.')
                    {
                        // ".." is not resolved; the additional dot is ignored
                    }
                    else
                    {
                        // the dot belongs to a name like ".htaccess"
                        w.append('.');
                        pos--;
                        status = NP_SLASH;
                    }
                    break;
                case NP_PERCENT:
                {
                    int high = hexValue(c);
                    if (high < 0)
                    {
                        // not an escape: write the "%" as literal and look
                        // at the character again as an ordinary one
                        w.append(charMap['%']);
                        pos--;
                        status = NP_CHAR;
                    }
                    else
                    {
                        hexChar = high << 4;
                        savedChar = c;
                        status = NP_HEX;
                    }
                    break;
                }
                case NP_HEX:
                {
                    int low = hexValue(c);
                    if (low < 0)
                    {
                        // incomplete escape: write "%" and the digit as
                        // literals, then look at the character again
                        w.append(charMap['%']);
                        w.append(charMap[savedChar]);
                        pos--;
                    }
                    else
                    {
                        // re-encode the decoded character in the uniform way
                        w.append(charMap[hexChar | low]);
                    }
                    status = NP_CHAR;
                    break;
                }
                case NP_CHAR:
                    switch (c)
                    {
                        case '%':
                            status = NP_PERCENT;
                            break;
                        case '/':
                            if (!isInQuery)
                            {
                                w.append(c);
                                pathPos = w.length();    // points to the char. after "/"
                                status = NP_SLASH;
                            }
                            else
                            {
                                w.append(charMap[c]);
                            }
                            break;
                        case '?':
                            if (!isInQuery)
                            {
                                if (pathPos == -1)
                                {
                                    // query without any path: insert the root
                                    w.append('/');
                                    pathPos = w.length();
                                }
                                questionPos = w.length();    // points to the char at "?"
                                isInQuery = true;
                            }
                            else
                            {
                                w.append(charMap[c]);
                                break;
                            }
                            // the first "?" falls through and is written literally
                        case '&':
                        case ';':
                        case '@':
                        case '=':
                            w.append(c);
                            break;
                        case '+':
                            w.append("%20");
                            break;
                        default:
                            if (c > 0xFF)
                            {
                                // outside of charMap
                                appendUtf8(w, c);
                            }
                            else
                            {
                                w.append(charMap[c]);
                            }
                            break;
                    }
                    break;
            }
        }

        // an escape that is cut off by the end of the input is kept as literal text
        if (status == NP_PERCENT)
        {
            w.append(charMap['%']);
        }
        else if (status == NP_HEX)
        {
            w.append(charMap['%']);
            w.append(charMap[savedChar]);
        }

        // rule 3: delete index.* or default.*
        if (questionPos == -1)
        {
            // no query
            questionPos = w.length();
        }
        else if (questionPos == w.length() - 1)
        {
            // empty query. assert questionPos > 0
            w.deleteCharAt(questionPos);
        }
        if (pathPos == -1)
        {
            // no slash: the file name starts at the beginning
            pathPos = 0;
        }
        if (questionPos > pathPos)
        {
            String file = w.substring(pathPos, questionPos);
            if (file.startsWith("index.") || file.startsWith("default."))
            {
                // delete default page to avoid ambiguities
                w.delete(pathPos, questionPos);
            }
        }
        return w.toString();
    }


    /**
     * Normalizes a host name: it is lower-cased and then replaced by the
     * name of the record the host manager returns for it, so that synonyms
     * end up with one common name. A host that is not known yet is
     * registered by the lookup.
     *
     * @param hostManager  the registry of known hosts
     * @param host         the host name as found in the URL
     * @return             the name stored in the host's record
     */
    protected static String normalizeHost(HostManager hostManager, String host)
    {
        // Locale.ENGLISH keeps the result independent of the default locale
        String lowerCaseHost = host.toLowerCase(java.util.Locale.ENGLISH);
        return hostManager.getHostInfo(lowerCaseHost).getHostName();
    }


    /**
     * Normalizes a URL. Only URLs with the protocol "http" are changed: the
     * host goes through {@link #normalizeHost}, an explicit port 80 is
     * removed and the file part goes through {@link #normalizePath}. Any
     * other URL is returned as it is.
     *
     * @param u            the URL to be normalized
     * @param hostManager  the registry of known hosts
     * @return             the normalized URL
     */
    public static URL normalize(URL u, HostManager hostManager)
    {
        if (!u.getProtocol().equals("http"))
        {
            return u;
        }
        try
        {
            int port = u.getPort();
            return new URL(u.getProtocol(), normalizeHost(hostManager, u.getHost()), port == 80 ? -1 : port, normalizePath(u.getFile()));
        }
        catch (MalformedURLException e)
        {
            // cannot happen as long as the parts come from a valid URL
            System.out.println("assertion failed: MalformedURLException in URLNormalizer.normalize()");
            InternalError error = new InternalError("assertion failed: MalformedURLException in URLNormalizer.normalize()");
            error.initCause(e);
            throw error;
        }
        catch (IOException e)
        {
            System.out.println("assertion failed: IOException in URLNormalizer.normalize()");
            InternalError error = new InternalError("assertion failed: IOException in URLNormalizer.normalize()");
            error.initCause(e);
            throw error;
        }
    }


    /**
     * Small demonstration: prints the normalized form of a couple of URLs.
     *
     * @param args           not used
     * @exception Exception  if one of the sample URLs cannot be parsed
     */
    public static void main(String[] args) throws Exception
    {
        HostManager hm = new HostManager(10);
        hm.addSynonym("webinfo.campus.lmu.de", "webinfo.uni-muenchen.de");
        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/conman/index.jsp?path=709"), hm));
        System.out.println(URLNormalizer.normalize(new URL("http://webinfo.uni-muenchen.de/view-i.cfm?url=http://abc/resp?a=c"), hm));
        System.out.println(URLNormalizer.normalize(new URL("http://webinfo.campus.lmu.de/view-i.cfm?url=http://abc/resp?a=c"), hm));
        System.out.println(URLNormalizer.normalize(new URL("http://www.bwl.uni-muenchen.de/default.asp?id=123"), hm));
        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/index.html"), hm));
        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de"), hm));
        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/"), hm));
        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/?"), hm));
        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?"), hm));
        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?id=abc"), hm));
        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/abcde$1?id=abc"), hm));
        URL u = new URL("http://www.lmu.de/abcde$1?id=abc");
        System.out.println("host: " + u.getHost());
        System.out.println("port: " + u.getPort());
        System.out.println(URLNormalizer.normalize(u, hm));
    }
}




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>