Mailing List Archive

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher URLScopeFilter.java
cmarschner 2002/10/22 08:21:00

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
URLScopeFilter.java
Log:
takes normalized URL string for comparisons; added logging

Revision Changes Path
1.3 +9 -4 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java

Index: URLScopeFilter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- URLScopeFilter.java 22 May 2002 23:09:17 -0000 1.2
+++ URLScopeFilter.java 22 Oct 2002 15:21:00 -0000 1.3
@@ -57,6 +57,7 @@
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Pattern;
+import de.lanlab.larm.util.*;

/**
* filter class. Tries to match a regular expression with an incoming URL
@@ -77,11 +78,13 @@
private Pattern pattern;
private Perl5Matcher matcher;
private Perl5Compiler compiler;
+ SimpleLogger log;

- public URLScopeFilter()
+ public URLScopeFilter(SimpleLogger log)
{
matcher = new Perl5Matcher();
compiler = new Perl5Compiler();
+ this.log = log;
}

public String getRexString()
@@ -108,7 +111,7 @@
{
if(message instanceof URLMessage)
{
- String urlString = ((URLMessage)message).toString();
+ String urlString = ((URLMessage)message).getNormalizedURLString();
int length = urlString.length();
char buffer[] = new char[length];
urlString.getChars(0,length,buffer,0);
@@ -117,8 +120,10 @@
boolean match = matcher.matches(buffer, pattern);
if(!match)
{
- //System.out.println("not in Scope: " + urlString);
+ //log.log("URLScopeFilter: not in scope: " + urlString);
+ log.log(message.toString());
filtered++;
+
return null;
}
}




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>