Mailing List Archive: cvs commit: jakarta-lucene/src/test/org/apache/lucene/search TestPositionIncrement.java

cutting 2002/08/05 10:15:00

Modified: src/java/org/apache/lucene/analysis Token.java
src/java/org/apache/lucene/index DocumentWriter.java
Added: src/test/org/apache/lucene/search TestPositionIncrement.java
Log:
Added support for Token.setPositionIncrement(int).

Revision Changes Path
1.2 +37 -0 jakarta-lucene/src/java/org/apache/lucene/analysis/Token.java

Index: Token.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/Token.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- Token.java 18 Sep 2001 16:29:50 -0000 1.1
+++ Token.java 5 Aug 2002 17:14:59 -0000 1.2
@@ -74,6 +74,8 @@
int endOffset; // end in source text
String type = "word"; // lexical type

+ private int positionIncrement = 1;
+
/** Constructs a Token with the given term text, and start & end offsets.
The type defaults to "word." */
public Token(String text, int start, int end) {
@@ -89,6 +91,41 @@
endOffset = end;
type = typ;
}
+
+ /** Set the position increment. This determines the position of this token
+ * relative to the previous Token in a {@link TokenStream}, used in phrase
+ * searching.
+ *
+ * <p>The default value is one.
+ *
+ * <p>Two common uses for this are:<ul>
+ * <li>Set it to zero to put multiple terms in the same position. This is
+ * useful if, e.g., a word has multiple stems. This way searches for
+ * phrases including either stem will match this occurrence. In this case,
+ * all but the first stem's increment should be set to zero: the increment of
+ * the first instance should be one.
+ *
+ * <li>Set it to values greater than one to inhibit exact phrase matches.
+ * If, for example, one does not want phrases to match across stop words,
+ * then one could build a stop word filter that removes stop words and also
+ * sets the increment to the number of stop words removed before each
+ * non-stop word.
+ *
+ * </ul>
+ * @param positionIncrement the new increment; must be zero or greater
+ * @throws IllegalArgumentException if positionIncrement is negative
+ * @see TermPositions
+ */
+ public void setPositionIncrement(int positionIncrement) {
+ if (positionIncrement < 0) // zero is legal: it stacks terms at one position
+ throw new IllegalArgumentException
+ ("Increment must be zero or greater: " + positionIncrement);
+ this.positionIncrement = positionIncrement;
+ }
+
+ /** Gets the position increment of this Token: the value last given to
+ * {@link #setPositionIncrement}, or one if it was never called.
+ */
+ public int getPositionIncrement() { return this.positionIncrement; }

/** Returns the Token's term text. */
public final String termText() { return termText; }

1.3 +1 -0 jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java

Index: DocumentWriter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- DocumentWriter.java 29 Jul 2002 19:11:15 -0000 1.2
+++ DocumentWriter.java 5 Aug 2002 17:15:00 -0000 1.3
@@ -165,6 +165,7 @@
TokenStream stream = analyzer.tokenStream(fieldName, reader);
try {
for (Token t = stream.next(); t != null; t = stream.next()) {
+ position += (t.getPositionIncrement() - 1);
addPosition(fieldName, t.termText(), position++);
if (position > maxFieldLength) break;
}

1.1 jakarta-lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java

Index: TestPositionIncrement.java
===================================================================
package org.apache.lucene.search;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.Reader;
import java.io.IOException;

import junit.framework.TestCase;

/** Unit test for {@link Token#setPositionIncrement(int)}: indexes a single
 * document through an analyzer that emits tokens with varying position
 * increments, then checks which two-term phrase queries match.
 *
 * @author Doug Cutting
 * @version $Revision: 1.1 $
 */
public class TestPositionIncrement extends TestCase {
  public TestPositionIncrement(String name) {
    super(name);
  }

  /** Indexes the fixed token stream and verifies exact-phrase matching
   * against the positions implied by the increments.
   *
   * @throws Exception if indexing or searching fails
   */
  public static void test() throws Exception {
    // The analyzer ignores its input and always yields the same five tokens.
    // Relative to the preceding token: "2" is two positions on (a gap),
    // "3" is adjacent to "2", "4" shares the position of "3" (increment
    // zero), and "5" is adjacent to both "3" and "4".
    Analyzer analyzer = new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenStream() {
          private final String[] TOKENS = {"1", "2", "3", "4", "5"};
          private final int[] INCREMENTS = {1, 2, 1, 0, 1};
          private int i = 0;
          public Token next() throws IOException {
            if (i == TOKENS.length) {
              return null;
            }
            Token t = new Token(TOKENS[i], i, i);
            t.setPositionIncrement(INCREMENTS[i]);
            i++;
            return t;
          }
        };
      }
    };

    RAMDirectory store = new RAMDirectory();
    IndexWriter writer = new IndexWriter(store, analyzer, true);
    try {
      Document d = new Document();
      // The field text is irrelevant; the analyzer above supplies the tokens.
      d.add(Field.Text("field", "bogus"));
      writer.addDocument(d);
      writer.optimize();
    } finally {
      writer.close();
    }

    IndexSearcher searcher = new IndexSearcher(store);
    try {
      assertPhraseHits(0, searcher, "1", "2");  // gap of two between them
      assertPhraseHits(1, searcher, "2", "3");  // adjacent
      assertPhraseHits(0, searcher, "3", "4");  // same position, not adjacent
      assertPhraseHits(1, searcher, "2", "4");  // "4" sits where "3" does
      assertPhraseHits(1, searcher, "3", "5");  // adjacent
      assertPhraseHits(1, searcher, "4", "5");  // adjacent
      assertPhraseHits(0, searcher, "2", "5");  // two positions apart
    } finally {
      searcher.close();
    }
  }

  /** Runs the two-term phrase query "first second" on "field" and asserts
   * the number of matching documents. */
  private static void assertPhraseHits(int expected, IndexSearcher searcher,
                                       String first, String second)
      throws IOException {
    PhraseQuery q = new PhraseQuery();
    q.add(new Term("field", first));
    q.add(new Term("field", second));
    Hits hits = searcher.search(q);
    assertEquals("hits for phrase \"" + first + " " + second + "\"",
                 expected, hits.length());
  }
}

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>