Mailing List Archive: cvs commit: jakarta-lucene/src/test/org/apache/lucene/search TestPhrasePrefixQuery.java

otis 2002/07/18 07:39:58

Added: src/java/org/apache/lucene/index MultipleTermPositions.java
src/java/org/apache/lucene/search PhrasePrefixQuery.java
src/test/org/apache/lucene/search TestPhrasePrefixQuery.java
Log:
- Classes that provide support for queries such as "microsoft app*".
Submitted by: Anders Nielsen
Reviewed by: otis

Revision Changes Path
1.1 jakarta-lucene/src/java/org/apache/lucene/index/MultipleTermPositions.java

Index: MultipleTermPositions.java
===================================================================
package org.apache.lucene.index;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.util.PriorityQueue;

/**
* Describe class <code>MultipleTermPositions</code> here.
*
* @author Anders Nielsen
* @version 1.0
*/
public class MultipleTermPositions
implements TermPositions
{
private final static class TermPositionsQueue
extends PriorityQueue
{
TermPositionsQueue(List termPositions)
throws IOException
{
initialize(termPositions.size());

Iterator i = termPositions.iterator();
while (i.hasNext())
{
TermPositions tp = (TermPositions)i.next();
if (tp.next())
put(tp);
}
}

final TermPositions peek()
{
return (TermPositions)top();
}

public final boolean lessThan(Object a, Object b)
{
return ((TermPositions)a).doc() < ((TermPositions)b).doc();
}
}

private final static class IntQueue
{
private int _arraySize = 16;

private int _index = 0;
private int _lastIndex = 0;

private int[] _array = new int[_arraySize];

final void add(int i)
{
if (_lastIndex == _arraySize)
growArray();

_array[_lastIndex++] = i;
}

final int next()
{
return _array[_index++];
}

final void sort()
{
Arrays.sort(_array, _index, _lastIndex);
}

final void clear()
{
_index = 0;
_lastIndex = 0;
}

final int size()
{
return (_lastIndex-_index);
}

private void growArray()
{
int[] newArray = new int[_arraySize*2];
System.arraycopy(_array, 0, newArray, 0, _arraySize);
_array = newArray;
_arraySize *= 2;
}
}

private int _doc;
private int _freq;

private TermPositionsQueue _termPositionsQueue;
private IntQueue _posList;

/**
* Creates a new <code>MultipleTermPositions</code> instance.
*
* @param indexReader an <code>IndexReader</code> value
* @param terms a <code>Term[]</code> value
* @exception IOException if an error occurs
*/
public MultipleTermPositions(IndexReader indexReader, Term[] terms)
throws IOException
{
List termPositions = new LinkedList();

for (int i=0; i<terms.length; i++)
termPositions.add(indexReader.termPositions(terms[i]));

_termPositionsQueue = new TermPositionsQueue(termPositions);
_posList = new IntQueue();
}

/**
* Describe <code>next</code> method here.
*
* @return a <code>boolean</code> value
* @exception IOException if an error occurs
* @see TermDocs#next()
*/
public final boolean next()
throws IOException
{
if (_termPositionsQueue.size() == 0)
return false;

_posList.clear();
_doc = _termPositionsQueue.peek().doc();

TermPositions tp;
do
{
tp = _termPositionsQueue.peek();

for (int i=0; i<tp.freq(); i++)
_posList.add(tp.nextPosition());

if (tp.next())
_termPositionsQueue.adjustTop();
else
{
_termPositionsQueue.pop();
tp.close();
}
}
while (_termPositionsQueue.size() > 0 && _termPositionsQueue.peek().doc() == _doc);

_posList.sort();
_freq = _posList.size();

return true;
}

/**
* Describe <code>nextPosition</code> method here.
*
* @return an <code>int</code> value
* @exception IOException if an error occurs
* @see TermPositions#nextPosition()
*/
public final int nextPosition()
throws IOException
{
return _posList.next();
}

/**
* Describe <code>skipTo</code> method here.
*
* @param target an <code>int</code> value
* @return a <code>boolean</code> value
* @exception IOException if an error occurs
* @see TermDocs#skipTo(int)
*/
public final boolean skipTo(int target)
throws IOException
{
while (target > _termPositionsQueue.peek().doc())
{
TermPositions tp = (TermPositions)_termPositionsQueue.pop();

if (tp.skipTo(target))
_termPositionsQueue.put(tp);
else
tp.close();
}

return next();
}

/**
* Describe <code>doc</code> method here.
*
* @return an <code>int</code> value
* @see TermDocs#doc()
*/
public final int doc()
{
return _doc;
}

/**
* Describe <code>freq</code> method here.
*
* @return an <code>int</code> value
* @see TermDocs#freq()
*/
public final int freq()
{
return _freq;
}

/**
* Describe <code>close</code> method here.
*
* @exception IOException if an error occurs
* @see TermDocs#close()
*/
public final void close()
throws IOException
{
while (_termPositionsQueue.size() > 0)
((TermPositions)_termPositionsQueue.pop()).close();
}

/**
* Describe <code>seek</code> method here.
*
* @param arg0 a <code>Term</code> value
* @exception IOException if an error occurs
* @see TermDocs#seek(Term)
*/
public void seek(Term arg0)
throws IOException
{
throw new UnsupportedOperationException();
}

/**
* Describe <code>read</code> method here.
*
* @param arg0 an <code>int[]</code> value
* @param arg1 an <code>int[]</code> value
* @return an <code>int</code> value
* @exception IOException if an error occurs
* @see TermDocs#read(int[], int[])
*/
public int read(int[] arg0, int[] arg1)
throws IOException
{
throw new UnsupportedOperationException();
}
}

1.1 jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java

Index: PhrasePrefixQuery.java
===================================================================
package org.apache.lucene.search;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultipleTermPositions;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.Query;

/**
* PhrasePrefixQuery is a generalized version of PhraseQuery, with an added
* method {@link add(Term[])}.
* To use this class, to search for the phrase "Microsoft app*" first use
* add(Term) on the term "Microsoft", then find all terms that has "app" as
* prefix using IndexReader.terms(Term), and use PhrasePrefixQuery.add(Term[]
* terms) to add them to the query.
*
* @author Anders Nielsen
* @version 1.0
*/
public class PhrasePrefixQuery
extends Query
{
private String _field;
private ArrayList _termArrays = new ArrayList();

private float _idf = 0.0f;
private float _weight = 0.0f;

private int _slop = 0;

/**
* Creates a new <code>PhrasePrefixQuery</code> instance.
*
*/
public PhrasePrefixQuery()
{
}

/**
* Describe <code>setSlop</code> method here.
*
* @param s an <code>int</code> value
*/
public void setSlop(int s)
{
_slop = s;
}

/**
* Describe <code>getSlop</code> method here.
*
* @return an <code>int</code> value
*/
public int getSlop()
{
return _slop;
}

/**
* Describe <code>add</code> method here.
*
* @param term a <code>Term</code> value
*/
public void add(Term term)
{
add(new Term[]{term});
}

/**
* Describe <code>add</code> method here.
*
* @param terms a <code>Term[]</code> value
*/
public void add(Term[] terms)
{
if (_termArrays.size() == 0)
_field = terms[0].field();

for (int i=0; i<terms.length; i++)
{
if (terms[i].field() != _field)
{
throw new IllegalArgumentException(
"All phrase terms must be in the same field (" + _field + "): "
+ terms[i]);
}
}

_termArrays.add(terms);
}

Scorer scorer(IndexReader reader)
throws IOException
{
if (_termArrays.size() == 0) // optimize zero-term case
return null;

if (_termArrays.size() == 1) // optimize one-term case
{
Term[] terms = (Term[])_termArrays.get(0);

BooleanQuery boq = new BooleanQuery();
for (int i=0; i<terms.length; i++)
boq.add(new TermQuery(terms[i]), false, false);

return boq.scorer(reader);
}

TermPositions[] tps = new TermPositions[_termArrays.size()];
for (int i=0; i<tps.length; i++)
{
Term[] terms = (Term[])_termArrays.get(i);

TermPositions p;
if (terms.length > 1)
p = new MultipleTermPositions(reader, terms);
else
p = reader.termPositions(terms[0]);

if (p == null)
return null;

tps[i] = p;
}

if (_slop == 0)
return new ExactPhraseScorer(tps, reader.norms(_field), _weight);
else
return new SloppyPhraseScorer(tps, _slop, reader.norms(_field), _weight);
}

float sumOfSquaredWeights(Searcher searcher)
throws IOException
{
Iterator i = _termArrays.iterator();
while (i.hasNext())
{
Term[] terms = (Term[])i.next();
for (int j=0; j<terms.length; j++)
_idf += Similarity.idf(terms[j], searcher);
}

_weight = _idf * boost;
return _weight * _weight;
}

void normalize(float norm)
{
_weight *= norm;
_weight *= _idf;
}

/**
* Describe <code>toString</code> method here.
*
* This method assumes that the first term in a array of terms is the
* prefix for the whole array. That might not necessarily be so.
*
* @param f a <code>String</code> value
* @return a <code>String</code> value
*/
public final String toString(String f)
{
StringBuffer buffer = new StringBuffer();
if (!_field.equals(f))
{
buffer.append(_field);
buffer.append(":");
}

buffer.append("\"");
Iterator i = _termArrays.iterator();
while (i.hasNext())
{
Term[] terms = (Term[])i.next();
buffer.append(terms[0].text() + (terms.length > 0 ? "*" : ""));
}
buffer.append("\"");

if (_slop != 0)
{
buffer.append("~");
buffer.append(_slop);
}

if (boost != 1.0f)
{
buffer.append("^");
buffer.append(Float.toString(boost));
}

return buffer.toString();
}
}

1.1 jakarta-lucene/src/test/org/apache/lucene/search/TestPhrasePrefixQuery.java

Index: TestPhrasePrefixQuery.java
===================================================================
package org.apache.lucene.search;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import junit.framework.TestCase;

import java.io.IOException;
import java.util.LinkedList;

/**
* This class tests PhrasePrefixQuery class.
*
* @author Otis Gospodnetic
* @version $Id: TestPhrasePrefixQuery.java,v 1.1 2002/07/18 14:39:58 otis Exp $
*/
public class TestPhrasePrefixQuery
extends TestCase
{
public TestPhrasePrefixQuery(String name)
{
super(name);
}

/**
*
*/
public void testPhrasePrefix()
throws IOException
{
RAMDirectory indexStore = new RAMDirectory();
IndexWriter writer = new IndexWriter(indexStore, new SimpleAnalyzer(), true);
Document doc1 = new Document();
Document doc2 = new Document();
Document doc3 = new Document();
Document doc4 = new Document();
doc1.add(Field.Text("body", "blueberry pie"));
doc2.add(Field.Text("body", "blueberry pizza"));
doc3.add(Field.Text("body", "blueberry chewing gum"));
doc4.add(Field.Text("body", "picadelly circus"));
writer.addDocument(doc1);
writer.addDocument(doc2);
writer.addDocument(doc3);
writer.addDocument(doc4);
writer.optimize();
writer.close();

IndexSearcher searcher = new IndexSearcher(indexStore);

PhrasePrefixQuery query1 = new PhrasePrefixQuery();
PhrasePrefixQuery query2 = new PhrasePrefixQuery();
query1.add(new Term("body", "blueberry"));
query2.add(new Term("body", "strawberry"));

LinkedList termsWithPrefix = new LinkedList();
IndexReader ir = IndexReader.open(indexStore);

// this TermEnum gives "picadelly", "pie" and "pizza".
TermEnum te = ir.terms(new Term("body", "pi*"));
do {
termsWithPrefix.add(te.term());
} while (te.next());
query1.add((Term[])termsWithPrefix.toArray(new Term[0]));
query2.add((Term[])termsWithPrefix.toArray(new Term[0]));

Hits result;
result = searcher.search(query1);
assertEquals(2, result.length());

result = searcher.search(query2);
assertEquals(0, result.length());
}
}

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>