Mailing List Archive: cvs commit: jakarta-lucene/src/test/org/apache/lucene/search TestDocBoost.java

cutting 2002/07/29 12:11:15

Modified: . CHANGES.txt
src/java/org/apache/lucene/document Document.java Field.java
src/java/org/apache/lucene/index DocumentWriter.java
IndexReader.java
src/java/org/apache/lucene/search PhrasePrefixQuery.java
PhraseScorer.java Similarity.java TermScorer.java
Added: src/test/org/apache/lucene/search TestDocBoost.java
Log:
msg.txt

Revision Changes Path
1.28 +12 -1 jakarta-lucene/CHANGES.txt

Index: CHANGES.txt
===================================================================
RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
retrieving revision 1.27
retrieving revision 1.28
diff -u -r1.27 -r1.28
--- CHANGES.txt 26 Jul 2002 17:32:54 -0000 1.27
+++ CHANGES.txt 29 Jul 2002 19:11:14 -0000 1.28
@@ -48,6 +48,17 @@
stems from nouns and verbs derived from the same word.
(gschwarz)

+ 12. Added support for boosting the score of documents and fields via
+ the new methods Document.setBoost(float) and Field.setBoost(float).
+
+ Note: This changes the encoding of an indexed value. Indexes
+ should be re-created from scratch in order for search scores to
+ be correct. With the new code and an old index, searches will
+ yield very large scores for shorter fields, and very small scores
+ for longer fields. Once the index is re-created, scores will be
+ as before. (cutting)
+
+
1.2 RC6

1. Changed QueryParser.jj to have "?" be a special character which

1.3 +32 -0 jakarta-lucene/src/java/org/apache/lucene/document/Document.java

Index: Document.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Document.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Document.java 17 Jul 2002 21:54:38 -0000 1.2
+++ Document.java 29 Jul 2002 19:11:14 -0000 1.3
@@ -55,6 +55,8 @@
*/

import java.util.Enumeration;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Hits;

/** Documents are the unit of indexing and search.
*
@@ -66,9 +68,39 @@

public final class Document implements java.io.Serializable {
DocumentFieldList fieldList = null;
+ private float boost = 1.0f;

/** Constructs a new document with no fields. */
public Document() {}
+
+
+ /** Sets a boost factor for hits on any field of this document. This value
+ * will be multiplied into the score of all hits on this document.
+ *
+ * Values are multiplied into the value of {@link Field#getBoost()} of
+ * each field in this document. Thus, this method in effect sets a default
+ * boost for the fields of this document.
+ *
+ * @see Field#setBoost(float)
+ */
+ public void setBoost(float boost) {
+ this.boost = boost;
+ }
+
+ /** Returns the boost factor for hits on any field of this document.
+ *
+ * The default value is 1.0.
+ *
+ * Note: This value is not stored directly with the document in the index.
+ * Documents returned from {@link IndexReader#document(int)} and {@link
+ * Hits#doc(int)} may thus not have the same value present as when this
+ * document was indexed.
+ *
+ * @see #setBoost(float)
+ */
+ public float getBoost() {
+ return boost;
+ }

/** Adds a field to a document. Several fields may be added with
* the same name. In this case, if the fields are indexed, their text is

1.7 +40 -0 jakarta-lucene/src/java/org/apache/lucene/document/Field.java

Index: Field.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- Field.java 17 Jul 2002 21:54:38 -0000 1.6
+++ Field.java 29 Jul 2002 19:11:14 -0000 1.7
@@ -56,6 +56,9 @@

import java.io.Reader;
import java.util.Date;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.Hits;

/**
A field is a section of a Document. Each field has two parts, a name and a
@@ -72,6 +75,43 @@
private boolean isStored = false;
private boolean isIndexed = true;
private boolean isTokenized = true;
+
+ private float boost = 1.0f;
+
+ /** Sets the boost factor for hits on this field. This value will be
+ * multiplied into the score of all hits on this field of this
+ * document.
+ *
+ * The boost is multiplied by {@link Document#getBoost()} of the document
+ * containing this field. If a document has multiple fields with the same
+ * name, all such values are multiplied together. This product is then
+ * multiplied by the value {@link Similarity#normalizeLength(int)}, and
+ * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the
+ * index. One should attempt to ensure that this product does not overflow
+ * the range of that encoding.
+ *
+ * @see Document#setBoost(float)
+ * @see Similarity#normalizeLength(int)
+ * @see Similarity#encodeNorm(float)
+ */
+ public void setBoost(float boost) {
+ this.boost = boost;
+ }
+
+ /** Returns the boost factor for hits on this field.
+ *
+ * The default value is 1.0.
+ *
+ * Note: this value is not stored directly with the document in the index.
+ * Documents returned from {@link IndexReader#document(int)} and {@link
+ * Hits#doc(int)} may thus not have the same value present as when this field
+ * was indexed.
+ *
+ * @see #setBoost(float)
+ */
+ public float getBoost() {
+ return boost;
+ }

/** Constructs a String-valued Field that is not tokenized, but is indexed
and stored. Useful for non-text fields, e.g. date or url. */

1.2 +13 -4 jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java

Index: DocumentWriter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- DocumentWriter.java 18 Sep 2001 16:29:52 -0000 1.1
+++ DocumentWriter.java 29 Jul 2002 19:11:15 -0000 1.2
@@ -59,6 +59,7 @@
import java.io.StringReader;
import java.util.Hashtable;
import java.util.Enumeration;
+import java.util.Arrays;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -100,6 +101,10 @@
// invert doc into postingTable
postingTable.clear(); // clear postingTable
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
+
+ fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
+ Arrays.fill(fieldBoosts, doc.getBoost());
+
invertDocument(doc);

// sort postingTable into an array
@@ -130,6 +135,7 @@
// Used to buffer a document before it is written to the index.
private final Hashtable postingTable = new Hashtable();
private int[] fieldLengths;
+ private float[] fieldBoosts;

// Tokenizes the fields of a document into Postings.
private final void invertDocument(Document doc)
@@ -168,6 +174,7 @@
}

fieldLengths[fieldNumber] = position; // save field length
+ fieldBoosts[fieldNumber] *= field.getBoost();
}
}
}
@@ -310,12 +317,14 @@
while (fields.hasMoreElements()) {
Field field = (Field)fields.nextElement();
if (field.isIndexed()) {
- int fieldNumber = fieldInfos.fieldNumber(field.name());
- OutputStream norm = directory.createFile(segment + ".f" + fieldNumber);
+ int n = fieldInfos.fieldNumber(field.name());
+ float norm =
+ fieldBoosts[n] * Similarity.normalizeLength(fieldLengths[n]);
+ OutputStream norms = directory.createFile(segment + ".f" + n);
try {
- norm.writeByte(Similarity.norm(fieldLengths[fieldNumber]));
+ norms.writeByte(Similarity.encodeNorm(norm));
} finally {
- norm.close();
+ norms.close();
}
}
}

1.10 +5 -3 jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java

Index: IndexReader.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- IndexReader.java 15 Feb 2002 18:59:42 -0000 1.9
+++ IndexReader.java 29 Jul 2002 19:11:15 -0000 1.10
@@ -60,6 +60,7 @@
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Lock;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;

/** IndexReader is an abstract class, providing an interface for accessing an
index. Search of an index is done entirely through this abstract interface,
@@ -177,9 +178,10 @@
abstract public boolean isDeleted(int n);

/** Returns the byte-encoded normalization factor for the named field of
- every document. This is used by the search code to score documents.
- @see org.apache.lucene.search.Similarity#norm
- */
+ * every document. This is used by the search code to score documents.
+ *
+ * @see Field#setBoost(float)
+ */
abstract public byte[] norms(String field) throws IOException;

/** Returns an enumeration of all the terms in the index.

1.2 +1 -1 jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java

Index: PhrasePrefixQuery.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- PhrasePrefixQuery.java 18 Jul 2002 14:39:58 -0000 1.1
+++ PhrasePrefixQuery.java 29 Jul 2002 19:11:15 -0000 1.2
@@ -66,7 +66,7 @@

/**
* PhrasePrefixQuery is a generalized version of PhraseQuery, with an added
- * method {@link add(Term[])}.
+ * method {@link #add(Term[])}.
* To use this class, to search for the phrase "Microsoft app*" first use
* add(Term) on the term "Microsoft", then find all terms that has "app" as
* prefix using IndexReader.terms(Term), and use PhrasePrefixQuery.add(Term[]

1.2 +1 -1 jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java

Index: PhraseScorer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- PhraseScorer.java 18 Sep 2001 16:29:57 -0000 1.1
+++ PhraseScorer.java 29 Jul 2002 19:11:15 -0000 1.2
@@ -93,7 +93,7 @@

if (freq > 0.0) {
float score = Similarity.tf(freq)*weight; // compute score
- score *= Similarity.norm(norms[first.doc]); // normalize
+ score *= Similarity.decodeNorm(norms[first.doc]); // normalize
results.collect(first.doc, score); // add to results
}
last.next(); // resume scanning

1.2 +69 -20 jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java

Index: Similarity.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- Similarity.java 18 Sep 2001 16:29:58 -0000 1.1
+++ Similarity.java 29 Jul 2002 19:11:15 -0000 1.2
@@ -56,6 +56,7 @@

import java.io.IOException;
import org.apache.lucene.index.Term;
+import org.apache.lucene.document.Field;

/** Internal class used for scoring.
* Public only so that the indexing code can compute and store the
@@ -63,32 +64,80 @@
public final class Similarity {
private Similarity() {} // no public constructor

- /** Computes the normalization byte for a document given the total number of
- * terms contained in the document. These values are stored in an index and
- * used by the search code. */
- public static final byte norm(int numTerms) {
- // Scales 1/sqrt(numTerms) into a byte, i.e. 256/sqrt(numTerms).
- // Math.ceil is used to ensure that even very long documents don't get a
- // zero norm byte, as that is reserved for zero-lengthed documents and
- // deleted documents.
- return (byte) Math.ceil(255.0 / Math.sqrt(numTerms));
+ static final float[] NORM_TABLE = new float[256];
+
+ static {
+ for (int i = 0; i < 256; i++)
+ NORM_TABLE[i] = byteToFloat((byte)i);
}

+ /** Computes the normalization value for a document given the total number of
+ * terms contained in a field. These values are stored in an index and used
+ * by the search code.
+ *
+ * The formula used is: <code>1.0f / Math.sqrt(numTerms)</code>
+ *
+ * @see Field#setBoost(float)
+ */
+ public static float normalizeLength(int numTerms) {
+ return (float)(1.0 / Math.sqrt(numTerms));
+ }
+
+ /** Decodes a normalization factor stored in an index.
+ * @see #encodeNorm(float)
+ */
+ public static float decodeNorm(byte b) {
+ return NORM_TABLE[b & 0xFF];
+ }

- private static final float[] makeNormTable() {
- float[] result = new float[256];
- for (int i = 0; i < 256; i++)
- result[i] = i / 255.0F;
- return result;
+ /** Encodes a normalization factor for storage in an index.
+ *
+ * The encoding uses a five-bit exponent and three-bit mantissa, thus
+ * representing values from around 7x10^9 to 2x10^-9 with about one
+ * significant decimal digit of accuracy. Zero is also represented.
+ * Negative numbers are rounded up to zero. Values too large to represent
+ * are rounded down to the largest representable value. Positive values too
+ * small to represent are rounded up to the smallest positive representable
+ * value.
+ *
+ * @see Field#setBoost(float)
+ */
+ public static byte encodeNorm(float f) {
+ return floatToByte(f);
}

- static final float[] NORM_TABLE = makeNormTable();
-
- static final float norm(byte normByte) {
- // Un-scales from the byte encoding of a norm into a float, i.e.,
- // approximately 1/sqrt(numTerms).
- return NORM_TABLE[normByte & 0xFF];
+ private static float byteToFloat(byte b) {
+ if (b == 0) // zero is a special case
+ return 0.0f;
+ int mantissa = b & 7;
+ int exponent = (b >> 3) & 31;
+ int bits = ((exponent+(63-15)) << 24) | (mantissa << 21);
+ return Float.intBitsToFloat(bits);
}
+
+ private static byte floatToByte(float f) {
+ if (f < 0.0f) // round negatives up to zero
+ f = 0.0f;
+
+ if (f == 0.0f) // zero is a special case
+ return 0;
+
+ int bits = Float.floatToIntBits(f); // parse float into parts
+ int mantissa = (bits & 0xffffff) >> 21;
+ int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
+
+ if (exponent > 31) { // overflow: use max value
+ exponent = 31;
+ mantissa = 7;
+ }
+
+ if (exponent < 1) { // underflow: use min value
+ exponent = 1;
+ mantissa = 0;
+ }
+
+ return (byte)((exponent << 3) | mantissa); // pack into a byte
+ }

static final float tf(int freq) {
return (float)Math.sqrt(freq);

1.2 +1 -1 jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java

Index: TermScorer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- TermScorer.java 18 Sep 2001 16:29:58 -0000 1.1
+++ TermScorer.java 29 Jul 2002 19:11:15 -0000 1.2
@@ -98,7 +98,7 @@
? scoreCache[f] // cache hit
: Similarity.tf(f)*weight; // cache miss

- score *= Similarity.norm(norms[d]); // normalize for field
+ score *= Similarity.decodeNorm(norms[d]); // normalize for field

c.collect(d, score); // collect score

1.1 jakarta-lucene/src/test/org/apache/lucene/search/TestDocBoost.java

Index: TestDocBoost.java
===================================================================
package org.apache.lucene.search;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import junit.framework.TestCase;

/** Document boost unit test.
 *
 * Indexes four single-word documents whose combined boosts (document boost
 * multiplied by field boost) are 1, 2, 3 and 4, then checks that a term query
 * for that word scores them in strictly increasing order.
 *
 * @author Doug Cutting
 * @version $Revision: 1.1 $
 */
public class TestDocBoost extends TestCase {
  public TestDocBoost(String name) {
    super(name);
  }

  /** Verifies that a larger combined boost yields a strictly larger score.
   *
   * @throws Exception if indexing or searching fails
   */
  public static void test() throws Exception {
    RAMDirectory store = new RAMDirectory();
    IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
    try {
      Field f1 = Field.Text("field", "word");
      Field f2 = Field.Text("field", "word");
      f2.setBoost(2.0f);

      Document d1 = new Document();
      Document d2 = new Document();
      Document d3 = new Document();
      Document d4 = new Document();
      d3.setBoost(3.0f);
      d4.setBoost(2.0f);

      d1.add(f1);                                 // boost = 1 * 1 = 1
      d2.add(f2);                                 // boost = 1 * 2 = 2
      d3.add(f1);                                 // boost = 3 * 1 = 3
      d4.add(f2);                                 // boost = 2 * 2 = 4

      writer.addDocument(d1);
      writer.addDocument(d2);
      writer.addDocument(d3);
      writer.addDocument(d4);
      writer.optimize();
    } finally {
      writer.close();                             // close even if indexing fails
    }

    // Indexed by document number; assumes the four documents are numbered
    // 0..3 in the order they were added.
    final float[] scores = new float[4];

    IndexSearcher searcher = new IndexSearcher(store);
    try {
      searcher.search
        (new TermQuery(new Term("field", "word")),
         new HitCollector() {
           public final void collect(int doc, float score) {
             scores[doc] = score;
           }
         });
    } finally {
      searcher.close();                           // release the underlying reader
    }

    float lastScore = 0.0f;

    for (int i = 0; i < 4; i++) {
      assertTrue("score of document " + i + " (" + scores[i]
                 + ") should exceed previous score (" + lastScore + ")",
                 scores[i] > lastScore);
      lastScore = scores[i];
    }
  }
}

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>