Mailing List Archive

cvs commit: jakarta-lucene/src/test/org/apache/lucene/search TestDocBoost.java
cutting 2002/07/29 12:11:15

Modified: . CHANGES.txt
src/java/org/apache/lucene/document Document.java Field.java
src/java/org/apache/lucene/index DocumentWriter.java
IndexReader.java
src/java/org/apache/lucene/search PhrasePrefixQuery.java
PhraseScorer.java Similarity.java TermScorer.java
Added: src/test/org/apache/lucene/search TestDocBoost.java
Log:
msg.txt

Revision Changes Path
1.28 +12 -1 jakarta-lucene/CHANGES.txt

Index: CHANGES.txt
===================================================================
RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
retrieving revision 1.27
retrieving revision 1.28
diff -u -r1.27 -r1.28
--- CHANGES.txt 26 Jul 2002 17:32:54 -0000 1.27
+++ CHANGES.txt 29 Jul 2002 19:11:14 -0000 1.28
@@ -48,6 +48,17 @@
stems from nouns and verbs derived from the same word.
(gschwarz)

+ 12. Added support for boosting the score of documents and fields via
+ the new methods Document.setBoost(float) and Field.setBoost(float).
+
+ Note: This changes the encoding of an indexed value. Indexes
+ should be re-created from scratch in order for search scores to
+ be correct. With the new code and an old index, searches will
+ yield very large scores for shorter fields, and very small scores
+ for longer fields. Once the index is re-created, scores will be
+ as before. (cutting)
+
+
1.2 RC6

1. Changed QueryParser.jj to have "?" be a special character which



1.3 +32 -0 jakarta-lucene/src/java/org/apache/lucene/document/Document.java

Index: Document.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Document.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Document.java 17 Jul 2002 21:54:38 -0000 1.2
+++ Document.java 29 Jul 2002 19:11:14 -0000 1.3
@@ -55,6 +55,8 @@
*/

import java.util.Enumeration;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Hits;

/** Documents are the unit of indexing and search.
*
@@ -66,9 +68,39 @@

public final class Document implements java.io.Serializable {
DocumentFieldList fieldList = null;
+ private float boost = 1.0f;

/** Constructs a new document with no fields. */
public Document() {}
+
+
+ /** Sets a boost factor for hits on any field of this document. This value
+ * will be multiplied into the score of all hits on this document.
+ *
+ * <p>Values are multiplied into the value of {@link Field#getBoost()} of
+ * each field in this document. Thus, this method in effect sets a default
+ * boost for the fields of this document.
+ *
+ * @see Field#setBoost(float)
+ */
+ public void setBoost(float boost) {
+ this.boost = boost;
+ }
+
+ /** Returns the boost factor for hits on any field of this document.
+ *
+ * <p>The default value is 1.0.
+ *
+ * <p>Note: This value is not stored directly with the document in the index.
+ * Documents returned from {@link IndexReader#document(int)} and {@link
+ * Hits#doc(int)} may thus not have the same value present as when this
+ * document was indexed.
+ *
+ * @see #setBoost(float)
+ */
+ public float getBoost() {
+ return boost;
+ }

/** Adds a field to a document. Several fields may be added with
* the same name. In this case, if the fields are indexed, their text is



1.7 +40 -0 jakarta-lucene/src/java/org/apache/lucene/document/Field.java

Index: Field.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- Field.java 17 Jul 2002 21:54:38 -0000 1.6
+++ Field.java 29 Jul 2002 19:11:14 -0000 1.7
@@ -56,6 +56,9 @@

import java.io.Reader;
import java.util.Date;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.Hits;

/**
A field is a section of a Document. Each field has two parts, a name and a
@@ -72,6 +75,43 @@
private boolean isStored = false;
private boolean isIndexed = true;
private boolean isTokenized = true;
+
+ private float boost = 1.0f;
+
+ /** Sets the boost factor for hits on this field. This value will be
+ * multiplied into the score of all hits on this field of this
+ * document.
+ *
+ * <p>The boost is multiplied by {@link Document#getBoost()} of the document
+ * containing this field. If a document has multiple fields with the same
+ * name, all such values are multiplied together. This product is then
+ * multiplied by the value {@link Similarity#normalizeLength(int)}, and
+ * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the
+ * index. One should attempt to ensure that this product does not overflow
+ * the range of that encoding.
+ *
+ * @see Document#setBoost(float)
+ * @see Similarity#normalizeLength(int)
+ * @see Similarity#encodeNorm(float)
+ */
+ public void setBoost(float boost) {
+ this.boost = boost;
+ }
+
+ /** Returns the boost factor for hits on this field.
+ *
+ * <p>The default value is 1.0.
+ *
+ * <p>Note: this value is not stored directly with the document in the index.
+ * Documents returned from {@link IndexReader#document(int)} and {@link
+ * Hits#doc(int)} may thus not have the same value present as when this field
+ * was indexed.
+ *
+ * @see #setBoost(float)
+ */
+ public float getBoost() {
+ return boost;
+ }

/** Constructs a String-valued Field that is not tokenized, but is indexed
and stored. Useful for non-text fields, e.g. date or url. */



1.2 +13 -4 jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java

Index: DocumentWriter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- DocumentWriter.java 18 Sep 2001 16:29:52 -0000 1.1
+++ DocumentWriter.java 29 Jul 2002 19:11:15 -0000 1.2
@@ -59,6 +59,7 @@
import java.io.StringReader;
import java.util.Hashtable;
import java.util.Enumeration;
+import java.util.Arrays;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -100,6 +101,10 @@
// invert doc into postingTable
postingTable.clear(); // clear postingTable
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
+
+ fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
+ Arrays.fill(fieldBoosts, doc.getBoost());
+
invertDocument(doc);

// sort postingTable into an array
@@ -130,6 +135,7 @@
// Used to buffer a document before it is written to the index.
private final Hashtable postingTable = new Hashtable();
private int[] fieldLengths;
+ private float[] fieldBoosts;

// Tokenizes the fields of a document into Postings.
private final void invertDocument(Document doc)
@@ -168,6 +174,7 @@
}

fieldLengths[fieldNumber] = position; // save field length
+ fieldBoosts[fieldNumber] *= field.getBoost();
}
}
}
@@ -310,12 +317,14 @@
while (fields.hasMoreElements()) {
Field field = (Field)fields.nextElement();
if (field.isIndexed()) {
- int fieldNumber = fieldInfos.fieldNumber(field.name());
- OutputStream norm = directory.createFile(segment + ".f" + fieldNumber);
+ int n = fieldInfos.fieldNumber(field.name());
+ float norm =
+ fieldBoosts[n] * Similarity.normalizeLength(fieldLengths[n]);
+ OutputStream norms = directory.createFile(segment + ".f" + n);
try {
- norm.writeByte(Similarity.norm(fieldLengths[fieldNumber]));
+ norms.writeByte(Similarity.encodeNorm(norm));
} finally {
- norm.close();
+ norms.close();
}
}
}



1.10 +5 -3 jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java

Index: IndexReader.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- IndexReader.java 15 Feb 2002 18:59:42 -0000 1.9
+++ IndexReader.java 29 Jul 2002 19:11:15 -0000 1.10
@@ -60,6 +60,7 @@
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Lock;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;

/** IndexReader is an abstract class, providing an interface for accessing an
index. Search of an index is done entirely through this abstract interface,
@@ -177,9 +178,10 @@
abstract public boolean isDeleted(int n);

/** Returns the byte-encoded normalization factor for the named field of
- every document. This is used by the search code to score documents.
- @see org.apache.lucene.search.Similarity#norm
- */
+ * every document. This is used by the search code to score documents.
+ *
+ * @see Field#setBoost(float)
+ */
abstract public byte[] norms(String field) throws IOException;

/** Returns an enumeration of all the terms in the index.



1.2 +1 -1 jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java

Index: PhrasePrefixQuery.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- PhrasePrefixQuery.java 18 Jul 2002 14:39:58 -0000 1.1
+++ PhrasePrefixQuery.java 29 Jul 2002 19:11:15 -0000 1.2
@@ -66,7 +66,7 @@

/**
* PhrasePrefixQuery is a generalized version of PhraseQuery, with an added
- * method {@link add(Term[])}.
+ * method {@link #add(Term[])}.
* To use this class, to search for the phrase "Microsoft app*" first use
* add(Term) on the term "Microsoft", then find all terms that has "app" as
* prefix using IndexReader.terms(Term), and use PhrasePrefixQuery.add(Term[]



1.2 +1 -1 jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java

Index: PhraseScorer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- PhraseScorer.java 18 Sep 2001 16:29:57 -0000 1.1
+++ PhraseScorer.java 29 Jul 2002 19:11:15 -0000 1.2
@@ -93,7 +93,7 @@

if (freq > 0.0) {
float score = Similarity.tf(freq)*weight; // compute score
- score *= Similarity.norm(norms[first.doc]); // normalize
+ score *= Similarity.decodeNorm(norms[first.doc]); // normalize
results.collect(first.doc, score); // add to results
}
last.next(); // resume scanning



1.2 +69 -20 jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java

Index: Similarity.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- Similarity.java 18 Sep 2001 16:29:58 -0000 1.1
+++ Similarity.java 29 Jul 2002 19:11:15 -0000 1.2
@@ -56,6 +56,7 @@

import java.io.IOException;
import org.apache.lucene.index.Term;
+import org.apache.lucene.document.Field;

/** Internal class used for scoring.
* <p>Public only so that the indexing code can compute and store the
@@ -63,32 +64,80 @@
public final class Similarity {
private Similarity() {} // no public constructor

- /** Computes the normalization byte for a document given the total number of
- * terms contained in the document. These values are stored in an index and
- * used by the search code. */
- public static final byte norm(int numTerms) {
- // Scales 1/sqrt(numTerms) into a byte, i.e. 256/sqrt(numTerms).
- // Math.ceil is used to ensure that even very long documents don't get a
- // zero norm byte, as that is reserved for zero-lengthed documents and
- // deleted documents.
- return (byte) Math.ceil(255.0 / Math.sqrt(numTerms));
+ static final float[] NORM_TABLE = new float[256];
+
+ static {
+ for (int i = 0; i < 256; i++)
+ NORM_TABLE[i] = byteToFloat((byte)i);
}

+ /** Computes the normalization value for a document given the total number of
+ * terms contained in a field. These values are stored in an index and used
+ * by the search code.
+ *
+ * <p>The formula used is: <code>1.0f / Math.sqrt(numTerms)</code>
+ *
+ * @see Field#setBoost(float)
+ */
+ public static float normalizeLength(int numTerms) {
+ return (float)(1.0 / Math.sqrt(numTerms));
+ }
+
+ /** Decodes a normalization factor stored in an index.
+ * @see #encodeNorm(float)
+ */
+ public static float decodeNorm(byte b) {
+ return NORM_TABLE[b & 0xFF];
+ }

- private static final float[] makeNormTable() {
- float[] result = new float[256];
- for (int i = 0; i < 256; i++)
- result[i] = i / 255.0F;
- return result;
+ /** Encodes a normalization factor for storage in an index.
+ *
+ * <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
+ * representing values from around 7x10^9 to 2x10^-9 with about one
+ * significant decimal digit of accuracy. Zero is also represented.
+ * Negative numbers are rounded up to zero. Values too large to represent
+ * are rounded down to the largest representable value. Positive values too
+ * small to represent are rounded up to the smallest positive representable
+ * value.
+ *
+ * @see Field#setBoost(float)
+ */
+ public static byte encodeNorm(float f) {
+ return floatToByte(f);
}

- static final float[] NORM_TABLE = makeNormTable();
-
- static final float norm(byte normByte) {
- // Un-scales from the byte encoding of a norm into a float, i.e.,
- // approximately 1/sqrt(numTerms).
- return NORM_TABLE[normByte & 0xFF];
+ private static float byteToFloat(byte b) {
+ if (b == 0) // zero is a special case
+ return 0.0f;
+ int mantissa = b & 7;
+ int exponent = (b >> 3) & 31;
+ int bits = ((exponent+(63-15)) << 24) | (mantissa << 21);
+ return Float.intBitsToFloat(bits);
}
+
+ private static byte floatToByte(float f) {
+ if (f < 0.0f) // round negatives up to zero
+ f = 0.0f;
+
+ if (f == 0.0f) // zero is a special case
+ return 0;
+
+ int bits = Float.floatToIntBits(f); // parse float into parts
+ int mantissa = (bits & 0xffffff) >> 21;
+ int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
+
+ if (exponent > 31) { // overflow: use max value
+ exponent = 31;
+ mantissa = 7;
+ }
+
+ if (exponent < 1) { // underflow: use min value
+ exponent = 1;
+ mantissa = 0;
+ }
+
+ return (byte)((exponent << 3) | mantissa); // pack into a byte
+ }

static final float tf(int freq) {
return (float)Math.sqrt(freq);



1.2 +1 -1 jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java

Index: TermScorer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- TermScorer.java 18 Sep 2001 16:29:58 -0000 1.1
+++ TermScorer.java 29 Jul 2002 19:11:15 -0000 1.2
@@ -98,7 +98,7 @@
? scoreCache[f] // cache hit
: Similarity.tf(f)*weight; // cache miss

- score *= Similarity.norm(norms[d]); // normalize for field
+ score *= Similarity.decodeNorm(norms[d]); // normalize for field

c.collect(d, score); // collect score




1.1 jakarta-lucene/src/test/org/apache/lucene/search/TestDocBoost.java

Index: TestDocBoost.java
===================================================================
package org.apache.lucene.search;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import junit.framework.TestCase;

/** Document boost unit test.
 *
 * <p>Indexes four single-term documents whose effective boosts (document
 * boost multiplied by field boost) are 1, 2, 3 and 4, then checks that the
 * scores collected for a term query strictly increase in that order.
 *
 * @author Doug Cutting
 * @version $Revision: 1.1 $
 */
public class TestDocBoost extends TestCase {
  public TestDocBoost(String name) {
    super(name);
  }

  /** Verifies that document and field boosts are multiplied into hit scores.
   *
   * @throws Exception if building or searching the in-memory index fails
   */
  public static void test() throws Exception {
    RAMDirectory store = new RAMDirectory();
    IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);

    // Two otherwise identical fields; only f2 carries a field-level boost.
    Field f1 = Field.Text("field", "word");
    Field f2 = Field.Text("field", "word");
    f2.setBoost(2.0f);

    Document d1 = new Document();
    Document d2 = new Document();
    Document d3 = new Document();
    Document d4 = new Document();
    d3.setBoost(3.0f);
    d4.setBoost(2.0f);

    // Effective boost = document boost * field boost.
    d1.add(f1);                                   // boost = 1 * 1 = 1
    d2.add(f2);                                   // boost = 1 * 2 = 2
    d3.add(f1);                                   // boost = 3 * 1 = 3
    d4.add(f2);                                   // boost = 2 * 2 = 4

    writer.addDocument(d1);
    writer.addDocument(d2);
    writer.addDocument(d3);
    writer.addDocument(d4);
    writer.optimize();
    writer.close();

    // Indexed by the document number passed to collect(); the ordering check
    // below relies on documents being numbered 0..3 in the order added.
    final float[] scores = new float[4];

    IndexSearcher searcher = new IndexSearcher(store);
    try {
      searcher.search
        (new TermQuery(new Term("field", "word")),
         new HitCollector() {
           public final void collect(int doc, float score) {
             scores[doc] = score;
           }
         });
    } finally {
      searcher.close();                           // release the index reader
    }

    float lastScore = 0.0f;

    // Starting from 0 also asserts that every document was actually hit.
    for (int i = 0; i < 4; i++) {
      assertTrue("score of doc " + i + " (" + scores[i]
                 + ") should exceed previous score (" + lastScore + ")",
                 scores[i] > lastScore);
      lastScore = scores[i];
    }
  }
}




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>
Re: cvs commit: jakarta-lucene/src/test/org/apache/lucene/search TestDocBoost.java [ In reply to ]
cutting@apache.org wrote:
> Log:
> msg.txt

Oops. That log entry was supposed to read:

Added support for boosting the score of documents and fields via the
new methods Document.setBoost(float) and Field.setBoost(float).

Note: This changes the encoding of an indexed value. Indexes should
be re-created from scratch in order for search scores to be correct.
With the new code and an old index, searches will yield very large
scores for shorter fields, and very small scores for longer fields.
Once the index is re-created, scores will be as before.

Doug


--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>