Mailing List Archive: cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/standard StandardAnalyzer.java

otis 02/02/21 14:01:07

Modified: src/java/org/apache/lucene/analysis/standard
StandardAnalyzer.java
Log:
- 'De-finalized' the class (removed the final modifier), per Doug's suggestion,
to make it easy to use different lists of stop words.
- Added a few more words to the stop word list (MS' contribution via Alan).
- Re-indented the whole class.

Revision Changes Path
1.2 +41 -28 jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java

Index: StandardAnalyzer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- StandardAnalyzer.java 18 Sep 2001 16:29:51 -0000 1.1
+++ StandardAnalyzer.java 21 Feb 2002 22:01:07 -0000 1.2
@@ -60,36 +60,49 @@

/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
* LowerCaseFilter} and {@link StopFilter}. */
-public final class StandardAnalyzer extends Analyzer {
- private Hashtable stopTable;
+public class StandardAnalyzer extends Analyzer {
+ private Hashtable stopTable;

- /** An array containing some common English words that are not usually useful
- for searching. */
- public static final String[] STOP_WORDS = {
- "a", "and", "are", "as", "at", "be", "but", "by",
- "for", "if", "in", "into", "is", "it",
- "no", "not", "of", "on", "or", "s", "such",
- "t", "that", "the", "their", "then", "there", "these",
- "they", "this", "to", "was", "will", "with"
- };
+ /** An array containing some common English words that are usually not
+ useful for searching. */
+ public static final String[] STOP_WORDS = {
+ "0","1","2","3","4","5","6","7","8","9",
+ "$",
+ "about", "after", "all", "also", "an", "and",
+ "another", "any", "are", "as", "at", "be", "because",
+ "been", "before", "being", "between", "both", "but",
+ "by","came","can","come","could","did","do","does",
+ "each","else","for","from","get","got","has","had",
+ "he","have","her","here","him","himself","his","how",
+ "if","in","into","is","it","its","just","like","make",
+ "many","me","might","more","most","much","must","my",
+ "never","now","of","on","only","or","other","our","out",
+ "over","re","said","same","see","should","since","so",
+ "some","still","such","take","than","that","the","their",
+ "them","then","there","these","they","this","those","through",
+ "to","too","under","up","use","very","want","was","way","we",
+ "well","were","what","when","where","which","while","who","will",
+ "with","would","you","your", "a","b","c","d","e","f","g","h","i",
+ "j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"
+ };

- /** Builds an analyzer. */
- public StandardAnalyzer() {
- this(STOP_WORDS);
- }
+ /** Builds an analyzer. */
+ public StandardAnalyzer() {
+ this(STOP_WORDS);
+ }

- /** Builds an analyzer with the given stop words. */
- public StandardAnalyzer(String[] stopWords) {
- stopTable = StopFilter.makeStopTable(stopWords);
- }
+ /** Builds an analyzer with the given stop words. */
+ public StandardAnalyzer(String[] stopWords) {
+ stopTable = StopFilter.makeStopTable(stopWords);
+ }

- /** Constructs a {@link StandardTokenizer} filtered by a {@link
- * StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer(reader);
- result = new StandardFilter(result);
- result = new LowerCaseFilter(result);
- result = new StopFilter(result, stopTable);
- return result;
- }
+ /** Constructs a {@link StandardTokenizer} filtered by a {@link
+ * StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
+ public final TokenStream tokenStream(String fieldName, Reader reader) {
+ TokenStream result = new StandardTokenizer(reader);
+ result = new StandardFilter(result);
+ result = new LowerCaseFilter(result);
+ result = new StopFilter(result, stopTable);
+ return result;
+ }
}

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>

otis 02/02/21 17:15:51

Modified: src/java/org/apache/lucene/analysis/standard
StandardAnalyzer.java
Log:
- Removed the stop words added in the previous revision, so that they don't break
people's existing indices.

Revision Changes Path
1.3 +6 -19 jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java

Index: StandardAnalyzer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- StandardAnalyzer.java 21 Feb 2002 22:01:07 -0000 1.2
+++ StandardAnalyzer.java 22 Feb 2002 01:15:51 -0000 1.3
@@ -66,24 +66,11 @@
/** An array containing some common English words that are usually not
useful for searching. */
public static final String[] STOP_WORDS = {
- "0","1","2","3","4","5","6","7","8","9",
- "$",
- "about", "after", "all", "also", "an", "and",
- "another", "any", "are", "as", "at", "be", "because",
- "been", "before", "being", "between", "both", "but",
- "by","came","can","come","could","did","do","does",
- "each","else","for","from","get","got","has","had",
- "he","have","her","here","him","himself","his","how",
- "if","in","into","is","it","its","just","like","make",
- "many","me","might","more","most","much","must","my",
- "never","now","of","on","only","or","other","our","out",
- "over","re","said","same","see","should","since","so",
- "some","still","such","take","than","that","the","their",
- "them","then","there","these","they","this","those","through",
- "to","too","under","up","use","very","want","was","way","we",
- "well","were","what","when","where","which","while","who","will",
- "with","would","you","your", "a","b","c","d","e","f","g","h","i",
- "j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"
+ "a", "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it",
+ "no", "not", "of", "on", "or", "s", "such",
+ "t", "that", "the", "their", "then", "there", "these",
+ "they", "this", "to", "was", "will", "with"
};

/** Builds an analyzer. */
@@ -97,7 +84,7 @@
}

/** Constructs a {@link StandardTokenizer} filtered by a {@link
- * StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
+ StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);

--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>