Mailing List Archive

Keyword search with punctuation?
I am unable to find the expected documents when I execute a query on a
keyword whose value contains punctuation.

Here is the test case:

import org.apache.lucene.queryParser.*;
import org.apache.lucene.search.*;
import org.apache.lucene.index.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;

public class KeywordTest {

public static void main(String args[]) throws Exception {

Analyzer analyzer = new StandardAnalyzer();
IndexWriter writer = new IndexWriter("index", analyzer, true);

Document d3 = new Document();
d3.add(Field.Keyword("id","person/john"));
d3.add(Field.Text("macrinstfulltext","male married"));
writer.addDocument(d3);

Document d4 = new Document();
d4.add(Field.Keyword("id","person/wendy"));
d4.add(Field.Text("macrinstfulltext","female Single"));
writer.addDocument(d4);

Document d5 = new Document();
d5.add(Field.Keyword("id","person123"));
d5.add(Field.Text("macrinstfulltext","female Single"));
writer.addDocument(d5);

writer.close();

IndexReader reader = IndexReader.open("index");
Searcher searcher = new IndexSearcher(reader);

search(searcher, analyzer, "id:person/john");
// this query doesn't find anything
search(searcher, analyzer, "id:person/wendy");
// this query doesn't find anything
search(searcher, analyzer, "id:person123");
// this query works
search(searcher, analyzer, "id:person/john id:person/wendy"); //
this query doesn't find anything
}

private static void search(Searcher searcher, Analyzer analyzer,
String queryString) throws Exception {
Query query = QueryParser.parse(queryString, "macrfulltext",
analyzer);
Hits hits = searcher.search(query);
System.out.println(query.toString("macrfulltext") + ": " +
hits.length() + " hits");
for (int i = 0; i < hits.length(); i++) {
System.out.println(" " + hits.doc(i).get("id"));
}
}
}
Re: Keyword search with punctuation? [ In reply to ]
I guess the StandardAnalyzer is splitting "person/john" into
"person john" before the comparison gets done with "person/john"
as stored in the index. Does seem to make Field.Keyword of limited
use so perhaps I'm missing something.

Possible workarounds might include writing your own analyzer/
tokenizer combination that doesn't split up "person/john" if
the search field is "id", or reformatting the id field so that
it doesn't contain punctuation. Neither seems very nice!




--
Ian.
ian.lea@blackwell.co.uk


Paul Friedman wrote:
>
> I am unable to find the expected documents when I execute a query on a
> keyword whose value contains punctuation.
>
> Here is the test case:
>
> import org.apache.lucene.queryParser.*;
> import org.apache.lucene.search.*;
> import org.apache.lucene.index.*;
> import org.apache.lucene.analysis.*;
> import org.apache.lucene.analysis.standard.*;
> import org.apache.lucene.document.*;
>
> public class KeywordTest {
>
> public static void main(String args[]) throws Exception {
>
> Analyzer analyzer = new StandardAnalyzer();
> IndexWriter writer = new IndexWriter("index", analyzer, true);
>
> Document d3 = new Document();
> d3.add(Field.Keyword("id","person/john"));
> d3.add(Field.Text("macrinstfulltext","male married"));
> writer.addDocument(d3);
>
> Document d4 = new Document();
> d4.add(Field.Keyword("id","person/wendy"));
> d4.add(Field.Text("macrinstfulltext","female Single"));
> writer.addDocument(d4);
>
> Document d5 = new Document();
> d5.add(Field.Keyword("id","person123"));
> d5.add(Field.Text("macrinstfulltext","female Single"));
> writer.addDocument(d5);
>
> writer.close();
>
> IndexReader reader = IndexReader.open("index");
> Searcher searcher = new IndexSearcher(reader);
>
> search(searcher, analyzer, "id:person/john");
> // this query doesn't find anything
> search(searcher, analyzer, "id:person/wendy");
> // this query doesn't find anything
> search(searcher, analyzer, "id:person123");
> // this query works
> search(searcher, analyzer, "id:person/john id:person/wendy"); //
> this query doesn't find anything
> }
>
> private static void search(Searcher searcher, Analyzer analyzer,
> String queryString) throws Exception {
> Query query = QueryParser.parse(queryString, "macrfulltext",
> analyzer);
> Hits hits = searcher.search(query);
> System.out.println(query.toString("macrfulltext") + ": " +
> hits.length() + " hits");
> for (int i = 0; i < hits.length(); i++) {
> System.out.println(" " + hits.doc(i).get("id"));
> }
> }
> }
>
>

--
To unsubscribe, e-mail: <mailto:lucene-user-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-user-help@jakarta.apache.org>
RE: Keyword search with punctuation? [ In reply to ]
This really seems like a bug to me. If I do the indexing using the StandardAnalyzer and do the search using the same analyzer, then shouldn't I be able to find the document?

Thanks.
Paul

-----Original Message-----
From: Ian Lea [mailto:ian.lea@blackwell.co.uk]
Sent: Thursday, November 08, 2001 5:50 AM
To: Lucene Users List; Paul Friedman
Subject: Re: Keyword search with punctuation?


I guess the StandardAnalyzer is splitting "person/john" into
"person john" before the comparison gets done with "person/john"
as stored in the index. Does seem to make Field.Keyword of limited
use so perhaps I'm missing something.

Possible workarounds might include writing your own analyzer/
tokenizer combination that doesn't split up "person/john" if
the search field is "id", or reformatting the id field so that
it doesn't contain punctuation. Neither seems very nice!




--
Ian.
ian.lea@blackwell.co.uk


Paul Friedman wrote:
>
> I am unable to find the expected documents when I execute a query on a
> keyword whose value contains punctuation.
>
> Here is the test case:
>
> import org.apache.lucene.queryParser.*;
> import org.apache.lucene.search.*;
> import org.apache.lucene.index.*;
> import org.apache.lucene.analysis.*;
> import org.apache.lucene.analysis.standard.*;
> import org.apache.lucene.document.*;
>
> public class KeywordTest {
>
> public static void main(String args[]) throws Exception {
>
> Analyzer analyzer = new StandardAnalyzer();
> IndexWriter writer = new IndexWriter("index", analyzer, true);
>
> Document d3 = new Document();
> d3.add(Field.Keyword("id","person/john"));
> d3.add(Field.Text("macrinstfulltext","male married"));
> writer.addDocument(d3);
>
> Document d4 = new Document();
> d4.add(Field.Keyword("id","person/wendy"));
> d4.add(Field.Text("macrinstfulltext","female Single"));
> writer.addDocument(d4);
>
> Document d5 = new Document();
> d5.add(Field.Keyword("id","person123"));
> d5.add(Field.Text("macrinstfulltext","female Single"));
> writer.addDocument(d5);
>
> writer.close();
>
> IndexReader reader = IndexReader.open("index");
> Searcher searcher = new IndexSearcher(reader);
>
> search(searcher, analyzer, "id:person/john");
> // this query doesn't find anything
> search(searcher, analyzer, "id:person/wendy");
> // this query doesn't find anything
> search(searcher, analyzer, "id:person123");
> // this query works
> search(searcher, analyzer, "id:person/john id:person/wendy"); //
> this query doesn't find anything
> }
>
> private static void search(Searcher searcher, Analyzer analyzer,
> String queryString) throws Exception {
> Query query = QueryParser.parse(queryString, "macrfulltext",
> analyzer);
> Hits hits = searcher.search(query);
> System.out.println(query.toString("macrfulltext") + ": " +
> hits.length() + " hits");
> for (int i = 0; i < hits.length(); i++) {
> System.out.println(" " + hits.doc(i).get("id"));
> }
> }
> }
>
>

--
To unsubscribe, e-mail: <mailto:lucene-user-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-user-help@jakarta.apache.org>