Mailing List Archive

Escaping special character doesn't return result
Hi,

I am using Apache Lucene 8.5.0 version



I have written a simple program to create index of words with special
character.



Example I have indexed the word *temp/hello*



Now I want to search the word with wildcard query *te*/hello*



I get the error* : *Exception in thread "main"
*org.apache.lucene.queryparser.classic.ParseException*: Cannot parse
'te*/hello': Lexical error at line 1, column 10. Encountered: <EOF> after
: "/hello"



When I escape the query with QueryParser.escape method it doesn’t yield any
results when special characters are part of query



*Can someone suggest the right way for indexing and searching words with
special characters?*



Here’s my simple program



*import* java.io.BufferedReader;

*import* java.io.IOException;

*import* java.io.InputStreamReader;

*import* java.nio.file.Paths;



*import* org.apache.lucene.analysis.Analyzer;

*import* org.apache.lucene.analysis.custom.CustomAnalyzer;

*import** org.apache.lucene.analysis.standard.StandardAnalyzer;*

*import* org.apache.lucene.document.Document;

*import* org.apache.lucene.document.Field;

*import* org.apache.lucene.document.TextField;

*import* org.apache.lucene.index.DirectoryReader;

*import* org.apache.lucene.index.IndexReader;

*import* org.apache.lucene.index.IndexWriter;

*import* org.apache.lucene.index.IndexWriterConfig;

*import* org.apache.lucene.index.IndexWriterConfig.OpenMode;

*import* org.apache.lucene.queryparser.classic.ParseException;

*import* org.apache.lucene.queryparser.classic.QueryParser;

*import* org.apache.lucene.search.IndexSearcher;

*import* org.apache.lucene.search.Query;

*import* org.apache.lucene.search.ScoreDoc;

*import* org.apache.lucene.search.TopDocs;

*import* org.apache.lucene.store.Directory;

*import* org.apache.lucene.store.MMapDirectory;



*public* *class* HelloLucene {

*private* *static* Analyzer buildAnalyzer() *throws* IOException {

*return* CustomAnalyzer.*builder*()

.withTokenizer("keyWord")

.addTokenFilter("lowercase")

.build();



}





*public* *static* *void* main(String[] args) *throws* IOException,
ParseException {

Analyzer analyzer = *buildAnalyzer*();

// 1. create the index

Directory index = *new* MMapDirectory(Paths.*get*("c:\\temp\\index"
));



IndexWriterConfig config = *new* IndexWriterConfig(analyzer);



String indexType = "create";

*if* ("create".equals(indexType)) {

config.setOpenMode(OpenMode.*CREATE*);

} *else* {

config.setOpenMode(OpenMode.*CREATE_OR_APPEND*);

}

IndexWriter w = *new* IndexWriter(index, config);





*long* start = System.*currentTimeMillis*();

*addDoc*(w, "Temp/Hello", "Artifact");



*long* end = System.*currentTimeMillis*();

w.close();



*for* (*int* i = 0; i < 100; i++) {



// 2. query

BufferedReader input = *new* BufferedReader(*new*
InputStreamReader(System.*in*));

String query = input.readLine();



//Prefix Search



QueryParser queryParser = *new* QueryParser("Name",analyzer);

queryParser.setAllowLeadingWildcard(*true*);

Query q = queryParser.parse(QueryParser.*escape*(query));





// 3. search



*int* hitsPerPage = 10;

IndexReader reader = DirectoryReader.*open*(index);

IndexSearcher searcher = *new* IndexSearcher(reader);

TopDocs docs = searcher.search(q, hitsPerPage);

ScoreDoc[] hits = docs.scoreDocs;



// 4. display results

*System.**out*.println("Found " + hits.length + " hits.");

*for* (*int* j = 0; j < hits.length; ++j) {

*int* docId = hits[j].doc;

Document d = searcher.doc(docId);

*System.**out*.println((j + 1) + ". " + d.get("Name") + "\t"
+ d.get("Type"));

}



reader.close();

}



}



*private* *static* *void* addDoc(IndexWriter w, String name, String type)
*throws* IOException {

Document doc = *new* Document();

doc.add(*new* TextField("Name", name, Field.Store.*YES*));



// use a string field for *isbn* because we don't want it
*tokenized*

doc.add(*new* TextField("Type", type, Field.Store.*YES*));

w.addDocument(doc);

}

}
Re: Escaping special character doesn't return result [ In reply to ]
I'm not sure what your schema looks like, but it could be as simple as
escaping with *temp\/hello*

On Fri, Apr 3, 2020 at 6:00 AM deep <deepaktaker@gmail.com> wrote:

> Hi,
>
> I am using Apache Lucene 8.5.0 version
>
>
>
> I have written a simple program to create index of words with special
> character.
>
>
>
> Example I have indexed the word *temp/hello*
>
>
>
> Now I want to search the word with wildcard query *te*/hello*
>
>
>
> I get the error* : *Exception in thread "main"
> *org.apache.lucene.queryparser.classic.ParseException*: Cannot parse
> 'te*/hello': Lexical error at line 1, column 10. Encountered: <EOF> after
> : "/hello"
>
>
>
> When I escape the query with QueryParser.escape method it doesn’t yield any
> results when special characters are part of query
>
>
>
> *Can someone suggest the right way for indexing and searching words with
> special characters?*
>
>
>
> Here’s my simple program
>
>
>
> *import* java.io.BufferedReader;
>
> *import* java.io.IOException;
>
> *import* java.io.InputStreamReader;
>
> *import* java.nio.file.Paths;
>
>
>
> *import* org.apache.lucene.analysis.Analyzer;
>
> *import* org.apache.lucene.analysis.custom.CustomAnalyzer;
>
> *import** org.apache.lucene.analysis.standard.StandardAnalyzer;*
>
> *import* org.apache.lucene.document.Document;
>
> *import* org.apache.lucene.document.Field;
>
> *import* org.apache.lucene.document.TextField;
>
> *import* org.apache.lucene.index.DirectoryReader;
>
> *import* org.apache.lucene.index.IndexReader;
>
> *import* org.apache.lucene.index.IndexWriter;
>
> *import* org.apache.lucene.index.IndexWriterConfig;
>
> *import* org.apache.lucene.index.IndexWriterConfig.OpenMode;
>
> *import* org.apache.lucene.queryparser.classic.ParseException;
>
> *import* org.apache.lucene.queryparser.classic.QueryParser;
>
> *import* org.apache.lucene.search.IndexSearcher;
>
> *import* org.apache.lucene.search.Query;
>
> *import* org.apache.lucene.search.ScoreDoc;
>
> *import* org.apache.lucene.search.TopDocs;
>
> *import* org.apache.lucene.store.Directory;
>
> *import* org.apache.lucene.store.MMapDirectory;
>
>
>
> *public* *class* HelloLucene {
>
> *private* *static* Analyzer buildAnalyzer() *throws* IOException {
>
> *return* CustomAnalyzer.*builder*()
>
> .withTokenizer("keyWord")
>
> .addTokenFilter("lowercase")
>
> .build();
>
>
>
> }
>
>
>
>
>
> *public* *static* *void* main(String[] args) *throws* IOException,
> ParseException {
>
> Analyzer analyzer = *buildAnalyzer*();
>
> // 1. create the index
>
> Directory index = *new* MMapDirectory(Paths.*get*("c:\\temp\\index"
> ));
>
>
>
> IndexWriterConfig config = *new* IndexWriterConfig(analyzer);
>
>
>
> String indexType = "create";
>
> *if* ("create".equals(indexType)) {
>
> config.setOpenMode(OpenMode.*CREATE*);
>
> } *else* {
>
> config.setOpenMode(OpenMode.*CREATE_OR_APPEND*);
>
> }
>
> IndexWriter w = *new* IndexWriter(index, config);
>
>
>
>
>
> *long* start = System.*currentTimeMillis*();
>
> *addDoc*(w, "Temp/Hello", "Artifact");
>
>
>
> *long* end = System.*currentTimeMillis*();
>
> w.close();
>
>
>
> *for* (*int* i = 0; i < 100; i++) {
>
>
>
> // 2. query
>
> BufferedReader input = *new* BufferedReader(*new*
> InputStreamReader(System.*in*));
>
> String query = input.readLine();
>
>
>
> //Prefix Search
>
>
>
> QueryParser queryParser = *new* QueryParser("Name",analyzer);
>
> queryParser.setAllowLeadingWildcard(*true*);
>
> Query q = queryParser.parse(QueryParser.*escape*(query));
>
>
>
>
>
> // 3. search
>
>
>
> *int* hitsPerPage = 10;
>
> IndexReader reader = DirectoryReader.*open*(index);
>
> IndexSearcher searcher = *new* IndexSearcher(reader);
>
> TopDocs docs = searcher.search(q, hitsPerPage);
>
> ScoreDoc[] hits = docs.scoreDocs;
>
>
>
> // 4. display results
>
> *System.**out*.println("Found " + hits.length + " hits.");
>
> *for* (*int* j = 0; j < hits.length; ++j) {
>
> *int* docId = hits[j].doc;
>
> Document d = searcher.doc(docId);
>
> *System.**out*.println((j + 1) + ". " + d.get("Name") +
> "\t"
> + d.get("Type"));
>
> }
>
>
>
> reader.close();
>
> }
>
>
>
> }
>
>
>
> *private* *static* *void* addDoc(IndexWriter w, String name, String
> type)
> *throws* IOException {
>
> Document doc = *new* Document();
>
> doc.add(*new* TextField("Name", name, Field.Store.*YES*));
>
>
>
> // use a string field for *isbn* because we don't want it
> *tokenized*
>
> doc.add(*new* TextField("Type", type, Field.Store.*YES*));
>
> w.addDocument(doc);
>
> }
>
> }
>


--
Steve Lacerda
e. steve.lacerda@datastax.com
w. www.datastax.com
Re: Escaping special character doesn't return result [ In reply to ]
1) this sort of question would be best sent to java-user@lucene
2) please don't try to "bold" or otherwise emphasize parts of your email
using "*" characters, especially when your question is about using "*"
characters for wildcard searching -- makes it kind of confusing to
understand what you're asking.

: Example I have indexed the word *temp/hello*

I'm going to assume you mean the string literal "Temp/Hello" (per what i
see in the code you posted) and those "*" characters were just an attempt
to "bold" your input.

: Now I want to search the word with wildcard query *te*/hello*

Since i've already been forced to assume that you're occasionally using
"*" characters to bold things, i'm going to assume that the actual input
you are giving to the query parser is "te*/hello"

: I get the error* : *Exception in thread "main"
: *org.apache.lucene.queryparser.classic.ParseException*: Cannot parse
: 'te*/hello': Lexical error at line 1, column 10. Encountered: <EOF> after
: : "/hello"

this has nothing to do with the "*" character in your query string -- it
has everything to do with the "/" character in your query string, which
indicates to the query parser that you wish to do a regex search...

https://lucene.apache.org/core/8_5_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#Regexp_Searches

...It's a "start delimiter" character, but you never have the
corresponding "end delimiter" character telling it when the regex ends,
which is why you get a parse error

: When I escape the query with QueryParser.escape method it doesn’t yield any
: results when special characters are part of query

that's because QueryParser.escape will escape ALL of the meta-characters
that are significant to the query parser, including both "*" and "/" in
your input, treating them as literals, which means your "*" won't be used
to indicate a wildcard search, it will be treated like any other character
and then either kept or removed by your analyzer -- kept, it looks like, in
this particular case -- and your query will fail to match your original
document because the indexed (lowercased) value does not contain the
literal sequence of characters "te*/hello"

In short: if you want to use the query parser and specify meta-characters
for things like wildcard queries, you have to be responsible for escaping
any meta-characters (like "/") that you want treated as literals for that
particular query.

you can use QueryParser.escape() to help -- but you can't escape the whole
query, just the parts of the query you want treated as string literals...

String q = QueryParser.escape("te") + "*" + QueryParser.escape("/hello");


:
:
: *Can someone suggest the right way for indexing and searching words with
: special characters?*
:
:
:
: Here’s my simple program
:
:
:
: *import* java.io.BufferedReader;
:
: *import* java.io.IOException;
:
: *import* java.io.InputStreamReader;
:
: *import* java.nio.file.Paths;
:
:
:
: *import* org.apache.lucene.analysis.Analyzer;
:
: *import* org.apache.lucene.analysis.custom.CustomAnalyzer;
:
: *import** org.apache.lucene.analysis.standard.StandardAnalyzer;*
:
: *import* org.apache.lucene.document.Document;
:
: *import* org.apache.lucene.document.Field;
:
: *import* org.apache.lucene.document.TextField;
:
: *import* org.apache.lucene.index.DirectoryReader;
:
: *import* org.apache.lucene.index.IndexReader;
:
: *import* org.apache.lucene.index.IndexWriter;
:
: *import* org.apache.lucene.index.IndexWriterConfig;
:
: *import* org.apache.lucene.index.IndexWriterConfig.OpenMode;
:
: *import* org.apache.lucene.queryparser.classic.ParseException;
:
: *import* org.apache.lucene.queryparser.classic.QueryParser;
:
: *import* org.apache.lucene.search.IndexSearcher;
:
: *import* org.apache.lucene.search.Query;
:
: *import* org.apache.lucene.search.ScoreDoc;
:
: *import* org.apache.lucene.search.TopDocs;
:
: *import* org.apache.lucene.store.Directory;
:
: *import* org.apache.lucene.store.MMapDirectory;
:
:
:
: *public* *class* HelloLucene {
:
: *private* *static* Analyzer buildAnalyzer() *throws* IOException {
:
: *return* CustomAnalyzer.*builder*()
:
: .withTokenizer("keyWord")
:
: .addTokenFilter("lowercase")
:
: .build();
:
:
:
: }
:
:
:
:
:
: *public* *static* *void* main(String[] args) *throws* IOException,
: ParseException {
:
: Analyzer analyzer = *buildAnalyzer*();
:
: // 1. create the index
:
: Directory index = *new* MMapDirectory(Paths.*get*("c:\\temp\\index"
: ));
:
:
:
: IndexWriterConfig config = *new* IndexWriterConfig(analyzer);
:
:
:
: String indexType = "create";
:
: *if* ("create".equals(indexType)) {
:
: config.setOpenMode(OpenMode.*CREATE*);
:
: } *else* {
:
: config.setOpenMode(OpenMode.*CREATE_OR_APPEND*);
:
: }
:
: IndexWriter w = *new* IndexWriter(index, config);
:
:
:
:
:
: *long* start = System.*currentTimeMillis*();
:
: *addDoc*(w, "Temp/Hello", "Artifact");
:
:
:
: *long* end = System.*currentTimeMillis*();
:
: w.close();
:
:
:
: *for* (*int* i = 0; i < 100; i++) {
:
:
:
: // 2. query
:
: BufferedReader input = *new* BufferedReader(*new*
: InputStreamReader(System.*in*));
:
: String query = input.readLine();
:
:
:
: //Prefix Search
:
:
:
: QueryParser queryParser = *new* QueryParser("Name",analyzer);
:
: queryParser.setAllowLeadingWildcard(*true*);
:
: Query q = queryParser.parse(QueryParser.*escape*(query));
:
:
:
:
:
: // 3. search
:
:
:
: *int* hitsPerPage = 10;
:
: IndexReader reader = DirectoryReader.*open*(index);
:
: IndexSearcher searcher = *new* IndexSearcher(reader);
:
: TopDocs docs = searcher.search(q, hitsPerPage);
:
: ScoreDoc[] hits = docs.scoreDocs;
:
:
:
: // 4. display results
:
: *System.**out*.println("Found " + hits.length + " hits.");
:
: *for* (*int* j = 0; j < hits.length; ++j) {
:
: *int* docId = hits[j].doc;
:
: Document d = searcher.doc(docId);
:
: *System.**out*.println((j + 1) + ". " + d.get("Name") + "\t"
: + d.get("Type"));
:
: }
:
:
:
: reader.close();
:
: }
:
:
:
: }
:
:
:
: *private* *static* *void* addDoc(IndexWriter w, String name, String type)
: *throws* IOException {
:
: Document doc = *new* Document();
:
: doc.add(*new* TextField("Name", name, Field.Store.*YES*));
:
:
:
: // use a string field for *isbn* because we don't want it
: *tokenized*
:
: doc.add(*new* TextField("Type", type, Field.Store.*YES*));
:
: w.addDocument(doc);
:
: }
:
: }
:

-Hoss
http://www.lucidworks.com/