Mailing List Archive

cvs commit: jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo IndexFiles.java SearchFiles.java XMLDocumentHandlerDOM.java XMLDocumentHandlerSAX.java
otis 2002/06/21 08:02:51

Added: contributions/XML-Indexing-Demo IndexingRequest.xml
contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo
IndexFiles.java SearchFiles.java
XMLDocumentHandlerDOM.java
XMLDocumentHandlerSAX.java
Log:
- Unpacked the .zip file, for easier access to the source code.

Revision Changes Path
1.1 jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/IndexingRequest.xml

Index: IndexingRequest.xml
===================================================================
<customerInfo>
<name><![CDATA[Aruna A. Raghavan]]></name>
<profession><![CDATA[Software Developer]]></profession>
<addressLine1><![CDATA[6801 West 106th Street]]></addressLine1>
<addressLine2><![CDATA[#205]]></addressLine2>
<city><![CDATA[Eagan]]></city>
<state><![CDATA[MN]]></state>
<zip><![CDATA[55121]]></zip>
<country><![CDATA[USA]]></country>
</customerInfo>



1.1 jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/IndexFiles.java

Index: IndexFiles.java
===================================================================
package org.apache.lucenesandbox.xmlindexingdemo;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;

import java.io.File;
import java.util.Date;

/**
 * Command-line demo that builds a Lucene index in the directory "index"
 * from one XML file, or from every file found below a directory.
 *
 * Usage: java org.apache.lucenesandbox.xmlindexingdemo.IndexFiles &lt;file-or-directory&gt;
 */
class IndexFiles
{
    /**
     * Indexes the file or directory named by args[0] and prints the elapsed
     * time in milliseconds.
     *
     * @param args args[0] is the file or directory to index
     * @throws IllegalArgumentException if no path argument was supplied
     * @throws Exception any failure raised while indexing; it is reported on
     *         standard output and then rethrown
     */
    public static void main(String[] args)
        throws Exception
    {
        // Validate the argument before the writer is opened: the writer is
        // created with its "create" flag set to true, so failing afterwards
        // would leave a freshly (re)created index behind.
        // NOTE(review): presumably create=true replaces an existing index;
        // confirm against the Lucene IndexWriter documentation.
        if (args == null || args.length == 0)
        {
            throw new IllegalArgumentException(
                "Usage: java " + IndexFiles.class.getName() + " <file-or-directory>");
        }

        try
        {
            Date start = new Date();

            IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), true);
            try
            {
                indexDocs(writer, new File(args[0]));
                writer.optimize();
            }
            finally
            {
                // Always release the writer, even when indexing failed.
                writer.close();
            }

            Date end = new Date();

            System.out.print(end.getTime() - start.getTime());
            System.out.println(" total milliseconds");
        }
        catch (Exception e)
        {
            System.out.println(" caught a " + e.getClass() +
                               "\n with message: " + e.getMessage());
            throw e;
        }
    }

    /**
     * Adds the given file to the index, descending recursively into
     * directories. Each regular file is parsed with
     * {@link XMLDocumentHandlerSAX} and the resulting document is added to
     * the writer.
     *
     * @param writer the open index writer that receives the documents
     * @param file a file to index, or a directory to walk
     * @throws java.io.IOException if a directory's contents cannot be listed
     * @throws Exception any failure raised while parsing or adding a document
     */
    public static void indexDocs(IndexWriter writer, File file)
        throws Exception
    {
        if (file.isDirectory())
        {
            String[] files = file.list();
            // File.list() returns null when the directory cannot be read;
            // report that explicitly instead of failing on files.length.
            if (files == null)
            {
                throw new java.io.IOException("cannot list directory: " + file);
            }
            for (int i = 0; i < files.length; i++)
            {
                indexDocs(writer, new File(file, files[i]));
            }
        }
        else
        {
            System.out.println("adding " + file);
            XMLDocumentHandlerSAX hdlr = new XMLDocumentHandlerSAX(file);
            writer.addDocument(hdlr.getDocument());
            // For DOM, use
            // XMLDocumentHandlerDOM hdlr = new XMLDocumentHandlerDOM();
            // writer.addDocument(hdlr.createXMLDocument(file));
        }
    }
}



1.1 jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/SearchFiles.java

Index: SearchFiles.java
===================================================================
package org.apache.lucenesandbox.xmlindexingdemo;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.queryParser.QueryParser;

/**
 * Interactive command-line demo that searches the Lucene index in the
 * directory "index". Queries are read from standard input and parsed
 * against the "name" field; matching documents are printed ten at a time.
 */
class SearchFiles {
    /**
     * Runs the query loop until standard input ends or a blank line is
     * entered. Any exception is reported on standard output; the searcher
     * is closed in every case.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Searcher searcher = null;
        try {
            searcher = new IndexSearcher("index");
            Analyzer analyzer = new StandardAnalyzer();

            BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
            while (true) {
                System.out.print("Query: ");
                String line = in.readLine();

                // readLine() returns null at end of input; a blank line is
                // treated as a request to quit. (The previous test,
                // length() == -1, could never be true.)
                if (line == null || line.trim().length() == 0)
                    break;

                Query query = QueryParser.parse(line, "name", analyzer);
                System.out.println("Searching for: " + query.toString("name"));

                Hits hits = searcher.search(query);
                System.out.println(hits.length() + " total matching documents");

                final int HITS_PER_PAGE = 10;
                for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
                    int end = Math.min(hits.length(), start + HITS_PER_PAGE);
                    for (int i = start; i < end; i++) {
                        Document doc = hits.doc(i);
                        System.out.println(doc.get("name"));
                        System.out.println(doc.get("profession"));
                        System.out.println(doc.get("addressLine1"));
                        System.out.println(doc.get("addressLine2"));
                        System.out.print(doc.get("city"));
                        System.out.print(" ");
                        System.out.print(doc.get("state"));
                        System.out.print(" ");
                        System.out.print(doc.get("zip"));
                        // Separate zip and country like the other address
                        // parts; they were previously printed run together.
                        System.out.print(" ");
                        System.out.println(doc.get("country"));
                    }

                    if (hits.length() > end) {
                        System.out.print("more (y/n) ? ");
                        line = in.readLine();
                        // End of input counts as "no more".
                        if (line == null || line.length() == 0 || line.charAt(0) == 'n')
                            break;
                    }
                }
            }
        } catch (Exception e) {
            System.out.println(" caught a " + e.getClass() +
                               "\n with message: " + e.getMessage());
        } finally {
            // Release the searcher whether or not the loop ended normally.
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception e) {
                    System.out.println(" caught a " + e.getClass() +
                                       "\n with message: " + e.getMessage());
                }
            }
        }
    }
}



1.1 jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerDOM.java

Index: XMLDocumentHandlerDOM.java
===================================================================
package org.apache.lucenesandbox.xmlindexingdemo;

import org.w3c.dom.*;
import org.w3c.dom.Node;
import javax.xml.parsers.*;
import org.apache.lucene.document.Field;

import java.io.File;

/**
*
*/
/**
 * Builds a Lucene document from an XML file using a DOM parser. Every
 * CDATA section that is a direct child of an element becomes a text field
 * named after that element, e.g. &lt;city&gt;&lt;![CDATA[Eagan]]&gt;&lt;/city&gt;
 * yields the field "city" with the value "Eagan".
 */
public class XMLDocumentHandlerDOM
{
    /**
     * Parses the given XML file and collects its CDATA sections into a new
     * Lucene document.
     *
     * Parsing problems are not propagated: they are printed, and the
     * document built so far (possibly without any fields) is returned.
     *
     * @param f the XML file to read
     * @return the Lucene document; never null
     */
    public org.apache.lucene.document.Document createXMLDocument(File f)
    {
        org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        try
        {
            DocumentBuilder df = dbf.newDocumentBuilder();
            org.w3c.dom.Document d = df.parse(f);
            Node root = d.getDocumentElement();
            traverseTree(root, document);
        }
        catch (Exception e)
        {
            System.out.println("error: " + e);
            e.printStackTrace();
        }
        return document;
    }

    /**
     * Walks the DOM subtree rooted at node depth-first and adds one text
     * field per CDATA section found directly inside an element.
     *
     * The field is named after the enclosing element. Earlier code stored
     * every value under the fixed name "name" and only noticed a CDATA
     * section when a text node preceded it, so markup such as
     * &lt;name&gt;&lt;![CDATA[..]]&gt;&lt;/name&gt; produced no field at all.
     *
     * @param node the subtree root to visit
     * @param document the Lucene document that receives the fields
     */
    static private void traverseTree(Node node, org.apache.lucene.document.Document document)
    {
        NodeList children = node.getChildNodes();
        if (children.getLength() > 0)
        {
            for (int i = 0; i < children.getLength(); i++)
            {
                traverseTree(children.item(i), document);
            }
            return;
        }

        // Leaf node: only CDATA sections carry indexable values.
        if (node.getNodeType() != Node.CDATA_SECTION_NODE)
        {
            return;
        }

        Node parentNode = node.getParentNode();
        if (parentNode == null || parentNode.getNodeType() != Node.ELEMENT_NODE)
        {
            return;
        }

        document.add(Field.Text(parentNode.getNodeName(), node.getNodeValue()));
    }
}



1.1 jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerSAX.java

Index: XMLDocumentHandlerSAX.java
===================================================================
package org.apache.lucenesandbox.xmlindexingdemo;

import org.xml.sax.*;
import org.xml.sax.helpers.*;
import org.xml.sax.AttributeList;
import javax.xml.parsers.*;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.File;
import java.io.IOException;

/**
 * Builds a Lucene document from an XML file using a SAX parser. The file
 * is parsed in the constructor; each element that contains no child
 * elements becomes a text field named after the element, holding the
 * element's character data.
 */
public class XMLDocumentHandlerSAX
    extends HandlerBase
{
    /** Character data collected since the most recent start tag. */
    private StringBuffer elementBuffer = new StringBuffer();

    /**
     * True between a start tag and the first end tag that follows it, i.e.
     * while the parser is inside an element that has had no child element
     * so far. Used to avoid indexing container elements.
     */
    private boolean inLeafElement = false;

    /** The document being filled; created in startDocument(). */
    private Document mDocument;

    /**
     * Parses the given XML file immediately, using this object as the SAX
     * handler. The result is available from {@link #getDocument()}.
     *
     * @param xmlFile the XML file to read
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws SAXException if the parser reports an error in the file
     * @throws IOException if the file cannot be read
     */
    public XMLDocumentHandlerSAX(File xmlFile)
        throws ParserConfigurationException, SAXException, IOException
    {
        SAXParserFactory spf = SAXParserFactory.newInstance();

        SAXParser parser = spf.newSAXParser();
        parser.parse(xmlFile, this);
    }

    /** Called at document start: creates the empty Lucene document. */
    public void startDocument()
    {
        mDocument = new Document();
    }

    /**
     * Called at element start: discards any text collected so far and
     * marks the new element as a leaf until proven otherwise.
     */
    public void startElement(String localName, AttributeList atts)
        throws SAXException
    {
        elementBuffer.setLength(0);
        inLeafElement = true;
    }

    /** Called for character data (including CDATA): appends it to the buffer. */
    public void characters(char[] text, int start, int length)
    {
        elementBuffer.append(text, start, length);
    }

    /**
     * Called at element end: stores the collected text as a field named
     * after the element.
     *
     * Only leaf elements are stored. An end tag that follows another end
     * tag closes a container element; the buffer then still holds the last
     * child's text, so adding it would index that text a second time under
     * the container's name.
     */
    public void endElement(String localName)
        throws SAXException
    {
        if (inLeafElement)
        {
            mDocument.add(Field.Text(localName, elementBuffer.toString()));
        }
        // Any further end tag before the next start tag belongs to a container.
        inLeafElement = false;
    }

    /**
     * Returns the document built while parsing.
     *
     * @return the Lucene document filled by the constructor's parse
     */
    public Document getDocument()
    {
        return mDocument;
    }
}




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>