Mailing List Archive

cvs commit: jakarta-lucene-sandbox/projects/appex/src/java/search AbstractDataSource.java DataSource.java DocumentHandler.java FSDataSource.java IllegalConfigurationException.java SearchConfiguration.java SearchIndexer.java
kelvint 02/05/08 08:52:23

Modified: projects/appex/src/java/search AbstractDataSource.java
DataSource.java DocumentHandler.java
FSDataSource.java
IllegalConfigurationException.java
SearchConfiguration.java SearchIndexer.java
Log:
Importing the classes seems to have warped the whitespace. Here's my attempt to get things back to normal.

Introduced new datasource and contenthandler mechanism. The alteration is too major for individual changes to be enumerated.

Revision Changes Path
1.2 +87 -74 jakarta-lucene-sandbox/projects/appex/src/java/search/AbstractDataSource.java

Index: AbstractDataSource.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/projects/appex/src/java/search/AbstractDataSource.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- AbstractDataSource.java 4 May 2002 15:43:46 -0000 1.1
+++ AbstractDataSource.java 8 May 2002 15:52:23 -0000 1.2
@@ -1,75 +1,88 @@
-package search;
-
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Turbine" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Turbine", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
-/**
- * Generic implementation of a datasource.
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
- */
-public abstract class AbstractDataSource implements DataSource
-{
- protected SearchConfiguration config;
-
- public AbstractDataSource(SearchConfiguration config)
- {
- this.config = config;
- }
-
- public SearchConfiguration getConfig()
- {
- return this.config;
- }
+package search;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache POI" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Generic implementation of a datasource.
+ */
+public abstract class AbstractDataSource implements DataSource
+{
+ protected AbstractDataSource()
+ {
+ }
+
+ protected AbstractDataSource(Map map)
+ {
+ loadFields(map);
+ }
+
+ /**
+ * Fields to index.
+ */
+ protected String[] fields;
+
+ /**
+ * Convenience method to load the names of the fields to index from the keys of a Map.
+ */
+ protected void loadFields(Map map)
+ {
+ Set fieldSet = map.keySet();
+ fields = new String[fieldSet.size()];
+ fieldSet.toArray(fields);
+ }
}



1.2 +99 -79 jakarta-lucene-sandbox/projects/appex/src/java/search/DataSource.java

Index: DataSource.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/projects/appex/src/java/search/DataSource.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- DataSource.java 4 May 2002 15:43:46 -0000 1.1
+++ DataSource.java 8 May 2002 15:52:23 -0000 1.2
@@ -1,81 +1,101 @@
package search;
-
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Turbine" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Turbine", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
-import java.util.List;
-
-/**
- * A datasource is any source of data (filesystem, database, URL, etc)
- * which is indexed by SearchIndexer.
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
- */
-public interface DataSource
-{
- public static final String OBJECT_CLASS = "objectClass";
- public static final String OBJECT_IDENTIFIER = "objectid";
-
- /**
- * Retrieve a list of Maps. Each map represents the
- * a document to be indexed. The key:value pair of the map
- * is the data of the document.
- */
- public List getData() throws Exception;
-
- /**
- * Obtain the SearchConfiguration object used to configure the datasource.
- */
- public SearchConfiguration getConfig();
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache POI" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.util.Map;
+
+/**
+ * A datasource is any source of data (filesystem, database, URL, etc)
+ * which is indexed by SearchIndexer.
+ */
+public interface DataSource
+{
+ /**
+ * Key in the map (located in the list returned by getData)
+ * to represent the class name of the object being indexed.
+ */
+ public static final String OBJECT_CLASS = "objectClass";
+
+ /**
+ * Key in the map (located in the list returned by getData)
+ * to represent the uuid of the object being indexed.
+ */
+ public static final String OBJECT_IDENTIFIER = "objectId";
+
+ /**
+ * The key in the map (located in the list returned by getData)
+ * to represent nested datasources.
+ */
+ public static final String NESTED_DATASOURCE = "nestedDataSource";
+
+ /**
+ * Key in the map (located in the list returned by getData)
+ * to represent the id of the datasource's container. Applies to
+ * nested datasources.
+ */
+ public static final String CONTAINER_IDENTIFIER = "containerId";
+
+ /**
+ * Key in the map to represent the class name of the Search Result
+ * object for this datasource (if any).
+ */
+ public static final String SEARCH_RESULT_CLASSNAME = "resultClassname";
+
+ /**
+ * Retrieve an array of Maps. Each map represents
+ * a document to be indexed. The key:value pairs of the map
+ * are the metadata of the document.
+ */
+ public Map[] getData() throws Exception;
}



1.2 +317 -231 jakarta-lucene-sandbox/projects/appex/src/java/search/DocumentHandler.java

Index: DocumentHandler.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/projects/appex/src/java/search/DocumentHandler.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- DocumentHandler.java 4 May 2002 15:43:46 -0000 1.1
+++ DocumentHandler.java 8 May 2002 15:52:23 -0000 1.2
@@ -1,231 +1,317 @@
-package search;
-
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Turbine" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Turbine", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
-import org.apache.log4j.Category;
-import org.apache.lucene.document.DateField;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexWriter;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import search.util.IOUtils;
-import search.contenthandler.FileContentHandler;
-import search.contenthandler.ContentHandlerFactory;
-
-/**
- * <p>
- * A document is the atomic unit used for indexing purposes. It consists of
- * metadata as well as its file contents. File contents are handled by {@link FileContentHandler}.
- * </p>
- * <p>
- * DocumentHandler creates the {@link org.apache.lucene.document.Document},
- * adds the standard fields to it, delegates to {@link FileContentHandler} to handle
- * file contents, then adds to the {@link org.apache.lucene.index.IndexWriter}.
- * </p>
- * <p>
- * The standard fields are:<br>
- * <ul>
- * <li>filePath : Full filesystem path to the document
- * <li>fileName : File name of the document
- * <li>fileLastModifiedDate : Date the file was last modified
- * <li>fileSize : Size of the file in bytes
- * <li>fileFormat : Extension of the file {@see com.marketingbright.core.util.IOUtils#getFileExtension}
- * </ul>
- * </p>
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
- */
-public class DocumentHandler
-{
- public static final String[] STANDARD_SEARCH_FIELDS =
- {"filePath", "fileName", "fileLastModifiedDate", "fileSize", "fileFormat"};
- private static Category cat = Category.getInstance(DocumentHandler.class.getName());
- private static Map customFields;
- private static final String EMPTY_STRING = "";
-
- /**
- * Document object this DocumentHandler is handling.
- */
- private Document doc;
-
- /**
- * Parent Document (null if none).
- */
- private Document parentDoc;
-
- /**
- * IndexWriter to add this document to.
- */
- private IndexWriter writer;
-
- public static void setCustomFields(Map aCustomFields)
- {
- customFields = aCustomFields;
- }
-
- public DocumentHandler(IndexWriter writer)
- {
- this.writer = writer;
- doc = new Document();
- }
-
- public DocumentHandler(IndexWriter writer, Document parentDoc)
- {
- this(writer);
- this.parentDoc = parentDoc;
- }
-
- public void process(Map metadata) throws IOException
- {
- File contentFile = new File((String) metadata.get("filePath"));
-
- // add the standard fields
- doc.add(Field.Keyword("filePath", contentFile.toString()));
- doc.add(Field.Text("fileName", contentFile.getName()));
- doc.add(Field.Keyword("fileLastModifiedDate", DateField.timeToString(contentFile.lastModified())));
- doc.add(Field.Keyword("fileSize", String.valueOf(contentFile.length())));
- doc.add(Field.Text("fileFormat", IOUtils.getFileExtension(contentFile)));
-
- // check if this is a document from datasource where
- // custom fields need to be added
- if (parentDoc == null)
- {
- // add the custom fields
- for (Iterator it = customFields.keySet().iterator(); it.hasNext();)
- {
- String field = (String) it.next();
- String value = (String) metadata.get(field);
- String type = (String) customFields.get(field);
- addFieldToDoc(type, field, value);
- }
- // Add OBJECT_CLASS_FIELD and OBJECT_IDENTIFIER
- // to populate the result templates with the proper
- // objects
- doc.add(Field.UnIndexed(DataSource.OBJECT_CLASS,
- (String) metadata.get(DataSource.OBJECT_CLASS)));
- doc.add(Field.Text(DataSource.OBJECT_IDENTIFIER,
- (String) metadata.get(DataSource.OBJECT_IDENTIFIER)));
- }
- else
- {
- for (Iterator it = customFields.keySet().iterator(); it.hasNext();)
- {
- String field = (String) it.next();
- String value = parentDoc.get(field);
- String type = (String) customFields.get(field);
- addFieldToDoc(type, field, value);
- }
- // Add OBJECT_CLASS_FIELD and OBJECT_IDENTIFIER
- // to populate the result templates with the proper
- // objects
- doc.add(Field.UnIndexed(DataSource.OBJECT_CLASS,
- parentDoc.get(DataSource.OBJECT_CLASS)));
- doc.add(Field.Text(DataSource.OBJECT_IDENTIFIER,
- parentDoc.get(DataSource.OBJECT_IDENTIFIER)));
- }
- if (!metadata.containsKey("fileContents"))
- {
- String extension = IOUtils.getFileExtension(contentFile);
- FileContentHandler cHandler = ContentHandlerFactory.getContentHandler(extension);
- if (cHandler != null)
- {
- cHandler.parse(doc, contentFile);
- if (cHandler.isNested())
- {
- List nestedData = cHandler.getNestedData();
- cat.debug("Nested data list size:" + nestedData.size());
- for (int i = 0; i < nestedData.size(); i++)
- {
- Map dataMap = (Map) nestedData.get(i);
- DocumentHandler handler = new DocumentHandler(writer, doc);
- handler.process(dataMap);
- }
- }
- }
- else
- {
- cat.warn("FileContentHandler not found for " + contentFile.getName());
- }
- }
- else
- doc.add(Field.Text("fileContents", (String) metadata.get("fileContents")));
- addToWriter();
- }
-
- public void addToWriter() throws IOException
- {
- writer.addDocument(this.doc);
- }
-
- private void addFieldToDoc(String type, String field, String value)
- {
- if (value == null)
- value = EMPTY_STRING;
- if (type.equalsIgnoreCase(SearchConfiguration.TEXT_FIELD_TYPE))
- doc.add(Field.Text(field, value));
- else if (type.equalsIgnoreCase(SearchConfiguration.KEYWORD_FIELD_TYPE))
- doc.add(Field.Keyword(field, value));
- else if (type.equalsIgnoreCase(SearchConfiguration.UNINDEXED_FIELD_TYPE))
- doc.add(Field.UnIndexed(field, value));
- else if (type.equalsIgnoreCase(SearchConfiguration.UNSTORED_FIELD_TYPE))
- doc.add(Field.UnStored(field, value));
- }
-}
+package search;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import org.apache.log4j.Category;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import search.util.StringUtils;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.*;
+
+/**
+ * <p>
+ * A document is the atomic unit used for indexing purposes. It consists of
+ * metadata as well as its file contents. File contents are handled by
+ * {@link ContentHandler}.
+ * </p>
+ * <p>
+ * DocumentHandler creates the {@link org.apache.lucene.document.Document},
+ * adds fields to it, delegates to {@link ContentHandler} to handle
+ * file contents.
+ * </p>
+ */
+public class DocumentHandler
+{
+ /**
+ * Field to retrieve all documents.
+ */
+ public static final String ALL_DOCUMENTS_FIELD = "AllDocuments";
+
+ private static Category cat = Category.getInstance(DocumentHandler.class);
+
+ private static boolean isDebugEnabled = cat.isDebugEnabled();
+
+ /**
+ * Should parent documents include data of their children?
+ */
+ private static boolean parentEncapsulation = false;
+ /**
+ * Document object this DocumentHandler is handling.
+ */
+ private Document doc;
+
+ /**
+ * Map of metadata for this document. Contains the field:value pair
+ * to be added to the document.
+ */
+ private Map metadata;
+
+ /**
+ * Map of fields. Contains field:type_of_field pair.
+ */
+ private Map customFields;
+
+ /**
+ * IndexWriter.
+ */
+ private IndexWriter writer;
+
+ /**
+ * A collection of documents to be added to the writer.
+ */
+ private List documents = new ArrayList();
+
+ /**
+ * Ctor.
+ *
+ * @param metadata Map of metadata for this document.
+ * @param customFields Map of fields.
+ * @param writer Writer.
+ */
+ public DocumentHandler(Map metadata,
+ Map customFields,
+ IndexWriter writer)
+ {
+ this.metadata = metadata;
+ this.customFields = customFields;
+ this.writer = writer;
+ }
+
+ /**
+ * Handles the actual processing of the document.
+ */
+ public void process() throws IOException, Exception
+ {
+ String objectid = (String) metadata.get(DataSource.OBJECT_IDENTIFIER);
+ if (objectid == null)
+ return;
+ doc = createDocument();
+ addMapToDoc(metadata);
+ addNestedDataSource(metadata);
+ doc.add(Field.Text(ALL_DOCUMENTS_FIELD, ALL_DOCUMENTS_FIELD));
+ //documents.add(doc);
+ if (writer != null)
+ {
+ addToWriter();
+ }
+ else
+ {
+ documents.add(doc);
+ }
+ }
+
+ private List getDocuments()
+ {
+ return documents;
+ }
+
+ private Document createDocument()
+ {
+ return new Document();
+ }
+
+ /**
+ * Add the contents of a Map to a document.
+ *
+ * @param map Map to add.
+ */
+ private void addMapToDoc(Map map)
+ {
+ for (Iterator it = map.keySet().iterator(); it.hasNext();)
+ {
+ String field = (String) it.next();
+ Object value = map.get(field);
+ if (value instanceof String)
+ {
+ String type = null;
+ if (customFields != null)
+ {
+ type = (String) customFields.get(field);
+ }
+ addFieldToDoc(type, field, (String) value);
+ }
+ else if (value instanceof Reader)
+ {
+ addFieldToDoc(field, (Reader) value);
+ }
+ }
+ }
+
+ /**
+ * Add nested datasources.
+ *
+ * @param map Map which contains the nested datasources.
+ */
+ private void addNestedDataSource(Map map) throws Exception
+ {
+ Object o = map.get(DataSource.NESTED_DATASOURCE);
+ if (o == null)
+ return;
+ if (o instanceof List)
+ {
+ List nestedDataSource = (List) o;
+ for (int i = 0; i < nestedDataSource.size(); i++)
+ {
+ DataSource ds = (DataSource) nestedDataSource.get(i);
+ addDataSource(ds);
+ }
+ }
+ else if (o instanceof DataSource)
+ {
+ DataSource ds = (DataSource) o;
+ addDataSource(ds);
+ }
+ }
+
+ /**
+ * Datasources are basically a collection of data maps to be indexed.
+ * addMapToDoc is invoked for each map.
+ *
+ * @param ds Datasource to add.
+ */
+ private void addDataSource(DataSource ds) throws Exception
+ {
+ Map[] data = ds.getData();
+ for (int i = 0; i < data.length; i++)
+ {
+ Map map = data[i];
+ if (map.containsKey(DataSource.OBJECT_IDENTIFIER))
+ {
+ /**
+ * Create a new document because child datasources may need
+ * to be retrieved independently of parent doc.
+ */
+ DocumentHandler docHandler = new DocumentHandler(map, null, null);
+ docHandler.process();
+ documents.addAll(docHandler.getDocuments());
+ }
+ else
+ {
+ addMapToDoc(map);
+ /**
+ * Add nested datasources of this datasource's data
+ */
+ addNestedDataSource(map);
+ }
+ }
+ }
+
+ /**
+ * Adds a String-based field to a document.
+ *
+ * @param type Type of field.
+ * @param field Name of field.
+ * @param value Value of field.
+ */
+ private void addFieldToDoc(String type, String field, String value)
+ {
+ if (value == null)
+ value = StringUtils.EMPTY_STRING;
+ if (SearchConfiguration.KEYWORD_FIELD_TYPE.equalsIgnoreCase(type))
+ doc.add(Field.Keyword(field, value));
+ else if (SearchConfiguration.UNINDEXED_FIELD_TYPE.equalsIgnoreCase(type))
+ doc.add(Field.UnIndexed(field, value));
+ else if (SearchConfiguration.UNSTORED_FIELD_TYPE.equalsIgnoreCase(type))
+ doc.add(Field.UnStored(field, value));
+ else
+ doc.add(Field.Text(field, value));
+ }
+
+ /**
+ * Adds a Reader-based field to a document.
+ *
+ * @param field Name of field.
+ * @param reader Reader.
+ */
+ private void addFieldToDoc(String field, Reader reader)
+ {
+ doc.add(Field.Text(field, reader));
+ }
+
+ /**
+ * Adds documents to the IndexWriter.
+ */
+ private void addToWriter() throws IOException
+ {
+ if (parentEncapsulation)
+ {
+ for (int i = 0; i < documents.size(); i++)
+ {
+ Document d = (Document) documents.get(i);
+ for (Enumeration e = d.fields(); e.hasMoreElements();)
+ {
+ Field f = (Field) e.nextElement();
+ String fieldName = f.name();
+ if (!fieldName.equals(DataSource.CONTAINER_IDENTIFIER)
+ && !fieldName.equals(DataSource.OBJECT_CLASS)
+ && !fieldName.equals(DataSource.OBJECT_IDENTIFIER))
+ {
+ doc.add(f);
+ }
+ }
+ }
+ }
+ writer.addDocument(doc);
+ for (int i = 0; i < documents.size(); i++)
+ {
+ writer.addDocument((Document) documents.get(i));
+ }
+ //cat.debug((documents.size() + 1) + " documents added.");
+ }
+}



1.2 +159 -109 jakarta-lucene-sandbox/projects/appex/src/java/search/FSDataSource.java

Index: FSDataSource.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/projects/appex/src/java/search/FSDataSource.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- FSDataSource.java 4 May 2002 15:43:46 -0000 1.1
+++ FSDataSource.java 8 May 2002 15:52:23 -0000 1.2
@@ -1,109 +1,159 @@
-package search;
-
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Turbine" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Turbine", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
-import org.apache.lucene.document.DateField;
-import org.apache.lucene.document.Field;
-
-import java.io.File;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-/**
- * A filesystem-based datasource.
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
- */
-public class FSDataSource extends AbstractDataSource
-{
- private File targetDirectory;
-
- public FSDataSource(SearchConfiguration config)
- {
- super(config);
- }
-
- public List getData()
- {
- List returnData = new ArrayList();
- loadDataFromFiles(targetDirectory, returnData);
- return returnData;
- }
-
- public void setTargetDirectory(File targetDirectory)
- {
- this.targetDirectory = targetDirectory;
- }
-
- private void loadDataFromFiles(File f, List list)
- {
- if (f.isDirectory())
- {
- File[] directoryTree = f.listFiles();
- for (int i = 0; i < directoryTree.length; i++)
- {
- loadDataFromFiles(directoryTree[i], list);
- }
- }
- else
- {
- Map dataMap = new HashMap();
- dataMap.put("filePath", f.getPath());
- list.add(dataMap);
- }
- }
-}
+package search;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import org.apache.lucene.document.DateField;
+import search.contenthandler.FileContentHandler;
+import search.contenthandler.FileContentHandlerFactory;
+import search.util.IOUtils;
+
+import java.io.File;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A filesystem-based datasource.
+ *
+ * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
+ */
+public class FSDataSource extends AbstractDataSource
+{
+    /** Map key for the file's path, as returned by File.getPath(). */
+    public static final String FILE_PATH_FIELD = "filePath";
+    /** Map key for the file's name, as returned by File.getName(). */
+    public static final String FILE_NAME_FIELD = "fileName";
+    /** Map key for the file's length in bytes, stored as a String. */
+    public static final String FILE_SIZE_FIELD = "fileSize";
+    /** Map key for the value of IOUtils.getFileExtension() for the file. */
+    public static final String FILE_FORMAT_FIELD = "fileFormat";
+    /** Map key for the Reader supplied by the file's content handler. */
+    public static final String FILE_CONTENTS_FIELD = "fileContents";
+    /** Map key for the last-modified time, encoded by DateField.timeToString(). */
+    public static final String FILE_LAST_MODIFIED_DATE_FIELD = "fileLastModifiedDate";
+
+    /** File or directory whose contents are returned by getData(). */
+    private File targetFileOrDir;
+
+    /**
+     * Creates a datasource for the given path.
+     *
+     * @param targetFileOrDirStr Path of the file or directory to read.
+     */
+    public FSDataSource(String targetFileOrDirStr)
+    {
+        this(new File(targetFileOrDirStr));
+    }
+
+    /**
+     * Creates a datasource for the given file or directory.
+     *
+     * @param targetFileOrDir File or directory to read.
+     */
+    public FSDataSource(File targetFileOrDir)
+    {
+        setTargetDirectory(targetFileOrDir);
+    }
+
+    /**
+     * Returns one Map per file found at or below the target. A directory
+     * target is traversed recursively; a plain file yields a single Map.
+     *
+     * @return Array of Maps keyed by the FILE_*_FIELD constants; empty
+     *         when no files were found.
+     */
+    public Map[] getData()
+    {
+        List collected = new ArrayList();
+        loadDataFromFiles(targetFileOrDir, collected);
+        return (Map[]) collected.toArray(new Map[collected.size()]);
+    }
+
+    /**
+     * Sets the file or directory to read.
+     *
+     * @param targetFileOrDir File or directory to read.
+     */
+    public void setTargetDirectory(File targetFileOrDir)
+    {
+        this.targetFileOrDir = targetFileOrDir;
+    }
+
+    /**
+     * Recursively walks <code>f</code>, appending one Map of file metadata
+     * (and contents, when a handler provides them) to <code>list</code> for
+     * every non-directory entry.
+     *
+     * @param f File or directory to process.
+     * @param list List receiving the generated Maps.
+     */
+    private void loadDataFromFiles(File f, List list)
+    {
+        if (f.isDirectory())
+        {
+            File[] directoryTree = f.listFiles();
+            // listFiles() returns null on an I/O error or if the path is no
+            // longer a directory; skip it instead of failing with a
+            // NullPointerException.
+            if (directoryTree == null)
+            {
+                return;
+            }
+            for (int i = 0; i < directoryTree.length; i++)
+            {
+                loadDataFromFiles(directoryTree[i], list);
+            }
+        }
+        else
+        {
+            Map dataMap = new HashMap();
+            dataMap.put(FILE_PATH_FIELD, f.getPath());
+            dataMap.put(FILE_NAME_FIELD, f.getName());
+            dataMap.put(FILE_LAST_MODIFIED_DATE_FIELD,
+                    DateField.timeToString(f.lastModified()));
+            dataMap.put(FILE_SIZE_FIELD, String.valueOf(f.length()));
+            dataMap.put(FILE_FORMAT_FIELD,
+                    IOUtils.getFileExtension(f));
+            addFileContents(f, dataMap);
+            list.add(dataMap);
+        }
+    }
+
+    /**
+     * Asks the content handler for <code>targetFile</code>, if one exists,
+     * for a Reader over the file's contents and for any nested datasource,
+     * and stores whatever it provides in <code>dataMap</code>.
+     *
+     * @param targetFile File whose contents are wanted.
+     * @param dataMap Map receiving the FILE_CONTENTS_FIELD and
+     *        NESTED_DATASOURCE entries.
+     */
+    private void addFileContents(File targetFile, Map dataMap)
+    {
+        FileContentHandler cHandler =
+                FileContentHandlerFactory.getContentHandler(targetFile);
+        if (cHandler == null)
+        {
+            // No handler for this file: nothing is added to dataMap here.
+            //cat.warn("ContentHandler not found for " + targetFile.getName());
+            return;
+        }
+        if (cHandler.fileContentIsReadable())
+        {
+            Reader r = cHandler.getReader();
+            if (r != null)
+            {
+                dataMap.put(FILE_CONTENTS_FIELD, r);
+            }
+        }
+        if (cHandler.containsNestedData())
+        {
+            dataMap.put(NESTED_DATASOURCE, cHandler.getNestedDataSource());
+        }
+    }
+}



1.2 +2 -2 jakarta-lucene-sandbox/projects/appex/src/java/search/IllegalConfigurationException.java

Index: IllegalConfigurationException.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/projects/appex/src/java/search/IllegalConfigurationException.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- IllegalConfigurationException.java 4 May 2002 15:43:46 -0000 1.1
+++ IllegalConfigurationException.java 8 May 2002 15:52:23 -0000 1.2
@@ -26,12 +26,12 @@
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Turbine" must not be used to endorse or promote products
+ * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
- * "Apache Turbine", nor may "Apache" appear in their name, without
+ * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED



1.2 +257 -257 jakarta-lucene-sandbox/projects/appex/src/java/search/SearchConfiguration.java

Index: SearchConfiguration.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/projects/appex/src/java/search/SearchConfiguration.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- SearchConfiguration.java 4 May 2002 15:43:46 -0000 1.1
+++ SearchConfiguration.java 8 May 2002 15:52:23 -0000 1.2
@@ -1,257 +1,257 @@
-package search;
-
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Turbine" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Turbine", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
-import org.apache.log4j.Category;
-import org.jdom.Document;
-import org.jdom.Element;
-import org.jdom.input.SAXBuilder;
-import search.util.DataUnformatFilter;
-import search.contenthandler.ContentHandlerFactory;
-
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.StringTokenizer;
-
-/**
- * Configures the indexing process using an XML file.
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
- */
-public class SearchConfiguration
-{
- public static final String TEXT_FIELD_TYPE = "text";
- public static final String KEYWORD_FIELD_TYPE = "keyword";
- public static final String UNINDEXED_FIELD_TYPE = "unindexed";
- public static final String UNSTORED_FIELD_TYPE = "unstored";
-
- /** Log4j category.
- */
- static Category cat = Category.getInstance(SearchConfiguration.class.getName());
-
- /**
- * Key in the config file to declare content handlers.
- */
- private static final String CONTENT_HANDLER_KEY = "Search.ContentHandlers";
-
- /**
- * Key in the config file to declare custom fields.
- */
- private static final String FIELD_KEY = "Search.Fields";
-
- /**
- * Map of content handlers.
- */
- private Map contentHandlers = new HashMap();
-
- /**
- * Map of (non-standard) custom fields to index.
- */
- private Map customFields = new HashMap();
-
- /**
- * Document object which represents the xml configuration file.
- */
- private Document doc;
-
- /**
- * Creates a new SearchConfiguration.
- *
- * @param configFile Name of the xml configuration file.
- */
- public SearchConfiguration(String configFile) throws IllegalConfigurationException
- {
- try
- {
- SAXBuilder builder = new SAXBuilder();
- DataUnformatFilter format = new DataUnformatFilter();
- builder.setXMLFilter(format);
- doc = builder.build(configFile);
- }
- catch (Exception e)
- {
- cat.error("Error creating XML parser:" + e.getMessage(), e);
- }
- loadContentHandlers();
- loadCustomFields();
- }
-
- public Map getContentHandlers()
- {
- return this.contentHandlers;
- }
-
- public Map getCustomFields()
- {
- return this.customFields;
- }
-
- /**
- * Loads the content handlers.
- */
- protected void loadContentHandlers() throws IllegalConfigurationException
- {
- String[] extensions = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "extension");
- String[] handlers = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "handler");
- if (extensions.length != handlers.length)
- throw new IllegalConfigurationException(
- "Illegal configuration of Search Content Handlers!");
- for (int i = 0; i < extensions.length; i++)
- {
- contentHandlers.put(extensions[i], generateObject(handlers[i]));
- }
- String[] defaultExtension = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "default");
- for (int i = 0; i < defaultExtension.length; i++)
- {
- if (defaultExtension[i] != null && defaultExtension[i].equals("true"))
- {
- contentHandlers.put(ContentHandlerFactory.DEFAULT_HANDLER_KEY
- , generateObject(handlers[i]));
- }
- }
- }
-
- /**
- * Loads the custom fields to index.
- */
- protected void loadCustomFields() throws IllegalConfigurationException
- {
- String[] fields = getChildPropertyAttributeValues(FIELD_KEY, "name");
- String[] fieldtypes = getChildPropertyAttributeValues(FIELD_KEY, "type");
- if (fields.length != fieldtypes.length)
- throw new IllegalConfigurationException(
- "Illegal configuration of custom search fields!");
- for (int i = 0; i < fields.length; i++)
- {
- customFields.put(fields[i], fieldtypes[i]);
- }
- }
-
- /**
- * Return attribute values for all child nodes.
- */
- private String[] getChildPropertyAttributeValues(String parent,
- String attributeName)
- {
- String[] nodeName = parseNodeName(parent);
- Element element = doc.getRootElement();
- for (int i = 0; i < nodeName.length; i++)
- {
- element = element.getChild(nodeName[i]);
- if (element == null)
- {
- return new String[]{};
- }
- }
- List children = element.getChildren();
- int childCount = children.size();
- String[] childrenAttributeValue = new String[childCount];
- for (int i = 0; i < childCount; i++)
- {
- childrenAttributeValue[i] =
- ((Element) children.get(i)).getAttributeValue(attributeName);
- }
- return childrenAttributeValue;
- }
-
- /**
- * Node names are in the form "x.y.z". Returns a String array
- * representation of the node elements.
- */
- private String[] parseNodeName(String nodeName)
- {
- StringTokenizer st = new StringTokenizer(nodeName, ".");
- String[] nodeElements = new String[st.countTokens()];
- int i = 0;
- while (st.hasMoreTokens())
- {
- nodeElements[i] = st.nextToken();
- ++i;
- }
- return nodeElements;
- }
-
- /**
- * Utility method to return an object based on its class name.
- * The object needs to have a constructor which accepts no parameters.
- *
- * @param className Class name of object to be generated
- * @return Object
- */
- private static Object generateObject(String className)
- {
- Object o = null;
- try
- {
- Class c = Class.forName(className);
- o = c.newInstance();
- }
- catch (ClassNotFoundException cnfe)
- {
- cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
- }
- catch (InstantiationException ie)
- {
- cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
- }
- catch (IllegalAccessException iae)
- {
- cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
- }
- return o;
- }
-
-}
+package search;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import org.apache.log4j.Category;
+import org.jdom.Document;
+import org.jdom.Element;
+import org.jdom.input.SAXBuilder;
+import search.util.DataUnformatFilter;
+import search.contenthandler.FileContentHandlerFactory;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.StringTokenizer;
+
+/**
+ * Configures the indexing process using an XML file.
+ *
+ * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
+ */
+public class SearchConfiguration
+{
+    /* Field type names. */
+    public static final String TEXT_FIELD_TYPE = "text";
+    public static final String KEYWORD_FIELD_TYPE = "keyword";
+    public static final String UNINDEXED_FIELD_TYPE = "unindexed";
+    public static final String UNSTORED_FIELD_TYPE = "unstored";
+
+    /** Log4j category.
+     */
+    static Category cat = Category.getInstance(SearchConfiguration.class.getName());
+
+    /**
+     * Key in the config file to declare content handlers.
+     */
+    private static final String CONTENT_HANDLER_KEY = "Search.ContentHandlers";
+
+    /**
+     * Key in the config file to declare custom fields.
+     */
+    private static final String FIELD_KEY = "Search.Fields";
+
+    /**
+     * Map of content handlers, keyed by the "extension" attribute value.
+     */
+    private Map contentHandlers = new HashMap();
+
+    /**
+     * Map of (non-standard) custom fields to index: field name to field type.
+     */
+    private Map customFields = new HashMap();
+
+    /**
+     * Document object which represents the xml configuration file.
+     */
+    private Document doc;
+
+    /**
+     * Creates a new SearchConfiguration by parsing the given file and
+     * loading the content handlers and custom fields it declares.
+     *
+     * @param configFile Name of the xml configuration file.
+     * @throws IllegalConfigurationException if the file cannot be parsed,
+     *         or if its handler or field declarations are inconsistent.
+     */
+    public SearchConfiguration(String configFile) throws IllegalConfigurationException
+    {
+        try
+        {
+            SAXBuilder builder = new SAXBuilder();
+            DataUnformatFilter format = new DataUnformatFilter();
+            builder.setXMLFilter(format);
+            doc = builder.build(configFile);
+        }
+        catch (Exception e)
+        {
+            cat.error("Error creating XML parser:" + e.getMessage(), e);
+            // Without a parsed document the loaders below would fail with a
+            // NullPointerException; report the real cause instead.
+            throw new IllegalConfigurationException(
+                    "Unable to load search configuration '" + configFile
+                    + "': " + e.getMessage());
+        }
+        loadContentHandlers();
+        loadCustomFields();
+    }
+
+    /**
+     * @return The map of content handlers (the internal instance, not a copy).
+     */
+    public Map getContentHandlers()
+    {
+        return this.contentHandlers;
+    }
+
+    /**
+     * @return The map of custom fields (the internal instance, not a copy).
+     */
+    public Map getCustomFields()
+    {
+        return this.customFields;
+    }
+
+    /**
+     * Loads the content handlers. Each child of the content-handler node
+     * contributes one handler, instantiated from its "handler" attribute
+     * and stored under its "extension" attribute. A child whose "default"
+     * attribute is "true" also has a handler stored under
+     * FileContentHandlerFactory.DEFAULT_HANDLER_KEY.
+     *
+     * @throws IllegalConfigurationException if the number of extensions
+     *         differs from the number of handlers.
+     */
+    protected void loadContentHandlers() throws IllegalConfigurationException
+    {
+        String[] extensions = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "extension");
+        String[] handlers = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "handler");
+        if (extensions.length != handlers.length)
+            throw new IllegalConfigurationException(
+                    "Illegal configuration of Search Content Handlers!");
+        for (int i = 0; i < extensions.length; i++)
+        {
+            // generateObject() returns null if the class cannot be created.
+            contentHandlers.put(extensions[i], generateObject(handlers[i]));
+        }
+        String[] defaultExtension = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "default");
+        for (int i = 0; i < defaultExtension.length; i++)
+        {
+            if (defaultExtension[i] != null && defaultExtension[i].equals("true"))
+            {
+                // A separate handler instance is created for the default slot.
+                contentHandlers.put(FileContentHandlerFactory.DEFAULT_HANDLER_KEY
+                        , generateObject(handlers[i]));
+            }
+        }
+    }
+
+    /**
+     * Loads the custom fields to index, mapping each declared field name
+     * to its declared type.
+     *
+     * @throws IllegalConfigurationException if the number of names differs
+     *         from the number of types.
+     */
+    protected void loadCustomFields() throws IllegalConfigurationException
+    {
+        String[] fields = getChildPropertyAttributeValues(FIELD_KEY, "name");
+        String[] fieldtypes = getChildPropertyAttributeValues(FIELD_KEY, "type");
+        if (fields.length != fieldtypes.length)
+            throw new IllegalConfigurationException(
+                    "Illegal configuration of custom search fields!");
+        for (int i = 0; i < fields.length; i++)
+        {
+            customFields.put(fields[i], fieldtypes[i]);
+        }
+    }
+
+    /**
+     * Return attribute values for all child nodes.
+     *
+     * @param parent Dot-separated path ("x.y.z") of the parent node,
+     *        resolved from the document's root element.
+     * @param attributeName Attribute to read from each child.
+     * @return One entry per child of the parent node, in document order
+     *         (an entry may be null; loadContentHandlers() checks for
+     *         this), or an empty array if any element of the path is
+     *         missing.
+     */
+    private String[] getChildPropertyAttributeValues(String parent,
+                                                     String attributeName)
+    {
+        String[] nodeName = parseNodeName(parent);
+        Element element = doc.getRootElement();
+        for (int i = 0; i < nodeName.length; i++)
+        {
+            element = element.getChild(nodeName[i]);
+            if (element == null)
+            {
+                return new String[]{};
+            }
+        }
+        List children = element.getChildren();
+        int childCount = children.size();
+        String[] childrenAttributeValue = new String[childCount];
+        for (int i = 0; i < childCount; i++)
+        {
+            childrenAttributeValue[i] =
+                    ((Element) children.get(i)).getAttributeValue(attributeName);
+        }
+        return childrenAttributeValue;
+    }
+
+    /**
+     * Node names are in the form "x.y.z". Returns a String array
+     * representation of the node elements.
+     *
+     * @param nodeName Dot-separated node name.
+     * @return The tokens between the dots, in order.
+     */
+    private String[] parseNodeName(String nodeName)
+    {
+        StringTokenizer st = new StringTokenizer(nodeName, ".");
+        String[] nodeElements = new String[st.countTokens()];
+        int i = 0;
+        while (st.hasMoreTokens())
+        {
+            nodeElements[i] = st.nextToken();
+            ++i;
+        }
+        return nodeElements;
+    }
+
+    /**
+     * Utility method to return an object based on its class name.
+     * The object needs to have a constructor which accepts no parameters.
+     * Failures are logged, not thrown.
+     *
+     * @param className Class name of object to be generated
+     * @return The new object, or null if the class could not be found,
+     *         instantiated or accessed.
+     */
+    private static Object generateObject(String className)
+    {
+        Object o = null;
+        try
+        {
+            Class c = Class.forName(className);
+            o = c.newInstance();
+        }
+        catch (ClassNotFoundException cnfe)
+        {
+            cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
+        }
+        catch (InstantiationException ie)
+        {
+            cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
+        }
+        catch (IllegalAccessException iae)
+        {
+            cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
+        }
+        return o;
+    }
+
+}



1.2 +125 -130 jakarta-lucene-sandbox/projects/appex/src/java/search/SearchIndexer.java

Index: SearchIndexer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/projects/appex/src/java/search/SearchIndexer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- SearchIndexer.java 4 May 2002 15:43:46 -0000 1.1
+++ SearchIndexer.java 8 May 2002 15:52:23 -0000 1.2
@@ -1,132 +1,127 @@
-package search;
-
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Turbine" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Turbine", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
+package search;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
*/
-
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.log4j.Category;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-
-import search.contenthandler.ContentHandlerFactory;
-
-/**
- * Entry point for search engine indexing.
- * <p>
- * SearchIndexer is responsible for creating the IndexWriter {@see org.apache.lucene.index.IndexWriter}
- * and passing it to DocumentHandlers {@link DocumentHandler} to index individual documents.
- * </p>
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
- */
-public class SearchIndexer
-{
- private static Category cat = Category.getInstance(SearchIndexer.class);
-
- private IndexWriter writer;
- private DataSource source;
- private int indexedDocuments = 0;
-
- public SearchIndexer() throws IOException
- {
- writer = new IndexWriter("/usr/local/lucene/index",
- new StandardAnalyzer(), true);
- }
-
- public void index() throws IOException, Exception
- {
- cat.debug("Initiating indexing...");
-
- init();
- List dataMapList = source.getData();
- for (int i = 0; i < dataMapList.size(); i++)
- {
- Map map = (Map) dataMapList.get(i);
- DocumentHandler docHandler = new DocumentHandler(writer);
- try
- {
- docHandler.process(map);
- ++indexedDocuments;
- }
- catch (IOException ioe)
- {
- cat.error("Error encountered indexing:" + ioe.getMessage(),
- ioe);
- }
- }
- writer.optimize();
- writer.close();
-
- cat.debug(indexedDocuments + " documents were indexed.");
- }
-
- public void setSource(DataSource source)
- {
- this.source = source;
- }
-
- public void init()
- {
- ContentHandlerFactory.setContentHandlers(source.getConfig().getContentHandlers());
- DocumentHandler.setCustomFields(source.getConfig().getCustomFields());
- }
-
- public int getIndexedDocuments()
- {
- return this.indexedDocuments;
- }
+import org.apache.log4j.Category;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.index.IndexWriter;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import search.contenthandler.FileContentHandlerFactory;
+
+/**
+ * Entry point for search engine indexing.
+ * <p>
+ * SearchIndexer is responsible for creating the IndexWriter
+ * ({@link org.apache.lucene.index.IndexWriter}) and passing it to
+ * DocumentHandlers ({@link DocumentHandler}) to index individual documents.
+ * </p>
+ */
+public class SearchIndexer
+{
+ private static Category cat = Category.getInstance(SearchIndexer.class);
+ private IndexWriter fsWriter;
+ private SearchConfiguration config;
+ private int indexedDocuments = 0;
+
+ public SearchIndexer() throws IOException
+ {
+ Analyzer a = new StandardAnalyzer();
+ String indexDirectory = "/usr/path/to/index";
+ fsWriter = new IndexWriter(indexDirectory, a, true);
+ fsWriter.maxFieldLength = 1000000;
+ }
+
+ /**
+ * Indexes documents.
+ */
+ public synchronized void index() throws IOException, Exception
+ {
+ cat.debug("Initiating search engine indexing...");
+ long start = System.currentTimeMillis();
+ loadConfig();
+ fsWriter.optimize();
+ fsWriter.close();
+ long stop = System.currentTimeMillis();
+ cat.debug("Indexing took " + (stop - start) + " milliseconds");
+ }
+
+ public int getIndexedDocuments()
+ {
+ return this.indexedDocuments;
+ }
+
+ private void loadConfig() throws IllegalConfigurationException
+ {
+ config = new SearchConfiguration("/path/to/config");
+ FileContentHandlerFactory.setHandlerRegistry(config.getContentHandlers());
+ }
+
+ private void indexDataSource(DataSource source, Map customFields)
+ throws Exception
+ {
+ Map[] data = source.getData();
+ // here's a good place to spawn a couple of threads for indexing
+ for (int i = 0; i < data.length; i++)
+ {
+ DocumentHandler docHandler =
+ new DocumentHandler(data[i], customFields, fsWriter);
+ docHandler.process();
+ }
+ }
}




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>