Mailing List Archive

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage LuceneStorage.java LogStorage.java SQLServerStorage.java StoragePipeline.java
cmarschner 2002/06/17 17:44:22

Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/storage
LogStorage.java SQLServerStorage.java
StoragePipeline.java
Added: contributions/webcrawler-LARM/src/de/lanlab/larm/storage
LuceneStorage.java
Log:
added experimental version of a LuceneStorage

Revision Changes Path
1.4 +2 -2 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java

Index: LogStorage.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- LogStorage.java 1 Jun 2002 18:55:16 -0000 1.3
+++ LogStorage.java 18 Jun 2002 00:44:22 -0000 1.4
@@ -201,9 +201,9 @@
public WebDocument store(WebDocument doc)
{
String docInfo = doc.getInfo();
- if (logContents && isValid && doc.getDocumentBytes() != null)
+ if (logContents && isValid && doc.getField("content") != null)
{
- int offset = writeToPageFile(doc.getDocumentBytes());
+ int offset = writeToPageFile((byte[])doc.getField("content"));
docInfo = docInfo + "\t" + pageFileCount + "\t" + offset;
}
log.logThreadSafe(docInfo);



1.4 +2 -2 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java

Index: SQLServerStorage.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- SQLServerStorage.java 1 Jun 2002 18:55:16 -0000 1.3
+++ SQLServerStorage.java 18 Jun 2002 00:44:22 -0000 1.4
@@ -172,7 +172,7 @@
conn = getConnection();
Statement delDoc = conn.createStatement();

- // bisherige Daten löschen, indem die Tabelle neu angelegt wird (geht schneller)
+ // recreate table (faster than delete from table)

delDoc.executeUpdate("if exists (select * from sysobjects where id = object_id(N'[dbo].[Document]') and OBJECTPROPERTY(id, N'IsUserTable') = 1)drop table [dbo].[Document]");
delDoc.executeUpdate("CREATE TABLE [dbo].[Document] ([DO_ID] [int] IDENTITY (1, 1) NOT NULL , [DA_CrawlPass] [int] NULL , [DO_URL] [varchar] (255) NULL , [DO_ContentType] [varchar] (50) NULL , [DO_Data] [text] NULL , [DO_Hashcode] [int] NULL , [DO_ContentLength] [int] NULL , [DO_ContentEncoding] [varchar] (20) NULL , [DO_Data2] [image] NULL, [DO_MimeType] [varchar] (255) NULL) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]"); // löschen
@@ -206,7 +206,7 @@
addDoc = getStatement();
addDoc.setString(1, document.getURLString());
addDoc.setString(2, document.getMimeType());
- addDoc.setBytes(3, document.getDocumentBytes());
+ addDoc.setBytes(3, (byte[])document.getField("content"));
addDoc.execute();
}
catch(SQLException e)



1.2 +1 -0 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/StoragePipeline.java

Index: StoragePipeline.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/StoragePipeline.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- StoragePipeline.java 1 Jun 2002 18:55:16 -0000 1.1
+++ StoragePipeline.java 18 Jun 2002 00:44:22 -0000 1.2
@@ -95,6 +95,7 @@
{
for (Iterator it = docStorages.iterator(); it.hasNext(); )
{
+ System.out.println("opening...");
((DocumentStorage) it.next()).open();
}
isOpen = true;



1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LuceneStorage.java

Index: LuceneStorage.java
===================================================================
package de.lanlab.larm.storage;

import de.lanlab.larm.util.WebDocument;

/**
* Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
* Company:
*
* @author
* @version 1.0
*/
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.*;
import java.util.*;
import java.io.*;

/**
 * A {@link DocumentStorage} that adds every stored {@link WebDocument} to a
 * Lucene index through an {@link IndexWriter}.
 *
 * <p>Typical use: configure the instance with {@link #setIndexName(String)},
 * {@link #setAnalyzer(Analyzer)}, {@link #setCreate(boolean)} and, optionally,
 * {@link #setFieldInfo(String, int)}; then call {@link #open()} once and
 * {@link #store(WebDocument)} for each document.</p>
 *
 * <p>Each Lucene document receives a "url" and a "mimetype" field plus one
 * field for every <code>String</code> or <code>char[]</code> field of the web
 * document. Fields of any other type are ignored.</p>
 *
 * <p>NOTE(review): nothing in this class closes the <code>IndexWriter</code>;
 * confirm that the index is closed elsewhere, otherwise buffered documents may
 * never reach the index.</p>
 *
 * @author Administrator
 * @created 14. Juni 2002
 */
public class LuceneStorage implements DocumentStorage
{

    /** Flag bit: the field is indexed (passed as the "index" argument of <code>Field</code>). */
    public final static int INDEX = 1;
    /** Flag bit: the field is stored (passed as the "store" argument of <code>Field</code>). */
    public final static int STORE = 2;
    /** Flag bit: the field is tokenized (passed as the "token" argument of <code>Field</code>). */
    public final static int TOKEN = 4;

    /** Per-field flag overrides: field name (String) -> flags (Integer). */
    HashMap fieldInfos = new HashMap();
    /** The open index writer; stays <code>null</code> until {@link #open()} succeeds. */
    IndexWriter writer;
    /** Analyzer handed to the <code>IndexWriter</code> in {@link #open()}. */
    Analyzer analyzer;
    /** Index name handed to the <code>IndexWriter</code> in {@link #open()}. */
    String indexName;
    /** The "create" argument handed to the <code>IndexWriter</code> in {@link #open()}. */
    boolean create;


    /**
     * Creates an unconfigured storage. Index name and analyzer have to be set
     * before {@link #open()} is called.
     */
    public LuceneStorage() { }


    /**
     * Sets the analyzer that is passed to the index writer.
     *
     * @param a the analyzer to use
     */
    public void setAnalyzer(Analyzer a)
    {
        this.analyzer = a;
    }


    /**
     * Sets the name of the index that is passed to the index writer.
     *
     * @param name the index name
     */
    public void setIndexName(String name)
    {
        this.indexName = name;
    }


    /**
     * Overrides the flags used for one field. The override wins over the
     * default flags supplied by the caller of
     * {@link #addField(Document, String, String, int)}; a value of 0 keeps the
     * field out of the Lucene document altogether.
     *
     * @param fieldName name of the field
     * @param value     bitwise OR of {@link #INDEX}, {@link #STORE} and
     *                  {@link #TOKEN}, or 0 to suppress the field
     */
    public void setFieldInfo(String fieldName, int value)
    {
        fieldInfos.put(fieldName, new Integer(value));
    }


    /**
     * Sets the "create" flag that is passed to the index writer.
     *
     * @param create the value handed to the <code>IndexWriter</code> constructor
     */
    public void setCreate(boolean create)
    {
        this.create = create;
    }


    /**
     * Opens the index writer with the configured index name, analyzer and
     * create flag. An <code>IOException</code> is reported on
     * <code>System.err</code> and not rethrown; in that case the storage stays
     * closed and {@link #store(WebDocument)} will refuse to work.
     */
    public void open()
    {
        System.out.println("opening Lucene storage with index name " + indexName + ")");
        try
        {
            writer = new IndexWriter(indexName, analyzer, create);
        }
        catch(IOException e)
        {
            System.err.println("IOException occured when opening Lucene Index with index name '" + indexName + "'");
            e.printStackTrace();
        }
        if(writer != null)
        {
            System.out.println("lucene storage opened successfully");
        }
    }


    /**
     * Looks up the flags configured for a field.
     *
     * @param fieldName name of the field
     * @param def       flags to return when nothing was configured through
     *                  {@link #setFieldInfo(String, int)}
     * @return the configured flags, or <code>def</code> if there are none
     */
    protected int getFieldInfo(String fieldName, int def)
    {
        Integer info = (Integer) fieldInfos.get(fieldName);
        if (info != null)
        {
            return info.intValue();
        }
        return def;
    }


    /**
     * Adds one field to a Lucene document, using the flags configured for the
     * field name or, if there are none, the given defaults. Nothing is added
     * when the value is <code>null</code> or the effective flags are 0.
     *
     * @param doc               the Lucene document to extend
     * @param name              the field name
     * @param value             the field value; <code>null</code> is skipped
     * @param defaultIndexFlags flags to use when the field has no configured
     *                          override
     */
    protected void addField(Document doc, String name, String value, int defaultIndexFlags)
    {
        if (value == null)
        {
            // A missing value (e.g. an unknown MIME type) cannot be indexed.
            // NOTE(review): Field is expected to reject null values - confirm
            // against the Lucene version in use.
            return;
        }
        int flags = getFieldInfo(name, defaultIndexFlags);
        if (flags != 0)
        {
            doc.add(new Field(name, value, (flags & STORE) != 0, (flags & INDEX) != 0, (flags & TOKEN) != 0));
        }
    }


    /**
     * Converts a web document into a Lucene document and adds it to the index.
     * An <code>IOException</code> raised by the index writer is reported on
     * <code>System.err</code> and not rethrown.
     *
     * @param webDoc the document to index
     * @return the unchanged <code>webDoc</code>
     * @throws IllegalStateException if {@link #open()} has not been called or
     *                               did not succeed
     */
    public WebDocument store(WebDocument webDoc)
    {
        if (writer == null)
        {
            // open() swallows its IOException, so without this check the
            // failure would only surface as a bare NullPointerException below.
            throw new IllegalStateException("LuceneStorage.store() called, but index '" + indexName + "' has not been opened successfully");
        }

        Document doc = new Document();

        addField(doc, "url", webDoc.getUrl().toExternalForm(), STORE | INDEX);
        addField(doc, "mimetype", webDoc.getMimeType(), STORE | INDEX);
        // addField(doc, "...", webDoc.getNormalizedURLString(), STORE | INDEX); and so forth
        // todo: other fields

        Set fields = webDoc.getFieldNames();
        for (Iterator it = fields.iterator(); it.hasNext(); )
        {
            String fieldName = (String) it.next();
            Object field = webDoc.getField(fieldName);

            if (field instanceof char[])
            {
                addField(doc, fieldName, new String((char[]) field), STORE | INDEX);
            }
            else if (field instanceof String)
            {
                addField(doc, fieldName, (String) field, STORE | INDEX);
            }
            // fields of any other type are currently not indexed
        }

        try
        {
            writer.addDocument(doc);
        }
        catch(IOException e)
        {
            System.err.println("IOException occured when adding document to Lucene index");
            e.printStackTrace();
        }
        return webDoc;
    }
}




--
To unsubscribe, e-mail: <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>