Mailing List Archive

term vector implmentation
I haven't been able to find any code examples of implementing term vectors.
"Lucene in Action" has a nice intro to the topic, but skips some of the
details and doesn't address how cos theta is computed in an n-dimensional
situation (which I realize may be simple to those familiar with the math -
sorry for my ignorance).



Does anyone have a complete working example of the use of term vectors they
could share? Also, does anyone have benchmarks for doing these computations
on a large (millions of records) database? I am wondering how intractable
this will be to do in real-time.



James
Re: term vector implmentation [ In reply to ]
Hi,

I have same problem. could you help me Regarding TermVectorPosition in
Lucene. I am Searching a word as Query from IndexDir and display the number
of documents found that holds the Query.
kindlly guide me how to display the position of word in found document using
TermVectorPostion.

My source Code files are given below
1- Indexer.java class is used to index the raw data so that we can make it
searchable using lucene library.
2- LuceneConstants.java class is used to provide various constants to be
used across the sample application.
3- Searcher.java class is used to search the indexes created by Indexer to
search the requested contents.
4- TextFileFilter.java class is used as a .txt file filter.
5- LuceneTester.java lass is used to test the indexing and search capability
of lucene library.


LuceneConstants.java class is used to provide various constants to be used
across the sample application.

public class LuceneConstants {
public static final String CONTENTS="contents";
public static final String FILE_NAME="filename";
public static final String FILE_PATH="filepath";
public static final int MAX_SEARCH = 10;
}

TextFileFilter.java class is used as a .txt file filter.

import java.io.File;
import java.io.FileFilter;

public class TextFileFilter implements FileFilter {

@Override
public boolean accept(File pathname) {
return pathname.getName().toLowerCase().endsWith(".txt");
}
}

Indexer.java class is used to index the raw data so that we can make it
searchable using lucene library.
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Indexer {

private IndexWriter writer;

public Indexer(String indexDirectoryPath) throws IOException{
//this directory will contain the indexes
Directory indexDirectory =
FSDirectory.open(new File(indexDirectoryPath));

//create the indexer
writer = new IndexWriter(indexDirectory,
new StandardAnalyzer(Version.LUCENE_36),true,
IndexWriter.MaxFieldLength.UNLIMITED);
}

public void close() throws CorruptIndexException, IOException{
writer.close();
}

private Document getDocument(File file) throws IOException{
Document document = new Document();

//index file contents
Field contentField = new Field(LuceneConstants.CONTENTS,
new FileReader(file));
//index file name
Field fileNameField = new Field(LuceneConstants.FILE_NAME,
file.getName(),
Field.Store.YES,Field.Index.NOT_ANALYZED);
//index file path
Field filePathField = new Field(LuceneConstants.FILE_PATH,
file.getCanonicalPath(),
Field.Store.YES,Field.Index.NOT_ANALYZED);

document.add(contentField);
document.add(fileNameField);
document.add(filePathField);

return document;
}

private void indexFile(File file) throws IOException{
System.out.println("Indexing "+file.getCanonicalPath());
Document document = getDocument(file);
writer.addDocument(document);
}

public int createIndex(String dataDirPath, FileFilter filter)
throws IOException{
//get all files in the data directory
File[] files = new File(dataDirPath).listFiles();

for (File file : files) {
if(!file.isDirectory()
&& !file.isHidden()
&& file.exists()
&& file.canRead()
&& filter.accept(file)
){
indexFile(file);
}
}
return writer.numDocs();
}
}

Searcher.java class is used to search the indexes created by Indexer to
search the requested contents.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Searcher {

IndexSearcher indexSearcher;
QueryParser queryParser;
Query query;

public Searcher(String indexDirectoryPath)
throws IOException{
Directory indexDirectory =
FSDirectory.open(new File(indexDirectoryPath));
indexSearcher = new IndexSearcher(indexDirectory);
queryParser = new QueryParser(Version.LUCENE_36,
LuceneConstants.CONTENTS,
new StandardAnalyzer(Version.LUCENE_36));
}

public TopDocs search( String searchQuery)
throws IOException, ParseException{
query = queryParser.parse(searchQuery);
return indexSearcher.search(query, LuceneConstants.MAX_SEARCH);
}

public Document getDocument(ScoreDoc scoreDoc)
throws CorruptIndexException, IOException{
return indexSearcher.doc(scoreDoc.doc);
}

public void close() throws IOException{
indexSearcher.close();
}
}


LuceneTester.java class is used to test the indexing and search capability
of lucene library.
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

public class LuceneTester {

String indexDir = "E:\\Lucene\\Index";
String dataDir = "E:\\Lucene\\Data";
Indexer indexer;
Searcher searcher;

public static void main(String[] args) {
LuceneTester tester;
try {
tester = new LuceneTester();
tester.createIndex();
tester.search("Mohan");
} catch (IOException e) {
e.printStackTrace();
} catch (ParseException e) {
e.printStackTrace();
}
}

private void createIndex() throws IOException{
indexer = new Indexer(indexDir);
int numIndexed;
long startTime = System.currentTimeMillis();
numIndexed = indexer.createIndex(dataDir, new TextFileFilter());
long endTime = System.currentTimeMillis();
indexer.close();
System.out.println(numIndexed+" File indexed, time taken: "
+(endTime-startTime)+" ms");
}

private void search(String searchQuery) throws IOException,
ParseException{
searcher = new Searcher(indexDir);
long startTime = System.currentTimeMillis();
TopDocs hits = searcher.search(searchQuery);
long endTime = System.currentTimeMillis();

System.out.println(hits.totalHits +
" documents found. Time :" + (endTime - startTime));
for(ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = searcher.getDocument(scoreDoc);
System.out.println("File: "
+ doc.get(LuceneConstants.FILE_PATH));
}
searcher.close();
}
}


BestRegards




--
View this message in context: http://lucene.472066.n3.nabble.com/term-vector-implmentation-tp642127p4242493.html
Sent from the Lucene - General mailing list archive at Nabble.com.