[转载]lucene-3.5.0 + IKAnalyzer3.2.5Stable_bin 中文分词建立索引和检索 – 裴东辉 – 博客园
基本环境:
ext_stopword.dic和IKAnalyzer.cfg.xml放到classpath下面
引入jar包:IKAnalyzer3.2.5Stable.jar lucene-core-3.5.0.jar
基本程序:
1、中文分词建立索引
package testlucene.index;
import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import testlucene.util.FileUtil;
//import org.apache.lucene.search.IndexSearcher;
public class TxtFileIndexer {
public static void main(String args[]) throws Exception {
//索引位置
File indexDir = new File(“E:/eclipse_research/fetchnews/ILucene-3.5.0/src/lucenesource/index”);
//数据位置
File dataDir = new File(“E:/eclipse_research/fetchnews/ILucene-3.5.0/src/lucenesource/data”);
//使用IKAnalyzer中文分词工具
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35,new IKAnalyzer());
FSDirectory directory = FSDirectory.open(indexDir);
IndexWriter indexWriter = new IndexWriter(directory, iwc);
//接着程序遍历了目录下面的所有文本文档,并为每一个文本文档创建了一个 Document 对象。
//然后把文本文档的两个属性:路径和内容加入到了两个 Field 对象中,接着在把这两个 Field 对象加入到 Document 对象中
//最后把这个文档用 IndexWriter 类的 add 方法加入到索引中去。
//这样我们便完成了索引的创建。接下来我们进入在建立好的索引上进行搜索的部分。
File[] dataFiles = dataDir.listFiles();
String name=””,path=””,content=””;
for(File file:dataFiles){
if(file.isFile() && file.getName().endsWith(“.txt”)){
System.out.println(“Indexing file ” + file.getCanonicalPath());
/*Step 1. Prepare the data for indexing. Extract the data. */
name =file.getName();
path=file.getCanonicalPath();
content=FileUtil.parsefiletostring(file);
/*Step 2. Wrap the data in the Fields and add them to a Document */
Document doc = new Document();doc.add(new Field(“name”,name,Field.Store.YES,Field.Index.NOT_ANALYZED));doc.add(new Field(“path”,path,Field.Store.YES,Field.Index.NOT_ANALYZED));doc.add(new Field(“content”,content,Field.Store.NO,Field.Index.ANALYZED));
/*Step 3: Add this document to Lucene Index.*/
indexWriter.addDocument(doc);
}
}
indexWriter.close();
}
}
2、中文分词 搜索
package testlucene.search;
import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;
@SuppressWarnings(“resource”)
public class TxtFileSearcher {
public static void main(String[] args) {
try{
//index索引位置生成Reader流
File indexDir = new File(“E:/eclipse_research/fetchnews/ILucene-3.5.0/src/lucenesource/index”);
FSDirectory directory = FSDirectory.open(indexDir);
IndexReader reader = IndexReader.open(directory);
//创建Searcher
IndexSearcher indexSearcher = new IndexSearcher(reader);
indexSearcher.setSimilarity(new IKSimilarity());
//查询关键词
String keyWords = “是一个开放源代码的全文检索引擎工具包 “;
//IKAnalyzer中文分词生成查询
Query query = IKQueryParser.parse(“content”, keyWords);
TopDocs topDocs = indexSearcher.search(query, Integer.MAX_VALUE);
System.out.println(topDocs.totalHits);
//对获取到的文档进行解析输出
ScoreDoc[] scoreDosArray = topDocs.scoreDocs;
for(ScoreDoc scoredoc: scoreDosArray){
Document doc = indexSearcher.doc(scoredoc.doc);
System.out.println(“name: “+doc.getFieldable(“name”).stringValue());
System.out.println(“path: “+doc.getFieldable(“path”).stringValue());
}
}catch(Exception e){
e.printStackTrace();
}
}
}
3、工具类
package testlucene.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
/** File helper used by the indexer to load text content. */
public class FileUtil {

    /**
     * Reads a text file and returns its content as a single string.
     *
     * <p>The file is read line by line with {@link FileReader}, so it is decoded with the
     * platform default charset. Line terminators are dropped: consecutive lines are joined
     * with nothing between them. The resulting string is also echoed to standard output.
     *
     * <p>This method is best-effort: if the file cannot be opened or read, the stack trace is
     * printed and whatever was read so far (possibly the empty string) is returned.
     *
     * @param file the text file to read
     * @return the concatenated lines of the file; never {@code null}
     */
    public static String parsefiletostring(File file) {
        // StringBuilder avoids re-copying the whole accumulated text on every line,
        // which repeated String concatenation in a loop would do.
        StringBuilder content = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            String line;
            while ((line = br.readLine()) != null) {
                content.append(line);
            }
        } catch (Exception e) {
            // Deliberately broad: callers get a (possibly partial) string, never an exception.
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        String strresult = content.toString();
        System.out.println(strresult);
        return strresult;
    }
}