[转载]lucene-3.5.0 IKAnalyzer3.2.5Stable_bin 中文分词建立索引和检索 – 裴东辉

[转载]lucene-3.5.0 IKAnalyzer3.2.5Stable_bin 中文分词建立索引和检索 – 裴东辉 – 博客园.

基本环境：

ext_stopword.dic和IKAnalyzer.cfg.xml放到classpath下面

引入jar包：IKAnalyzer3.2.5Stable.jar lucene-core-3.5.0.jar

基本程序：

1、中文分词建立索引

package testlucene.index;

import java.io.File;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.wltea.analyzer.lucene.IKAnalyzer;

import testlucene.util.FileUtil;

//import org.apache.lucene.search.IndexSearcher;

public class TxtFileIndexer {

public static void main(String args[]) throws Exception {

//索引位置

File indexDir = new File(“E:/eclipse_research/fetchnews/ILucene-3.5.0/src/lucenesource/index”);

//数据位置

File dataDir = new File(“E:/eclipse_research/fetchnews/ILucene-3.5.0/src/lucenesource/data”);

//使用IKAnalyzer中文分词工具

IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35,new IKAnalyzer());

FSDirectory directory = FSDirectory.open(indexDir);

IndexWriter indexWriter = new IndexWriter(directory, iwc);

//接着程序遍历了目录下面的所有文本文档，并为每一个文本文档创建了一个 Document 对象。

//然后把文本文档的两个属性：路径和内容加入到了两个 Field 对象中，接着在把这两个 Field 对象加入到 Document 对象中

//最后把这个文档用 IndexWriter 类的 add 方法加入到索引中去。

//这样我们便完成了索引的创建。接下来我们进入在建立好的索引上进行搜索的部分。

File[] dataFiles = dataDir.listFiles();

String name=””,path=””,content=””;

for(File file:dataFiles){

if(file.isFile() && file.getName().endsWith(“.txt”)){

System.out.println(“Indexing file ” + file.getCanonicalPath());

/*Step 1. Prepare the data for indexing. Extract the data. */

name =file.getName();

path=file.getCanonicalPath();

content=FileUtil.parsefiletostring(file);

/*Step 2. Wrap the data in the Fields and add them to a Document */

Document doc = new Document();doc.add(new Field(“name”,name,Field.Store.YES,Field.Index.NOT_ANALYZED));doc.add(new Field(“path”,path,Field.Store.YES,Field.Index.NOT_ANALYZED));doc.add(new Field(“content”,content,Field.Store.NO,Field.Index.ANALYZED));

/*Step 3: Add this document to Lucene Index.*/

indexWriter.addDocument(doc);

}

indexWriter.close();

}

2、中文分词搜索

package testlucene.search;

import java.io.File;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.FSDirectory;

import org.wltea.analyzer.lucene.IKQueryParser;

import org.wltea.analyzer.lucene.IKSimilarity;

@SuppressWarnings(“resource”)

public class TxtFileSearcher {

public static void main(String[] args) {

try{

//index索引位置生成Reader流

File indexDir = new File(“E:/eclipse_research/fetchnews/ILucene-3.5.0/src/lucenesource/index”);

FSDirectory directory = FSDirectory.open(indexDir);

IndexReader reader = IndexReader.open(directory);

//创建Searcher

IndexSearcher indexSearcher = new IndexSearcher(reader);

indexSearcher.setSimilarity(new IKSimilarity());

//查询关键词

String keyWords = “是一个开放源代码的全文检索引擎工具包 “;

//IKAnalyzer中文分词生成查询

Query query = IKQueryParser.parse(“content”, keyWords);

TopDocs topDocs = indexSearcher.search(query, Integer.MAX_VALUE);

System.out.println(topDocs.totalHits);

//对获取到的文档进行解析输出

ScoreDoc[] scoreDosArray = topDocs.scoreDocs;

for(ScoreDoc scoredoc: scoreDosArray){

Document doc = indexSearcher.doc(scoredoc.doc);

System.out.println(“name: “+doc.getFieldable(“name”).stringValue());

System.out.println(“path: “+doc.getFieldable(“path”).stringValue());

}

}catch(Exception e){

e.printStackTrace();

}

3、工具类

package testlucene.util;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileReader;

import java.io.IOException;

/**
 * File helpers used by the indexing example.
 */
public class FileUtil {

    /**
     * Reads a text file and returns its lines concatenated into one string.
     *
     * <p>Line terminators are dropped and nothing is inserted in their place,
     * so the last word of one line is joined directly to the first word of
     * the next. The file is decoded by {@link FileReader}, i.e. with the
     * platform default charset. The result is also echoed to standard output.
     *
     * <p>This method is best-effort: any exception while opening or reading
     * is printed to standard error, and whatever was read so far (possibly
     * the empty string) is returned.
     *
     * @param file the file to read
     * @return the concatenated lines of the file; never {@code null}
     */
    public static String parsefiletostring(File file) {
        // StringBuilder avoids the quadratic copying of String += in a loop.
        StringBuilder result = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            String line;
            while ((line = br.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            // Deliberately broad: callers rely on getting a string back even
            // when the file is missing, unreadable, or null.
            e.printStackTrace();
        } finally {
            if (null != br) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        String strresult = result.toString();
        System.out.println(strresult);
        return strresult;
    }
}

[转载]lucene-3.5.0 IKAnalyzer3.2.5Stable_bin 中文分词建立索引和检索 - 裴东辉 - 博客园

相关推荐

热门标签

分类

链接表

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏