Features: creating an index, searching it, and highlighting the query results. Word segmentation uses the Paoding (庖丁解牛) analyzer.
Before you start, download the Lucene core jar, the Lucene Highlighter jar, the Paoding analyzer jar, and the Paoding dictionary, and set the PAODING_DIC_HOME environment variable to the dictionary location.
The first two can be found on the official Lucene site; Paoding can be downloaded from http://code.google.com/p/paoding/downloads/list.
Lucene + Paoding integration, method 1:
1. Copy paoding-analysis.jar into the project's WEB-INF/lib directory.
2. Set the environment variable PAODING_DIC_HOME: variable name PAODING_DIC_HOME, value E:\paoding\dic.
3. Copy the paoding-dic-home.properties file from the E:\paoding\src directory into the project's src directory and add two lines:
paoding.dic.home.config-fisrt=this
paoding.dic.home=E:/paoding/dic
Lucene + Paoding integration, method 2:
Edit E:\paoding\src\paoding-dic-home.properties and add one line:
paoding.dic.home=classpath:dic
Then run ant to rebuild the Paoding jar and copy it into lib.
The first approach makes it easier to update the dictionary; the second makes the project easier to move between machines. This example uses the second approach; a quick classpath check is sketched below.
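To confirm the rebuilt jar actually bundles the dictionary, a minimal check like the following can help. (DicOnClasspathCheck is a hypothetical class name; the lookup relies on the jar recording a directory entry for dic, which ant-built jars normally do.)
Java code:
package demo;

import java.net.URL;

// Hypothetical sanity check: after rebuilding the Paoding jar with
// paoding.dic.home=classpath:dic, the "dic" directory should be
// visible to the classloader.
public class DicOnClasspathCheck {
    public static void main(String[] args) {
        URL dicUrl = DicOnClasspathCheck.class.getClassLoader().getResource("dic");
        System.out.println(dicUrl != null
                ? "dic found at " + dicUrl
                : "dic is NOT on the classpath");
    }
}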
For the Paoding environment settings, see net/paoding/analysis/Constants.java.
Watch the version pairing of the Lucene core and Highlighter jars. I first tried lucene-core-2.3.2.jar with Highlighter 2.4 and got errors at runtime, clearly a version mismatch. The combination used here is Lucene 2.3.2 + Highlighter 2.2.0.
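Before wiring everything together, a one-class tokenization smoke test will confirm that the dictionary resolves and the jars load together. (AnalyzerSmokeTest and the sample sentence are my own; the loop uses the Lucene 2.3-era TokenStream.next() API.)
Java code:
package demo;

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Minimal smoke test for the Paoding setup: tokenize one sentence
// and print the resulting terms.
public class AnalyzerSmokeTest {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new PaodingAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", new StringReader("中华人民共和国"));
        Token token;
        while ((token = ts.next()) != null) {
            System.out.println(token.termText());
        }
    }
}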
Main code implementation:
CreateIndex: builds the index files.
Java code:
package demo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class CreateIndex {

    public void createIndex() throws Exception {
        File sourceFileDir = new File("D:/save/source");
        File indexFileDir = new File("D:/save");
        // Analyzer luceneAnalyzer = new StandardAnalyzer();
        Analyzer luceneAnalyzer = new PaodingAnalyzer(); // use the Paoding analyzer
        // The third argument true creates a new index; pass false to append to an existing one
        IndexWriter indexWriter = new IndexWriter(indexFileDir, luceneAnalyzer, true);
        File[] sourceTextFiles = sourceFileDir.listFiles();
        long startTime = new Date().getTime();
        // add one Document per source file to the index
        for (int i = 0; i < sourceTextFiles.length; i++) {
            if (sourceTextFiles[i].isFile()
                    && sourceTextFiles[i].getName().endsWith(".txt")) {
                System.out.println("File " + sourceTextFiles[i].getCanonicalPath()
                        + " is being indexed....");
                String temp = fileReaderAll(sourceTextFiles[i].getCanonicalPath(), "GBK");
                System.out.println(temp);
                Document document = new Document();
                Field fieldPath = new Field("path", sourceTextFiles[i].getPath(),
                        Field.Store.YES, Field.Index.NO);
                Field fieldBody = new Field("body", temp, Field.Store.YES,
                        Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
                Field fieldTitle = new Field("title", temp, Field.Store.YES,
                        Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
                document.add(fieldPath);
                document.add(fieldBody);
                document.add(fieldTitle);
                indexWriter.addDocument(document);
            }
        }
        // optimize() merges index segments for faster searching
        indexWriter.optimize();
        indexWriter.close();
        // report how long indexing took
        long endTime = new Date().getTime();
        System.out.println("It took " + (endTime - startTime)
                + " ms to add the documents to the index at " + indexFileDir.getPath());
    }

    // read an entire file into a String using the given charset
    public static String fileReaderAll(String fileName, String charset)
            throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(fileName), charset));
        String line;
        StringBuilder temp = new StringBuilder();
        while ((line = reader.readLine()) != null) {
            temp.append(line);
        }
        reader.close();
        return temp.toString();
    }

    public static void main(String[] args) {
        try {
            new CreateIndex().createIndex();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
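Note the third IndexWriter argument above: true recreates the index from scratch on every run. For incremental updates, pass false once the index exists; a minimal sketch (AppendToIndex is a hypothetical class name):
Java code:
package demo;

import java.io.File;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.index.IndexWriter;

// Sketch of the incremental variant: with create=false the writer appends
// to the existing index at D:/save instead of wiping it.
public class AppendToIndex {
    public static void main(String[] args) throws Exception {
        IndexWriter writer = new IndexWriter(new File("D:/save"),
                new PaodingAnalyzer(), false);
        // ... addDocument() calls for the new files would go here ...
        writer.optimize();
        writer.close();
    }
}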
QueryHighLighter: searches for keywords and highlights the hits.
Java code:
package demo;

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class QueryHighLighter {

    private static final String FIELD_TITLE = "title";
    private static final String FIELD_BODY = "body";

    public synchronized Analyzer getAnalyzer() {
        return new PaodingAnalyzer(); // Paoding analyzer; an ICTCLAS-based analyzer is an alternative
    }

    public String test(String queryString, int begin, int number) {
        StringBuffer sb = new StringBuffer();
        IndexSearcher isearcher = null;
        try {
            isearcher = new IndexSearcher("D:/save");
            TopDocCollector collector = new TopDocCollector(10);
            QueryParser queryParse = new QueryParser(FIELD_TITLE, getAnalyzer());
            Query query = queryParse.parse(queryString);
            isearcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            // The highlighter wraps matches in <b>..</b> by default;
            // here we specify <b><font color='red'>..</font></b> instead
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(
                    "<b><font color='red'>", "</font></b>");
            // Build the highlighter from the formatter and a scorer for this query
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter,
                    new QueryScorer(query));
            // The fragment size should roughly match the amount of text you want back.
            // Too small and only the beginning of the text is analyzed and highlighted,
            // so less data comes back; too large is wasteful.
            highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
            for (int i = begin; i < hits.length && i < begin + number; i++) {
                Document doc = isearcher.doc(hits[i].doc);
                String value = doc.get(FIELD_TITLE);
                String value2 = doc.get(FIELD_BODY); // body text; can be highlighted the same way
                // getBestFragment can also take (analyzer, fieldName, text):
                // System.out.println(highlighter.getBestFragment(getAnalyzer(),
                //         FIELD_TITLE, doc.get(FIELD_TITLE)));
                if (value != null) {
                    TokenStream tokenStream = getAnalyzer().tokenStream(FIELD_TITLE,
                            new StringReader(value));
                    String str = highlighter.getBestFragment(tokenStream, value);
                    sb.append("<li>").append(str).append("</li>");
                    System.out.println(str);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (isearcher != null) {
                try {
                    isearcher.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return sb.toString();
    }

    public static void main(String[] args) {
        QueryHighLighter t = new QueryHighLighter();
        String queryString = "中华人民共和国";
        int begin = 0;
        int number = 10;
        t.test(queryString, begin, number);
    }
}
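As the comments in test() note, SimpleFragmenter(Integer.MAX_VALUE) analyzes and returns the whole field. For search-result snippets, a smaller fragment size plus getBestFragments is the usual choice. Inside the for loop above, the body field could be handled like this (the fragment size, fragment count, and separator are my own picks):
Java code:
// Snippet-style highlighting for the body field: ~100-char fragments,
// best 3 joined with "...".
highlighter.setTextFragmenter(new SimpleFragmenter(100));
if (value2 != null) {
    TokenStream bodyStream = getAnalyzer().tokenStream(FIELD_BODY,
            new StringReader(value2));
    String snippet = highlighter.getBestFragments(bodyStream, value2, 3, "...");
    sb.append(snippet);
}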