Lucene Synonym Search

Source: reposted


1. Custom TokenFilter

The filter keeps a small in-memory synonym map. Whenever the current term has synonyms, it captures the token state, pushes the synonyms onto a stack, and then emits them one at a time with a position increment of 0, so each synonym occupies the same position as the original term.

package com.lkt.analyzer;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Synonym filter: emits the synonyms of the current term at the same position.
 * @author lkt
 */
public class MyMmsegFilter extends TokenFilter {

    // Synonym dictionary: word -> its synonyms
    private Map<String, String[]> sameMap = new HashMap<String, String[]>();
    // Synonyms of the current term that still have to be emitted
    private Stack<String> sameStack;
    // Captured state of the original token, restored for every synonym
    private AttributeSource.State currState;
    private CharTermAttribute cta;
    private PositionIncrementAttribute pia;

    protected MyMmsegFilter(TokenStream input) {
        super(input);
        sameMap.put("中国", new String[]{"兲朝", "大陸", "China"});
        sameMap.put("北京", new String[]{"首都", "燕京", "Beijing"});
        sameMap.put("南京", new String[]{"六朝古都", "建業", "Nanjing"});
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        sameStack = new Stack<String>();
    }

    @Override
    public boolean incrementToken() throws IOException {
        // First drain any pending synonyms of the previous token
        while (sameStack.size() > 0) {
            String str = sameStack.pop();
            // Restore the saved state of the original token
            restoreState(currState);
            cta.setEmpty();
            cta.append(str);
            // Increment 0: the synonym occupies the same position as the original term
            pia.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        if (getSameWord(cta.toString())) {
            // Capture the current state; restoreState() brings it back for each synonym
            currState = captureState();
        }
        return true;
    }

    private boolean getSameWord(String word) {
        String[] sm = sameMap.get(word);
        if (sm != null && sm.length > 0) {
            for (String s : sm) {
                sameStack.push(s);
            }
            return true;
        }
        return false;
    }
}
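The effect of the filter is easiest to verify by dumping the token stream. The original article uses an AnalyzerUtil.displayAnalyzer helper that is not shown here; the class below is a minimal sketch of such a helper (the TokenDumper name and its display method are assumptions), using the Lucene 3.5 attribute API to print each term together with its position increment. Synonyms emitted by MyMmsegFilter should appear with an increment of 0.

package com.lkt.util;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// Hypothetical helper, not the article's original AnalyzerUtil: dumps each term
// and its position increment so synonym tokens (increment 0) are easy to spot.
public class TokenDumper {

    public static void display(String text, Analyzer analyzer) throws IOException {
        TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
        CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute pia = ts.addAttribute(PositionIncrementAttribute.class);
        while (ts.incrementToken()) {
            // Synonyms injected by MyMmsegFilter print with posIncr=0
            System.out.println(cta.toString() + " (posIncr=" + pia.getPositionIncrement() + ")");
        }
        ts.end();
        ts.close();
    }
}

For example, TokenDumper.display("中国", new MyMmsegAnalyzer()) should print 中国 followed by 兲朝, 大陸 and China, all at the same position.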
2. Custom Analyzer

The analyzer chains the mmseg4j tokenizer (MaxWordSeg with a local dictionary) with the synonym filter defined above.

package com.lkt.analyzer;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

public class MyMmsegAnalyzer extends Analyzer {

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // Load the mmseg4j dictionary and wrap the MMSeg tokenizer with the synonym filter
        Dictionary dic = Dictionary.getInstance("F://学习资料//Lucene//mmseg4j-1.8.5//data");
        return new MyMmsegFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}
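The dictionary path above is machine specific. If you only want to exercise the synonym filter itself, one possible variant (an assumption, not part of the original article) is to wrap a plain WhitespaceTokenizer instead of the mmseg4j tokenizer; the trade-off is that the input text must already be segmented with spaces, e.g. "中国 北京 南京".

package com.lkt.analyzer;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical variant (not in the original article): no mmseg4j dictionary required,
// but the input text must already be separated by spaces.
public class MySynonymWhitespaceAnalyzer extends Analyzer {

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new MyMmsegFilter(new WhitespaceTokenizer(Version.LUCENE_35, reader));
    }
}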


3. Test

The test indexes a sample sentence into a RAMDirectory with the custom analyzer and then searches for the synonym "首都". Because "北京" was expanded to its synonyms at index time, the TermQuery matches the document.

package com.lkt.lucene;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import com.lkt.analyzer.MyMmsegAnalyzer;

public class TestAnalyzerUtil {

    @Test
    public void testDisplayAnalyzer() {
        String str = "北京上海南京江苏南京北京中国重庆天津";
        try {
            // Index the sample text with the synonym-aware analyzer
            Directory dir = new RAMDirectory();
            IndexWriter writer = new IndexWriter(dir,
                    new IndexWriterConfig(Version.LUCENE_35, new MyMmsegAnalyzer()));
            Document doc = new Document();
            doc.add(new Field("content", str, Store.YES, Index.ANALYZED));
            writer.addDocument(doc);
            writer.close();

            // Searching for the synonym "首都" hits the document that contains "北京"
            IndexSearcher searcher = new IndexSearcher(IndexReader.open(dir));
            TermQuery tq = new TermQuery(new Term("content", "首都"));
            TopDocs td = searcher.search(tq, 10);
            for (ScoreDoc sd : td.scoreDocs) {
                Document dd = searcher.doc(sd.doc);
                System.out.println(dd.get("content"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
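Because the synonyms are injected at index time, a plain TermQuery for "首都" is enough to find the document. If query-time expansion is also wanted, the same analyzer can be handed to a QueryParser; the class below is only a sketch against the Lucene 3.5 API (the SynonymQueryDemo name and its search method are assumptions, and dir is the Directory built in the test above).

package com.lkt.lucene;

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import com.lkt.analyzer.MyMmsegAnalyzer;

// Hypothetical helper: query-time synonym expansion against an index built as in the test above.
public class SynonymQueryDemo {

    public static void search(Directory dir, String userQuery) throws IOException, ParseException {
        // Parsing with the same analyzer expands the query term as well, e.g. "中国"
        // becomes 中国 / 兲朝 / 大陸 / China at the same position (an OR query in Lucene 3.5).
        QueryParser parser = new QueryParser(Version.LUCENE_35, "content", new MyMmsegAnalyzer());
        Query query = parser.parse(userQuery);
        IndexSearcher searcher = new IndexSearcher(IndexReader.open(dir));
        TopDocs hits = searcher.search(query, 10);
        for (ScoreDoc sd : hits.scoreDocs) {
            System.out.println(searcher.doc(sd.doc).get("content"));
        }
        searcher.close();
    }
}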




