Lucene from Beginner to Proficient (3): Tokenization

Source: reposted


Tokenization

Tokenization (word segmentation) is a technique for filtering and grouping text algorithmically according to its linguistic features. It operates on text, not on images, animations, scripts, and so on. Tokenization works in two steps: filtering and grouping. Filtering removes characters or words that carry no real meaning (for example, English stop words such as "a", "the" and "of" are typically dropped), while grouping matches the remaining text against the words already registered in the tokenization dictionary.
Analyzers provided by Lucene:

// Analyzer analyzer = new StandardAnalyzer();
// Analyzer analyzer = new SimpleAnalyzer();
// Analyzer analyzer = new WhitespaceAnalyzer();
// Analyzer analyzer = new ChineseAnalyzer();
// Analyzer analyzer = new CJKAnalyzer();   // groups Chinese characters two at a time
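To get a feel for the differences, here is a rough sketch of how these analyzers typically break up the Chinese sentence 明天会更美好 (the exact output can vary between Lucene versions):

StandardAnalyzer / ChineseAnalyzer : 明 | 天 | 会 | 更 | 美 | 好        (one token per Chinese character)
CJKAnalyzer                        : 明天 | 天会 | 会更 | 更美 | 美好    (overlapping two-character tokens)
SimpleAnalyzer / WhitespaceAnalyzer: 明天会更美好                        (no whitespace or letter boundary, so the whole run stays one token)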


package com.lucene.test.T03;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TestAnalyzer {

    public static void main(String[] args) throws IOException {
        // Swap in any of the analyzers listed above to compare their output:
        // Analyzer analyzer = new StandardAnalyzer();
        // Analyzer analyzer = new SimpleAnalyzer();
        // Analyzer analyzer = new WhitespaceAnalyzer();
        // Analyzer analyzer = new ChineseAnalyzer();
        // Analyzer analyzer = new CJKAnalyzer();
        Analyzer analyzer = new IKAnalyzer();

        TokenStream tokenStream = analyzer.tokenStream("",
                new StringReader("welcome to use lucene! ?"));
        // TokenStream tokenStream = analyzer.tokenStream("",
        //         new StringReader("明天会更美好!"));

        // Print every token produced by the analyzer
        Token token = new Token();
        while ((token = tokenStream.next(token)) != null) {
            System.out.println(token.term());
        }
    }
}
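For the English input "welcome to use lucene! ?", most of these analyzers should emit the tokens welcome / to / use / lucene (WhitespaceAnalyzer keeps the punctuation attached, giving lucene! and ?). IKAnalyzer (the org.wltea import) is a third-party open-source Chinese analyzer rather than part of Lucene itself; its jar and dictionary files have to be on the classpath for this example to compile and run.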


Paoding, an open-source Chinese analyzer
package com.lucene.test.T03;

import java.io.IOException;
import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class AnalyzerPaoding {

    private static Logger logger = LoggerFactory.getLogger(AnalyzerPaoding.class);

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new PaodingAnalyzer();
        TokenStream ts = analyzer.tokenStream("",
                new StringReader("法律实践奥利弗论文集饿哦土建类士大夫接待来访将阿隆索"));

        // Print every token produced by the Paoding analyzer
        Token token;
        while ((token = ts.next()) != null) {
            logger.debug("read result from token");
            System.out.println(token.term());
        }
    }
}
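Note: Paoding has to be told where its dictionary files live before PaodingAnalyzer is constructed. In the 2.x releases this is normally done via the PAODING_DIC_HOME environment variable or a paoding-dic-home.properties file on the classpath; if neither is configured, the analyzer fails at startup.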

Using QueryParser with Paoding
package com.lucene.test.T03;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestIndexPaoding {

    public static void main(String[] args) throws IOException {
        String[] ids = { "1", "2", "3", "4" };
        String[] names = { "张三", "李四", "王五", "赵六" };
        // String[] names = { "zhangsan", "zhangsun", "zhangson", "zhaoliu" };
        String[] address = { "居住北京", "南京", "北京海淀", "dalian" };
        String[] birthday = { "19880101", "19860105", "19760205", "19550719" };

        Analyzer analyzer = new PaodingAnalyzer();
        String indexDir = "c:/temp/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);

        // true: create (or overwrite) the index; false: append to the existing index
        IndexWriter writer = new IndexWriter(dir, analyzer, true,
                IndexWriter.MaxFieldLength.LIMITED);

        for (int i = 0; i < ids.length; i++) {
            Document document = new Document();
            document.add(new Field("id", ids[i], Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("name", names[i], Field.Store.YES, Field.Index.ANALYZED));
            // Field.Index.NO would skip indexing a field altogether
            document.add(new Field("address", address[i], Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("birthday", birthday[i], Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(document);
        }

        writer.optimize();
        writer.close();
        System.out.println("index created ....");
    }
}
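Two details about this writer setup, as I understand the Lucene 2.x API: IndexWriter.MaxFieldLength.LIMITED caps each field at 10,000 terms, so very long field values are silently truncated unless MaxFieldLength.UNLIMITED is used instead; and optimize() merges all index segments into one, which makes later searches faster at the price of a slower indexing run.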

package com.lucene.test.T03;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestSearcherPaoding {

    public static void main(String[] args) throws IOException, ParseException {
        String indexDir = "c:/temp/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexSearcher searcher = new IndexSearcher(dir);
        Analyzer analyzer = new PaodingAnalyzer();

        // Advanced query: address contains 北京, but name is not 张三
        QueryParser parser = new QueryParser("name", analyzer);
        Query query = parser.parse("address:北京 AND NOT name:张三");

        TopDocs topDocs = searcher.search(query, 10);
        ScoreDoc[] hits = topDocs.scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.print(hits[i].score + " ");
            System.out.print(doc.get("id") + " ");
            System.out.print(doc.get("name") + " ");
            System.out.print(doc.get("address") + " ");
            System.out.println(doc.get("birthday") + " ");
        }

        searcher.close();
        dir.close();
    }
}
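The string handed to parser.parse() uses Lucene's query syntax: a field: prefix overrides the default field passed to the QueryParser constructor ("name" here), and the boolean operators AND, OR and NOT must be upper case. So "address:北京 AND NOT name:张三" matches documents whose address field contains 北京 and whose name field does not contain 张三, while a bare "张三" would be searched against the default name field.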


Building an index from files on disk and searching it:
package com.lucene.test.T04;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TestFileIndex {

    private static Logger logger = LoggerFactory.getLogger(TestFileIndex.class);

    public static void main(String[] args) throws FileNotFoundException, IOException {
        String indexDir = "c:/temp/lucene/index";
        Analyzer analyzer = new PaodingAnalyzer();
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexWriter writer = new IndexWriter(dir, analyzer, true,
                IndexWriter.MaxFieldLength.LIMITED);

        // Read every file under dataDir and index its name and content
        String dataDir = "c:/temp/lucene/data";
        File[] files = new File(dataDir).listFiles();
        System.out.println("file numbers:" + files.length);

        for (int i = 0; i < files.length; i++) {
            // Read the file content line by line
            StringBuffer strBuff = new StringBuffer();
            FileInputStream is = new FileInputStream(files[i].getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(is));
            String line = br.readLine();
            while (line != null) {
                strBuff.append(line);
                strBuff.append("\n");
                line = br.readLine();
            }

            // Create the document: file name as title, file text as content
            Document document = new Document();
            document.add(new Field("title", files[i].getName(),
                    Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("content", strBuff.toString(),
                    Field.Store.YES, Field.Index.ANALYZED));

            // Write the document to the index
            writer.addDocument(document);
            br.close();
            is.close();
        }

        writer.close();
        dir.close();
    }
}
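A note on the field design: both title and content are indexed and stored (Field.Store.YES). That is convenient here because the search program below prints the content straight out of the index, but for large files it is usually better to store only the title (or a file path) and index the content with Field.Store.NO, so the index stays small.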

package com.lucene.test.T04;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestFileSearch {

    public static void main(String[] args) throws IOException, ParseException {
        String indexDir = "c:/temp/lucene/index";
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexSearcher searcher = new IndexSearcher(dir);
        Analyzer analyzer = new PaodingAnalyzer();

        // Search the content field for the word 软件
        QueryParser parser = new QueryParser("content", analyzer);
        Query query = parser.parse("软件");

        TopDocs topDocs = searcher.search(query, 10);
        ScoreDoc[] hits = topDocs.scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            // System.out.print(hits[i].score + " ");
            System.out.print(doc.get("title") + " ");
            System.out.print(doc.get("content") + " ");
        }

        searcher.close();
        dir.close();
    }
}

Highlighting

package com.lucene.test.T05;

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class TestHighlight {

    public static void main(String[] args) throws Exception {
        Searcher searcher = new IndexSearcher("c:/temp/lucene/index");
        Analyzer analyzer = new PaodingAnalyzer();
        String field = "content";
        String queryStr = "分词";

        QueryParser parser = new QueryParser(field, analyzer);
        Query query = parser.parse(queryStr);
        TopDocCollector collector = new TopDocCollector(10);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        // Highlighter setup: wrap matched terms in a red <font> tag and
        // cut the text into fragments of roughly 200 characters
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(
                "<font color='red'>", "</font>");
        Highlighter highlight = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
        highlight.setTextFragmenter(new SimpleFragmenter(200));

        Document doc;
        for (int i = 0; i < hits.length; i++) {
            System.out.println(hits[i].doc);
            System.out.println("---------------------------------------1");
            System.out.println(hits[i].score);
            System.out.println("---------------------------------------2");
            doc = searcher.doc(hits[i].doc);
            // System.out.println(doc.toString());
            System.out.println("---------------------------------------3");

            // Highlighted view: re-tokenize the stored content and print the best fragment
            TokenStream tokenStream = new PaodingAnalyzer().tokenStream(
                    "content", new StringReader(doc.get("content")));
            System.out.println(highlight.getBestFragment(tokenStream, doc.get("content")));
        }
    }
}
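Putting the highlighter pieces together: SimpleHTMLFormatter wraps each matched term in the given prefix and suffix (a red <font> tag here), QueryScorer rates candidate fragments against the parsed query, and SimpleFragmenter(200) cuts the stored text into fragments of roughly 200 characters. getBestFragment() re-tokenizes the stored content and returns the highest-scoring fragment, or null when none of the query terms occur in that text, so the returned snippet can be written into an HTML page as-is.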



