lucene给文本索引和搜索功能的应用

来源:转载


最近一段时间由于公司需要模糊搜索出相似的关键词,所以直接考虑使用了lucene。

lucene允许你往程序中添加搜索功能,lucene能够把你从文本中解析出来的数据进行索引和搜索 ,lucene不关心数据来源 甚至不关心语种,不过你需要把它转换成文本格式。也就是说你可以搜索 html网页,文本文档,word文档 ,pdf,或者其他一些 总之 只要能够提取出文本信息的即可。同样你也可以利用lucene来索引存储在数据库中的数据,以给你的用户提供一些  比如 全文搜索功能等 ,反正lucene的功能很是强大。里面还有很多开源的对不同语言进行分析的插件等。


下面我介绍一个例子 ,这里我进行对 一个txt文档的 每一行进行了 索引的添加 ,也就是说  把每一行 当作一个document对象来处理,实际上在lucene中 每一个document 相当于我们在数据库中的库名, 而每个field相当于我们的表名 ,它能够对文本进行自动处理去掉里面的一些语气词,它能把你规定的域当作关键词来进行索引 以备查询时使用,lucene比较容易使用 ,但是不如数据库灵活,速度很快。下面 我用一个例子来说明(这里我用的lucene4.7.2,最高版本 ,你需要注意把需要的一些jar包引入的到你的工程中,使用maven可直接引入依赖http://mvnrepository.com/artifact/org.apache.lucene需要的全部引入)我这里写了一个实例 你可以进行参考学习使用方法。

package lucene.home.clq;

/**
 * @author chenlongquan
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Index creation
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Builds a Lucene index from plain-text files.
 *
 * <p>Every non-trivial (length &gt;= 2) line of each accepted *.txt file under
 * the data directory becomes ONE {@link Document} with a single stored
 * "contents" field, so searches match individual lines rather than whole files.
 */
public class Indexer {

    public static void main(String[] args) throws Exception {
        String indexDir = "f://index"; // 1: directory the index is written to
        String dataDir = "f://baidu";  // 2: directory holding the *.txt source files

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
            indexer.close(); // always release the writer, even when indexing fails
        }
        long end = System.currentTimeMillis();

        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    private IndexWriter writer;

    /**
     * Opens (creating if necessary) the index at {@code indexDir}.
     *
     * @param indexDir filesystem path of the index directory
     * @throws IOException if the directory cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));
        writer = new IndexWriter(dir, indexWriterConfig());
    }

    /** Commits and closes the underlying {@link IndexWriter}. */
    public void close() throws IOException {
        writer.close(); // 4
    }

    private IndexWriterConfig indexWriterConfig() {
        // SmartChineseAnalyzer performs proper word segmentation for Chinese text.
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_47);
        return new IndexWriterConfig(Version.LUCENE_47, analyzer);
    }

    /**
     * Indexes every readable, non-hidden regular file in {@code dataDir}
     * accepted by {@code filter} (a null filter accepts everything).
     *
     * @return the number of documents in the index afterwards
     */
    public int index(String dataDir, FileFilter filter) throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // dataDir does not exist or is not a directory; nothing to add.
            return writer.numDocs();
        }
        for (File f : files) {
            if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead()
                    && (filter == null || filter.accept(f))) {
                indexFile(f);
            }
        }
        return writer.numDocs(); // 5
    }

    /** Accepts only files whose name ends with ".txt" (case-insensitive). */
    private static class TextFilesFilter implements FileFilter {
        public boolean accept(File path) {
            return path.getName().toLowerCase().endsWith(".txt"); // 6
        }
    }

    /**
     * Reads the file line by line (duplicates removed) and adds each line
     * to the index as its own document with a stored "contents" field.
     *
     * @param f the text file to index
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        List<String> lines = readFileNoDup(f);
        for (String line : lines) {
            Document doc = new Document();
            doc.add(new Field("contents", line, TextField.TYPE_STORED));
            writer.addDocument(doc);
        }
    }

    /**
     * Reads a UTF-8 text file, returning its lines in order; lines shorter
     * than 2 characters are skipped. (Currently unused; kept as a helper.)
     */
    private List<String> readFile(File filePathAndName) throws IOException {
        List<String> lines = new ArrayList<String>();
        // try-with-resources guarantees the readers are closed even on error
        try (LineNumberReader lnr = new LineNumberReader(new BufferedReader(
                new InputStreamReader(new FileInputStream(filePathAndName), "UTF-8")))) {
            String line;
            while ((line = lnr.readLine()) != null) {
                if (line.length() < 2) {
                    continue; // skip empty / one-character noise lines
                }
                lines.add(line);
            }
        }
        return lines;
    }

    /**
     * Reads a UTF-8 text file and returns its distinct trimmed lines
     * (order not preserved); lines shorter than 2 characters are skipped.
     */
    public static List<String> readFileNoDup(File filePathAndName) throws IOException {
        Set<String> unique = new HashSet<String>();
        try (LineNumberReader lnr = new LineNumberReader(new BufferedReader(
                new InputStreamReader(new FileInputStream(filePathAndName), "UTF-8")))) {
            String line;
            while ((line = lnr.readLine()) != null) {
                if (line.length() < 2) {
                    continue; // skip empty / one-character noise lines
                }
                unique.add(line.trim());
            }
        }
        return new ArrayList<String>(unique);
    }
}


//对刚才已经建好的索引进行搜索

 
package lucene.home.clq;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Searches the line-per-document index built by {@code Indexer}: parses the
 * query with the same SmartChineseAnalyzer and prints/returns the stored
 * "contents" field of each hit.
 */
public class Searcher {

    public static void main(String[] args)
            throws IllegalArgumentException, IOException, ParseException {
        final String indexDir = "e://soso//soso"; // must match the directory Indexer wrote
        String q = " "; // the query string; fill in the term you indexed
        // FIX: original line read "docs = query(indexDir, q)" — no declaration,
        // no semicolon — and did not compile.
        List<String> docs = query(indexDir, q);
        if (docs != null) {
            for (String doc : docs) {
                System.out.println(doc);
            }
        }
    }

    /**
     * Runs {@code q} against the index and prints each matching document's
     * stored "contents" field together with simple timing information.
     *
     * @param indexDir filesystem path of the index
     * @param q        raw query string (parsed against the "contents" field)
     */
    public static void search(String indexDir, String q)
            throws IOException, ParseException {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));
        try {
            IndexSearcher is = new IndexSearcher(reader); // 3
            QueryParser parser = new QueryParser(Version.LUCENE_47, "contents",
                    new SmartChineseAnalyzer(Version.LUCENE_47));
            Query query = parser.parse(q); // 4

            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 500); // 5: top 500 hits by score
            long end = System.currentTimeMillis();

            System.err.println("Found " + hits.totalHits // 6
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + q + "':");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc); // 7
                System.out.println(doc.get("contents"));
            }
        } finally {
            reader.close(); // release index files even if parsing/search fails
        }
    }

    /**
     * Runs {@code searcher} against the index and returns the stored
     * "contents" of up to 100 hits, or {@code null} for a null/blank query.
     */
    private static List<String> query(String indexDir, String searcher)
            throws IOException, ParseException {
        // FIX: original tested "searcher.length() == -1", which can never be
        // true (String.length() is non-negative); reject null/blank input.
        if (searcher == null) {
            return null;
        }
        searcher = searcher.trim();
        if (searcher.isEmpty()) {
            return null;
        }

        IndexReader reader = DirectoryReader.open(
                FSDirectory.open(new File(indexDir))); // open the index
        try {
            IndexSearcher is = new IndexSearcher(reader); // find the content
            QueryParser parser = new QueryParser(Version.LUCENE_47, "contents",
                    new SmartChineseAnalyzer(Version.LUCENE_47)); // parse the content
            Query query = parser.parse(searcher);

            // Top 100 hits; SortField.Type.SCORE keeps relevance ordering.
            // (A duplicate unsorted search of 200 hits was removed — its
            // result was never used.)
            TopFieldDocs hits = is.search(query, 100,
                    new Sort(new SortField("contents", SortField.Type.SCORE, false)));

            List<String> list = new ArrayList<String>();
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                list.add(doc.get("contents"));
            }
            return list;
        } finally {
            reader.close(); // release index files even if parsing/search fails
        }
    }
}


//这里我主要给文档中的文本添加了索引。你也可以在Field中对路径等其他一些属性添加索引,具体可以查阅lucene API,

使用里面的一些方法。我这里说得比较粗,有问题欢迎讨论。

 




分享给朋友:
您可能感兴趣的文章:
随机阅读: