走进lucene - 创建索引、检索

来源:转载


看过王老师的信息检索导论,一直想学lucene,看看从实现的角度,搜索引擎是如何工作的。

正好工作中用到一点lucene,尝试使用,为后续分析做好准备。

截止到现在,lucene已经更新到4.8了,适配Java8,并做了很多封装。

自己会用3.6来进行学习。因为阅读lucene源码还是4版本以下的更为合适,而且一些简单查询,3.6也足够用了。

1 构建索引

public boolean indexBuild(String indexPath, String inputFile) { boolean suc = true; try { Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer); boolean create = true; if (create) { // Create a new index in the directory, // removing any previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); FileInputStream fis = new FileInputStream(inputFile); // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. 
searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: BufferedReader reader = new BufferedReader(new InputStreamReader( fis, "UTF-8")); String line = null; while ((line = reader.readLine()) != null) { int beginPos = line.indexOf("/t"); String hid = line.substring(0, beginPos); String tags = line.substring(beginPos+1); Field hidField = new Field("hid", hid, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); hidField.setIndexOptions(IndexOptions.DOCS_ONLY); doc.add(hidField); Field tagsField = new Field("tags", tags, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); tagsField.setIndexOptions(IndexOptions.DOCS_ONLY); doc.add(tagsField); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document // can be there): System.out.println("adding " + inputFile); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have // been indexed) so // we use updateDocument instead to replace the old one // matching the exact // path, if present: writer.updateDocument(new Term("path", inputFile), doc); } } writer.close(); } catch (IOException e) { e.printStackTrace(); } return suc; }


2 检索

最开始的时候使用QueryParser生成Query,发现不需要分析的字段,检索不出结果来。原因是QueryParser会先用Analyzer对查询串做分词等处理,而NOT_ANALYZED的字段是按原值整体索引的,两者往往对不上。改为由Term生成Query就OK了。

 private IndexReader reader = null; private Analyzer analyzer = null; private IndexSearcher searcher = null; public boolean init(String indexPath) { boolean suc = true; try { reader = IndexReader.open(FSDirectory.open(new File(indexPath))); searcher = new IndexSearcher(reader); analyzer = new StandardAnalyzer(Version.LUCENE_36); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return suc; } public String queryTags(String hid) { String result = null;// QueryParser parser = new QueryParser(Version.LUCENE_36, "hid", analyzer); try { Term term = new Term("hid", hid); Query query = new TermQuery(term); int hitsPerPage = 1; // Collect enough docs to show 5 pages TopDocs results = searcher.search(query, 5 * hitsPerPage); ScoreDoc[] hits = results.scoreDocs; int numTotalHits = results.totalHits; System.out.println(numTotalHits + " total matching documents"); String str; int start = 0; // end pos for return docs int end = Math.min(numTotalHits, hitsPerPage); boolean raw = false; for (int i = start; i < end; i++) { if (raw) { // output raw format String log = "doc=" + hits[i].doc + " score=" + hits[i].score; System.out.println(log); continue; } Document doc = searcher.doc(hits[i].doc); String tags = doc.get("tags"); if (tags != null) { //just for work result = tags; break; //System.out.println((i + 1) + ". " + tags); } else { String log = (i + 1) + " No tags for " + hid; System.out.println(log); } } } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return result; } public void close() { try { searcher.close(); reader.close(); } catch (IOException e) { e.printStackTrace(); } }





分享给朋友:
您可能感兴趣的文章:
随机阅读: