Lucene .NET 全文检索

来源:转载


近期做项目中有用到过Lucene,那个模块是由一位前端大神负责的,空闲时间我也做了个关于Lucene做全文检索的Demo,记录下来,方便以后学习。
关于Lucene的原理,网上有长篇大论的文章,有兴趣的话可以去阅读,在此我就直奔主题,在代码中分析其原理。

1、创建索引(此处我用的是盘古分词)

注:在后台代码文件的第一行加上 #define notes 这样一行代码,目的是让下文代码中 #if !notes … #endif 包住的那段说明文字不参与编译(定义了 notes 之后,#if !notes 的条件为假),这样就可以把大段笔记直接放在代码里。

 #region Create index: void CreateIndex(object sender, EventArgs e)
 /// <summary>
 /// Builds the Lucene index (or refreshes it when one already exists) from every story
 /// returned by <c>bllHelper.GetAllStory()</c>, tokenizing with the PanGu analyzer.
 /// </summary>
 /// <param name="sender">Event source; not used.</param>
 /// <param name="e">Event data; not used.</param>
 private void CreateIndex(object sender, EventArgs e)
 {
     // indexPath is a member defined outside this block and must already point at the
     // index folder (the original author assigned it in a CreateDirectory() helper).
     FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
     try
     {
         // True when the folder exists and already contains index files.
         bool isUpdate = IndexReader.IndexExists(directory);

         // Only one writer may hold the index at a time; opening an IndexWriter takes the lock.
         // A stale lock (e.g. left by a crashed indexing run) is cleared first.
         if (isUpdate && IndexWriter.IsLocked(directory))
         {
             IndexWriter.Unlock(directory);
         }

         // The IndexWriter(string path, ...) overload is obsolete; use the Directory overload.
         IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
         try
         {
             IEnumerable<Story> stories = bllHelper.GetAllStory();
             foreach (Story story in stories)
             {
                 string id = story.ID.ToString();

                 // Remove any previous version of this story so re-indexing does not duplicate it.
                 writer.DeleteDocuments(new Term("ID", id));

                 // One Document per story. Fields that take part in full-text search must be
                 // Field.Index.ANALYZED. Field rejects null values, so database NULLs are
                 // indexed as empty strings.
                 Document document = new Document();
                 document.Add(new Field("ID", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
                 document.Add(new Field("Title", story.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                 document.Add(new Field("Author", story.Author ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
                 document.Add(new Field("Content", story.Content ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                 document.Add(new Field("URL", story.URL ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
                 writer.AddDocument(document);
             }
         }
         finally
         {
             // Always release the write lock, even when indexing fails part-way.
             writer.Close();
         }
     }
     finally
     {
         directory.Close();
     }
 }
 #endregion

2.接下来就是搜索了

 #region Search: IEnumerable<Story> Search(string keyWord)
 /// <summary>
 /// Searches the index for stories whose Title or Content matches the words of
 /// <paramref name="keyWord"/> and returns them with the matches highlighted.
 /// </summary>
 /// <param name="keyWord">Raw search text typed by the user.</param>
 /// <returns>
 /// The matching stories (at most 1000). Title and Content hold the highlighted HTML
 /// produced by <c>CommonHelper.Highlight</c>. Empty when <paramref name="keyWord"/> is null or empty.
 /// </returns>
 private IEnumerable<Story> Search(string keyWord)
 {
     List<Story> list = new List<Story>();
     if (string.IsNullOrEmpty(keyWord))
     {
         return list;
     }

     // Segment the input once (e.g. "北京是首都" -> "北京", "是", "首都") and reuse it for both fields.
     string[] words = CommonHelper.SplitWords(keyWord);

     FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
     try
     {
         IndexReader reader = IndexReader.Open(directory, true);
         try
         {
             IndexSearcher searcher = new IndexSearcher(reader);
             try
             {
                 // Combine the per-field phrase queries into one query: a hit in either field is enough.
                 BooleanQuery query = new BooleanQuery();
                 query.Add(BuildPhraseQuery("Title", words), BooleanClause.Occur.SHOULD);
                 query.Add(BuildPhraseQuery("Content", words), BooleanClause.Occur.SHOULD);
 #if !notes
                 // How clause combinations behave:
                 // 1. MUST + MUST         : logical AND (intersection).
                 // 2. MUST + MUST_NOT     : matches the first and excludes the second.
                 // 3. MUST_NOT + MUST_NOT : meaningless.
                 // 4. SHOULD + MUST       : behaves as MUST; the SHOULD clause no longer restricts results.
                 // 5. SHOULD + MUST_NOT   : equivalent to MUST + MUST_NOT.
                 // 6. SHOULD + SHOULD     : logical OR.
 #endif
                 // Container for the hits.
                 TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                 searcher.Search(query, null, collector);
                 ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

                 foreach (ScoreDoc doc in docs)
                 {
                     // doc.doc is the id Lucene assigned internally, not the story's own ID field.
                     Document document = searcher.Doc(doc.doc);

                     Story story = new Story();
                     story.ID = Convert.ToInt32(document.Get("ID"));
                     story.Title = CommonHelper.Highlight(keyWord, document.Get("Title"));
                     story.Author = document.Get("Author");
                     story.Content = CommonHelper.Highlight(keyWord, document.Get("Content"));
                     story.URL = document.Get("URL");
                     list.Add(story);
                 }
             }
             finally
             {
                 searcher.Close();
             }
         }
         finally
         {
             // The searcher was constructed from an existing reader, so the reader is closed explicitly here.
             reader.Close();
         }
     }
     finally
     {
         directory.Close();
     }

     return list;
 }

 /// <summary>
 /// Builds a phrase query over <paramref name="field"/> containing every word in <paramref name="words"/>.
 /// </summary>
 private static PhraseQuery BuildPhraseQuery(string field, string[] words)
 {
     PhraseQuery phrase = new PhraseQuery();
     foreach (string word in words)
     {
         phrase.Add(new Term(field, word));
     }
     // Maximum distance allowed between the words; terms far apart are usually unrelated.
     phrase.SetSlop(100);
     return phrase;
 }
 #endregion

3.帮助类文件

3.1 BusinessHelper类

 #region Get a story by id: +Story GetStoryById(int id)
 /// <summary>
 /// Loads the story whose <c>Id</c> column equals <paramref name="id"/>.
 /// </summary>
 /// <param name="id">Value of the Id column to look up.</param>
 /// <returns>The matching story, or <c>null</c> when no row is returned.</returns>
 public Story GetStoryById(int id)
 {
     // "FROM Story nolock" only aliased the table as "nolock"; the hint needs WITH (NOLOCK).
     string sql = "SELECT * FROM Story WITH (NOLOCK) WHERE Id = @Id";
     using (SqlDataReader reader = SqlHelper.ExecuteDataReader(sql, new SqlParameter("@Id", id)))
     {
         return reader.Read() ? ToModel(reader) : null;
     }
 }
 #endregion

 #region Get all stories: +IEnumerable<Story> GetAllStory()
 /// <summary>
 /// Loads every row of the Story table.
 /// </summary>
 /// <returns>All stories; empty when the table has no rows.</returns>
 public IEnumerable<Story> GetAllStory()
 {
     var list = new List<Story>();
     // WITH (NOLOCK) is the real table-hint syntax; a bare "nolock" is parsed as a table alias.
     string sql = "SELECT * FROM Story WITH (NOLOCK)";
     using (SqlDataReader reader = SqlHelper.ExecuteDataReader(sql))
     {
         while (reader.Read())
         {
             list.Add(ToModel(reader));
         }
     }
     return list;
 }
 #endregion

 #region Map the current SqlDataReader row to an entity: Story ToModel(SqlDataReader reader)
 /// <summary>
 /// Converts the row the reader is currently positioned on into a <see cref="Story"/>.
 /// </summary>
 /// <param name="reader">Reader positioned on a row exposing Id, Title, Author, Content and URL columns.</param>
 /// <returns>A story populated from the row; string columns holding NULL become <c>null</c>.</returns>
 private Story ToModel(SqlDataReader reader)
 {
     Story story = new Story();
     story.ID = (int)ToModelValue(reader, "Id");
     story.Title = (string)ToModelValue(reader, "Title");
     story.Author = (string)ToModelValue(reader, "Author");
     story.Content = (string)ToModelValue(reader, "Content");
     story.URL = (string)ToModelValue(reader, "URL");
     return story;
 }
 #endregion

 /// <summary>
 /// Converts a model value to its database form: <c>null</c> becomes <see cref="DBNull.Value"/>.
 /// </summary>
 private object ToDBValue(object value)
 {
     return value ?? DBNull.Value;
 }

 /// <summary>
 /// Reads a column from the current row, translating a database NULL into <c>null</c>.
 /// </summary>
 /// <param name="reader">Reader positioned on a row.</param>
 /// <param name="columnName">Name of the column to read.</param>
 private object ToModelValue(SqlDataReader reader, string columnName)
 {
     // Resolve the ordinal once instead of looking the column up by name twice.
     int ordinal = reader.GetOrdinal(columnName);
     return reader.IsDBNull(ordinal) ? null : reader.GetValue(ordinal);
 }

3.2 CommonHelper类

 /// <summary>
 /// Splits the user-supplied string into individual words using the PanGu analyzer.
 /// </summary>
 /// <param name="s">Text to segment.</param>
 /// <returns>The words in the order the analyzer produced them; empty when <paramref name="s"/> is null or empty.</returns>
 public static string[] SplitWords(string s)
 {
     List<string> list = new List<string>();
     if (string.IsNullOrEmpty(s))
     {
         // StringReader rejects null, and an empty string yields no tokens anyway.
         return list.ToArray();
     }

     Analyzer analyzer = new PanGuAnalyzer();
     TokenStream tokenStream = analyzer.TokenStream("", new StringReader(s));
     try
     {
         Lucene.Net.Analysis.Token token;
         // Next() advances to the following token and returns null once the input is exhausted.
         while ((token = tokenStream.Next()) != null)
         {
             list.Add(token.TermText());
         }
     }
     finally
     {
         tokenStream.Close();
     }
     return list.ToArray();
 }

 /// <summary>
 /// Wraps occurrences of <paramref name="keyword"/> inside <paramref name="content"/> in red, bold HTML markup.
 /// </summary>
 /// <param name="keyword">Search text to highlight.</param>
 /// <param name="content">Text to highlight within.</param>
 /// <returns>
 /// The best highlighted fragment, or <paramref name="content"/> unchanged when the highlighter
 /// returns nothing or fails.
 /// </returns>
 public static string Highlight(string keyword, string content)
 {
     try
     {
         // Formatter arguments are the prefix and suffix placed around each highlighted word.
         // The quotes must be escaped with backslashes.
         PanGu.HighLight.SimpleHTMLFormatter formatter = new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\"><b>", "</b></font>");

         // The highlighter needs the formatter and a PanGu Segment instance.
         PanGu.HighLight.Highlighter highlighter = new PanGu.HighLight.Highlighter(formatter, new Segment());

         // Number of characters per summary fragment.
         highlighter.FragmentSize = 5000;

         // Pick the fragment that matches best.
         string result = highlighter.GetBestFragment(keyword, content);
         return string.IsNullOrEmpty(result) ? content : result;
     }
     catch
     {
         // Highlighting is best-effort: on any failure show the plain text rather than fail the search.
         return content;
     }
 }

3.3 SqlHelper 类

 /// <summary>
 /// Connection string read from the "connLuceneDB" entry of the configuration file.
 /// </summary>
 public static string CONNECTIONSTRING = ConfigurationManager.ConnectionStrings["connLuceneDB"].ConnectionString;

 #region Run a query and return a DataTable: +static DataTable ExecuteDataTable(string sql)
 /// <summary>
 /// Executes a query and returns its result set.
 /// </summary>
 /// <param name="sql">SQL statement to execute.</param>
 /// <returns>A <see cref="DataTable"/> filled with the query result.</returns>
 public static DataTable ExecuteDataTable(string sql)
 {
     using (SqlConnection conn = new SqlConnection(SqlHelper.CONNECTIONSTRING))
     {
         conn.Open();
         using (SqlCommand cmd = new SqlCommand(sql, conn))
         using (SqlDataAdapter da = new SqlDataAdapter(cmd))
         {
             DataTable dt = new DataTable();
             da.Fill(dt);
             return dt;
         }
     }
 }
 #endregion

 #region Run a query and return a reader: +static SqlDataReader ExecuteDataReader(string cmdText, params SqlParameter[] parameters)
 /// <summary>
 /// Executes a query and returns an open <see cref="SqlDataReader"/> over the result.
 /// </summary>
 /// <param name="cmdText">SQL statement to execute.</param>
 /// <param name="parameters">Parameters referenced by <paramref name="cmdText"/>; may be omitted.</param>
 /// <returns>
 /// A reader opened with <see cref="CommandBehavior.CloseConnection"/>: disposing the reader
 /// also closes the underlying connection, so the caller must dispose it.
 /// </returns>
 public static SqlDataReader ExecuteDataReader(string cmdText, params SqlParameter[] parameters)
 {
     SqlConnection conn = new SqlConnection(CONNECTIONSTRING);
     try
     {
         conn.Open();
         using (SqlCommand cmd = conn.CreateCommand())
         {
             cmd.CommandText = cmdText;
             if (parameters != null && parameters.Length > 0)
             {
                 cmd.Parameters.AddRange(parameters);
             }
             return cmd.ExecuteReader(CommandBehavior.CloseConnection);
         }
     }
     catch
     {
         // No reader was handed out, so nobody else can close the connection.
         conn.Dispose();
         throw;
     }
 }
 #endregion

 #region Run an insert/update/delete: +static bool ExecuteNonQuery(string sql)
 /// <summary>
 /// Executes an insert, update or delete statement.
 /// </summary>
 /// <param name="sql">SQL statement to execute.</param>
 /// <returns><c>true</c> when at least one row was affected; otherwise <c>false</c>.</returns>
 public static bool ExecuteNonQuery(string sql)
 {
     using (SqlConnection conn = new SqlConnection(SqlHelper.CONNECTIONSTRING))
     {
         conn.Open();
         using (SqlCommand cmd = new SqlCommand(sql, conn))
         {
             return cmd.ExecuteNonQuery() > 0;
         }
     }
 }
 #endregion

4.小说实体类

 /// <summary>
 /// Entity describing one story (novel) as stored in the database and in the search index.
 /// </summary>
 public class Story
 {
     /// <summary>Numeric identifier of the story.</summary>
     public int ID { get; set; }

     /// <summary>Title of the story.</summary>
     public string Title { get; set; }

     /// <summary>Author of the story.</summary>
     public string Author { get; set; }

     /// <summary>Text content of the story.</summary>
     public string Content { get; set; }

     /// <summary>Address at which the story can be read online.</summary>
     public string URL { get; set; }
 }

5.前台

<%--
    Search page: a keyword box, a search button, a button that rebuilds the index,
    and a grid listing the matching stories (ID, title, author, content, link).
    Every Label inside the grid has its own ID: all template fields of a row are
    instantiated in the same row container, so a repeated ID would clash there.
--%>
<form id="form1" runat="server" method="post">
    <asp:TextBox ID="txtKW" runat="server" Width="291px"></asp:TextBox>
    <asp:Button ID="btnSearch" runat="server" Text="搜索" onclick="btnSearch_Click" />
    <asp:Button ID="btnCreateIndex" runat="server" Text="创建索引" onclick="btnCreateIndex_Click"/>
    <asp:GridView ID="gdvShowStory" runat="server" AutoGenerateColumns="False" CellPadding="4" ForeColor="#333333" GridLines="None">
        <AlternatingRowStyle BackColor="White" ForeColor="#284775" />
        <Columns>
            <asp:TemplateField HeaderStyle-Width="3%">
                <HeaderTemplate> 编号 </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label1" runat="server" Text='<%# Eval("ID") %>'></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="10%">
                <HeaderTemplate> 标题 </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label2" Text='<%# Eval("Title") %>' runat="server"></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="8%">
                <HeaderTemplate> 作者 </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label3" Text='<%# Eval("Author") %>' runat="server"></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="70%">
                <HeaderTemplate> 内容 </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label4" Text='<%# Eval("Content") %>' runat="server"></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="5%">
                <HeaderTemplate> 操作 </HeaderTemplate>
                <ItemTemplate>
                    <a href='<%#Eval("URL") %>'>在线阅读</a>
                </ItemTemplate>
            </asp:TemplateField>
        </Columns>
        <EditRowStyle BackColor="#999999" />
        <FooterStyle BackColor="#5D7B9D" Font-Bold="True" ForeColor="White" />
        <HeaderStyle BackColor="#5D7B9D" Font-Bold="True" ForeColor="White" />
        <PagerStyle BackColor="#284775" ForeColor="White" HorizontalAlign="Center" />
        <RowStyle BackColor="#F7F6F3" ForeColor="#333333" />
        <SelectedRowStyle BackColor="#E2DED6" Font-Bold="True" ForeColor="#333333" />
        <SortedAscendingCellStyle BackColor="#E9E7E2" />
        <SortedAscendingHeaderStyle BackColor="#506C8C" />
        <SortedDescendingCellStyle BackColor="#FFFDF8" />
        <SortedDescendingHeaderStyle BackColor="#6F8DAE" />
    </asp:GridView>
</form>

注:需要引入几个类库(从上面的代码可以看出,用到了 Lucene.Net、PanGu、PanGu.HighLight 以及提供 PanGuAnalyzer 的 PanGu.Lucene.Analyzer)

 

 

 

 

 

OK,到此为止,一个简单的Demo出来了,看看效果吧:

 

 

 

 

 

 

 

 

 

 

 



分享给朋友:
您可能感兴趣的文章:
随机阅读: