千度索引最新千度搜索引擎正常

2024-07-15 15:37| 来源: 网络整理| 查看: 265

1、新建Web项目

新建一个Web项目，我命名为SearchEngine，然后导入所需的 Jar 包：

除了上篇博客中的Jar包外，我还引入了 IKAnalyzer2012_FF.jar 包和struts2的相关包：

IKAnalyzer：用来进行中文分词的一个 jar 包，它会把中文文本切分成一个个合理的词语，以便进行检索；

Struts2：稍后用它把搜索结果展示到浏览器中；

2、准备数据源

我使用 Linux 的 wget 命令抓取了某个网站内的一部分 html 网页，同样将它们放在一个纯英文路径的目录中：

千度索引最新千度搜索引擎正常_lucene

3、创建索引

新建一个类CreateIndex:

千度索引最新千度搜索引擎正常_千度索引最新_02

import java.io.File; import java.io.IOException; import java.util.Collection; import org.apache.commons.io.FileUtils; import org.apache.commons.io.filefilter.TrueFileFilter; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test; import org.wltea.analyzer.lucene.IKAnalyzer; import com.HtmlBeanUtil; import com.model.HtmlBean; public class CreateIndex { public static final String DATA_DIR="E:/data/engine/www.bjsxt.com"; public static final String INDEX_DIR="E:/data/engine/index"; public void createIndex() throws IOException{ FSDirectory dir = FSDirectory.open(new File(INDEX_DIR)); // 使用中文分词的jar包进行分词 IKAnalyzer analyzer = new IKAnalyzer(true); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, analyzer); config.setOpenMode(OpenMode.CREATE_OR_APPEND); IndexWriter writer = new IndexWriter(dir, config); File file = new File(DATA_DIR); Collection files = FileUtils.listFiles(file, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE); for(File f : files){ // 将原数据源内的内容通过抓取，返回一个实体类方便存储 HtmlBean hb = HtmlBeanUtil.parseHtml(f); if(hb!=null && hb.getTitle()!=null && !hb.getTitle().trim().equals("")){ Document doc = new Document(); // 存储三个内容，标题，内容，url （实际上内容可能会更多比如关键字，描述等） doc.add(new TextField("title",hb.getTitle(), Store.YES)); doc.add(new TextField("content",hb.getContent(), Store.YES)); doc.add(new TextField("url",hb.getUrl(), Store.YES)); writer.addDocument(doc); } } writer.close(); } }

千度索引最新千度搜索引擎正常_千度索引最新_02

实体HtmlBean和HtmlBeanUtil:

千度索引最新千度搜索引擎正常_千度索引最新_02

/**
 * Simple mutable holder for the parts of an HTML page that get indexed and
 * displayed: its title, its text content and its URL.
 */
public class HtmlBean {

    /** Page title. */
    private String title;

    /** Text content of the page. */
    private String content;

    /** Address of the page. */
    private String url;

    /** @return the page title, or {@code null} if none has been set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the page title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the page content, or {@code null} if none has been set */
    public String getContent() {
        return this.content;
    }

    /** @param content the page content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the page URL, or {@code null} if none has been set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page URL to store */
    public void setUrl(String url) {
        this.url = url;
    }
}

千度索引最新千度搜索引擎正常_千度索引最新_02

import java.io.File;
import java.io.IOException;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;

import com.model.HtmlBean;

/**
 * Converts a crawled HTML file on disk into an {@link HtmlBean}.
 */
public class HtmlBeanUtil {

    /**
     * Number of leading characters of the absolute file path that are dropped
     * when rebuilding the page URL.
     * NOTE(review): 15 matches the length of {@code E:\data\engine\}, i.e. the
     * parent of CreateIndex.DATA_DIR - confirm and keep the two in sync.
     */
    private static final int LOCAL_PREFIX_LENGTH = 15;

    /**
     * Parses the given HTML file and extracts its title, text content and URL.
     *
     * @param file the HTML file to parse
     * @return the populated bean, or {@code null} if the file has no
     *         {@code <title>} element or cannot be read
     */
    public static HtmlBean parseHtml(File file) {
        try {
            Source source = new Source(file);
            Element title = source.getFirstElement(HTMLElementName.TITLE);
            // Check the title first so pages that will be discarded do not pay
            // for text extraction or bean allocation.
            if (title == null || title.getTextExtractor() == null) {
                return null;
            }

            HtmlBean hb = new HtmlBean();
            hb.setTitle(title.getTextExtractor().toString());
            hb.setContent(source.getTextExtractor().toString());
            // Store the normalized URL; previously the normalized value was
            // computed but the un-normalized one (with '\') was stored.
            hb.setUrl(toUrl(file.getAbsolutePath()));
            return hb;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Rebuilds the page URL from the local path: strips the local directory
     * prefix, prepends {@code http://} and turns backslashes into slashes.
     */
    private static String toUrl(String absolutePath) {
        String url = "http://" + absolutePath.substring(LOCAL_PREFIX_LENGTH);
        return url.replace("\\", "/");
    }
}

千度索引最新千度搜索引擎正常_千度索引最新_02

使用单元测试跑一下创建索引的方法，最后会得到这么几个索引数据库文件：

千度索引最新千度搜索引擎正常_apache_08

4、创建检索类SearchIndex:

千度索引最新千度搜索引擎正常_千度索引最新_02

import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; import com.model.HtmlBean; import com.model.Page; public class SearchIndex { public Page search(String keyWord,int pageNum,int pageSize) throws IOException, ParseException, InvalidTokenOffsetsException{ Directory dir = FSDirectory.open(new File(CreateIndex.INDEX_DIR)); IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); // 使用中文分词器把用户输入的内容进行分词 Analyzer analyzer = new IKAnalyzer(true); QueryParser parser = new QueryParser(Version.LUCENE_4_9, "title", analyzer); Query query = parser.parse(keyWord); //format 用来制定要高亮显示的词的样式 SimpleHTMLFormatter format = new SimpleHTMLFormatter("",""); Highlighter high = new Highlighter(format ,new QueryScorer(query)); // pageNum*pageSize 控制显示的最大条数 TopScoreDocCollector results = TopScoreDocCollector.create(pageNum*pageSize, false); searcher.search(query, results); // 检索出来想要的结果的条数，可以实现分页 TopDocs topDocs = results.topDocs((pageNum-1)*pageSize, pageNum*pageSize); Page page = 
new Page(); page.setPageNum(pageNum); page.setPageSize(pageSize); page.setTotalCount(topDocs.totalHits); ScoreDoc[] docs = topDocs.scoreDocs; List list = new ArrayList(); for(ScoreDoc scoreDoc : docs){ Document document = reader.document(scoreDoc.doc); String title = document.get("title"); String content = document.get("content"); String url = document.get("url"); //获取到检索的结果以后，可以使用Highlighter获取高亮效果 title = high.getBestFragment(analyzer, "title", title); content = high.getBestFragment(analyzer, "content", content); HtmlBean hb = new HtmlBean(); hb.setTitle(title); hb.setContent(content); hb.setUrl(url); list.add(hb); } // 计算记录的总页数 if(page.getTotalCount()

【本文地址】

公司简介

联系我们

千度索引最新 千度搜索引擎正常

千度索引最新千度搜索引擎正常