Lucene利用Hanlp分词索引和检索

包引入

  • pom引入lucene和hanlp
  • lucene版本为7.4
  • hanlp-lucene-plugin（HanLP分词插件）版本为1.1.6

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    <dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>7.4.0</version>
    </dependency>

    <dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>7.4.0</version>
    </dependency>

    <dependency>
    <groupId>com.hankcs.nlp</groupId>
    <artifactId>hanlp-lucene-plugin</artifactId>
    <version>1.1.6</version>
    </dependency>

    对数据文件进行索引

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    /**
     * Builds a Lucene index over SugRecord entries using the HanLP analyzer.
     *
     * <p>Fields indexed per record: "id" (stored, returned at search time),
     * "question" (queryNorm text, indexed only) and "query" (raw query text,
     * indexed only).
     */
    public class DocIndex {

    // Index directory; after index() runs this holds the absolute path,
    // which DocSearcher reads via DocIndex.indexPath.
    public static String indexPath = "/index";

    /**
     * (Re)builds the index under {@code path + "/index"}.
     *
     * @param path    base directory for the index files
     * @param map     records to index, keyed by id
     * @param reIndex when false, only updates indexPath and skips indexing
     */
    public static void index(String path, Map<String, SugRecord> map, boolean reIndex) {

    // BUG FIX: the original did `indexPath = path + indexPath`, which
    // re-prefixed the path on every call (second call -> path + path + "/index").
    indexPath = path + "/index";

    if (!reIndex) {
    return;
    }

    try {
    // Directory holding the index data
    Directory dir = FSDirectory.open(Paths.get(indexPath));
    // Same analyzer must be used at search time so tokens line up
    Analyzer analyzer = new HanLPAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    // CREATE: rebuild the index from scratch each time
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    // try-with-resources closes the writer exactly once.
    // (The original closed it both here and inside indexDoc, which
    // throws AlreadyClosedException on the second close.)
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
    indexDoc(writer, map);
    writer.commit();
    }
    } catch (IOException e) {
    e.printStackTrace();
    }
    }

    /**
     * Adds one Lucene Document per record. Committing/closing the writer is
     * the caller's responsibility.
     */
    private static void indexDoc(IndexWriter writer, Map<String, SugRecord> map) {
    map.forEach((k, v) -> {
    // One fresh document per record
    Document doc = new Document();
    // "id" is stored so search results can surface it; the text fields
    // are analyzed/indexed but not stored
    doc.add(new TextField("id", v.getId(), Field.Store.YES));
    doc.add(new TextField("question", v.getQueryNorm(), Field.Store.NO));
    doc.add(new TextField("query", v.getQuery(), Field.Store.NO));
    try {
    writer.addDocument(doc);
    } catch (IOException e) {
    e.printStackTrace();
    }
    });
    }
    }

对索引进行检索

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/**
 * Searches the index built by {@link DocIndex} with the HanLP analyzer.
 */
@Slf4j
public class DocSearcher {

/**
 * Runs {@code content} against the "question" and "query" fields
 * (OR semantics) and returns up to 10 scored hits.
 *
 * @param content user query text
 * @return matching terms, empty on any failure (never null)
 */
public static List<SugTerm> search(String content) {
// Multi-field OR search: SHOULD + SHOULD means a match in either field qualifies
String[] fields = {"question", "query"};
BooleanClause.Occur[] clauses = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};

// try-with-resources guarantees the reader is closed
// (the original never closed it, and NPE'd here if open() had failed)
try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(DocIndex.indexPath)))) {
IndexSearcher searcher = new IndexSearcher(reader);
// Must use the same HanLP analyzer as indexing so tokens line up
Query query = MultiFieldQueryParser.parse(content, fields, clauses, new HanLPAnalyzer());
return doPagingSearch(searcher, query);
} catch (Exception e) {
e.printStackTrace();
// Empty list instead of null so callers need no null check
return Collections.emptyList();
}
}

/**
 * Collects the top-10 hits into SugTerm results (id + score).
 * Return type fixed to List&lt;SugTerm&gt; to match search()'s declaration;
 * the original returned List&lt;Term&gt;, which did not compile.
 */
private static List<SugTerm> doPagingSearch(IndexSearcher searcher, Query query) throws IOException {
List<SugTerm> termList = new ArrayList<>();
// First page only: top 10 by score
TopDocs results = searcher.search(query, 10);
for (ScoreDoc hit : results.scoreDocs) {
Document document = searcher.doc(hit.doc);
String id = Objects.requireNonNull(document).get("id");
// NOTE(review): assumes SugTerm exposes public id/score fields like the
// original Term did — confirm against the SugTerm declaration
SugTerm term = new SugTerm();
term.id = id;
term.score = hit.score;
// BUG FIX: original called term.add(sugTerm) on an undefined variable
// and never populated termList, so results were always lost
termList.add(term);
}
log.info(termList.toString());
return termList;
}
}
------ 本文结束------

本文标题:Lucene利用Hanlp分词索引和检索

文章作者:Perkins

发布时间:2019年08月19日

原始链接:https://perkins4j2.github.io/posts/49532/

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。