ElasticSearch 提供的分词是英语分词,对中文分词很不友好。为了更好地展示搜索结果,我们决定使用中文的分词器。至于为什么用 IK,那是因为好多人都在用,我也跟风,教程又多,又简单~
版本要求
Ik | ES |
---|---|
master | 7.x -> master |
6.x | 6.x |
5.x | 5.x |
1.10.6 | 2.4.6 |
再老的版本应该没人用了吧...后面就不列举了 |
下载安装
一定要和上面的版本相对应,不然 ES 会报错无法启动,千万别选错版本。请在 Releases 页面选择对应的版本,选 master 绝对版本不对应!!!
cd elasticsearch-7.1.0/plugins/
# 然后把下载好的文件放进来就行.
mv elasticsearch-analysis-ik-7.1.0/ ik
重启ES.如果报错那百分之九十是版本不对应!注意我的ES版本是7.1.0
# 错误提示
Could not load plugin descriptor for plugin directory [elasticsearch-analysis-ik]
# 正常启动的话会提示,且不退出
[INFO ][o.e.n.Node ] [qvbilam_small_video_1] started
[WARN ][o.e.x.s.a.s.m.NativeRoleMappingStore] [qvbilam_small_video_1] Failed to clear cache for realms [[]]
[INFO ][o.e.l.LicenseService ] [qvbilam_small_video_1] license [d2b1dee8-d297-439c-b0c3-c09e18e105cd] mode [basic] - valid
[INFO ][o.e.g.GatewayService ] [qvbilam_small_video_1] recovered [4] indices into cluster_state
[o.w.a.d.Monitor ] [qvbilam_small_video_1] try load config from /Users/qvbilam/Sites/elasticSearch/elasticsearch-7.1.0/config/analysis-ik/IKAnalyzer.cfg.xml
[INFO ][o.w.a.d.Monitor ] [qvbilam_small_video_1] try load config from /Users/qvbilam/Sites/ElasticSearch/elasticsearch-7.1.0/plugins/ik/config/IKAnalyzer.cfg.xml
准备工作
# 创建名为qvbilam的索引
curl -XPUT http://127.0.0.1:8101/qvbilam
# 为qvbilam设置映射
curl -XPOST http://localhost:8101/qvbilam/_mapping -H 'Content-Type:application/json' -d'
{
"properties": {
"content": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_max_word"
}
}
}'
# 添加数据文件
vim test
{ "create": { "_index": "qvbilam", "_type": "_doc", "_id": "1" }}
{ "content":"我想成为巨大的人,比如185cm" }
{ "create": { "_index": "qvbilam", "_type": "_doc", "_id": "2" }}
{ "content":"牛魔王大战铁扇公主" }
{ "create": { "_index": "qvbilam", "_type": "_doc", "_id": "4" }}
{ "content":"zcx你可真是个二逼啊" }
{ "create": { "_index": "qvbilam", "_type": "_doc", "_id": "5" }}
{ "content":"二滑今天很开心啊" }
{ "create": { "_index": "qvbilam", "_type": "_doc", "_id": "6" }}
{ "content":"进击的巨人出第四季了哦" }
# 执行批量添加数据
curl -XPOST http://127.0.0.1:8101/_bulk -H "Content-Type: application/json" --data-binary @test
查看分词
使用IK分词
搜索个“进击的二滑大魔王”吧 :)
curl -XGET "http://localhost:8101/qvbilam/_analyze?pretty" -H 'Content-Type: application/json' -d'
{
"text":"进击的二滑大魔王","tokenizer": "ik_max_word"
}'
返回结果
{
"tokens" : [
{
"token" : "进击",
"start_offset" : 0,
"end_offset" : 2,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "的",
"start_offset" : 2,
"end_offset" : 3,
"type" : "CN_CHAR",
"position" : 1
},
{
"token" : "二",
"start_offset" : 3,
"end_offset" : 4,
"type" : "TYPE_CNUM",
"position" : 2
},
{
"token" : "滑",
"start_offset" : 4,
"end_offset" : 5,
"type" : "CN_CHAR",
"position" : 3
},
{
"token" : "大",
"start_offset" : 5,
"end_offset" : 6,
"type" : "CN_CHAR",
"position" : 4
},
{
"token" : "魔王",
"start_offset" : 6,
"end_offset" : 8,
"type" : "CN_WORD",
"position" : 5
}
]
}
未用ik分词
curl -XGET "http://localhost:8101/qvbilam/_analyze?pretty" -H 'Content-Type: application/json' -d'
{
"text":"进击的二滑大魔王"
}'
返回结果
{
"tokens" : [
{
"token" : "进",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0
},
{
"token" : "击",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "的",
"start_offset" : 2,
"end_offset" : 3,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "二",
"start_offset" : 3,
"end_offset" : 4,
"type" : "<IDEOGRAPHIC>",
"position" : 3
},
{
"token" : "滑",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<IDEOGRAPHIC>",
"position" : 4
},
{
"token" : "大",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<IDEOGRAPHIC>",
"position" : 5
},
{
"token" : "魔",
"start_offset" : 6,
"end_offset" : 7,
"type" : "<IDEOGRAPHIC>",
"position" : 6
},
{
"token" : "王",
"start_offset" : 7,
"end_offset" : 8,
"type" : "<IDEOGRAPHIC>",
"position" : 7
}
]
}
还是可以看到明显的区别啊~虽然我希望二滑能组合起来...但是可以自己设置词典的.
高亮显示搜索
curl -XPOST http://localhost:8101/qvbilam/_search?pretty -H 'Content-Type:application/json' -d'
{
"query" : { "match" : { "content" : "进击的二滑大魔王" }},
"highlight" : {
"pre_tags" : ["<font color=\"RED\">", "<font color=\"RED\">"],
"post_tags" : ["</font>", "</font>"],
"fields" : {
"content" : {}
}
}
}
'
返回结果
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : 2.4975452,
"hits" : [
{
"_index" : "qvbilam",
"_type" : "_doc",
"_id" : "5",
"_score" : 2.4975452,
"_source" : {
"content" : "二滑今天很开心啊"
},
"highlight" : {
"content" : [
"<font color=\"RED\">二</font><font color=\"RED\">滑</font>今天很开心啊"
]
}
},
{
"_index" : "qvbilam",
"_type" : "_doc",
"_id" : "6",
"_score" : 2.3741329,
"_source" : {
"content" : "进击的巨人出第四季了哦"
},
"highlight" : {
"content" : [
"<font color=\"RED\">进击</font><font color=\"RED\">的</font>巨人出第四季了哦"
]
}
},
{
"_index" : "qvbilam",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.6249188,
"_source" : {
"content" : "牛魔王大战铁扇公主"
},
"highlight" : {
"content" : [
"牛<font color=\"RED\">魔王</font>大战铁扇公主"
]
}
},
{
"_index" : "qvbilam",
"_type" : "_doc",
"_id" : "4",
"_score" : 0.8663808,
"_source" : {
"content" : "zcx你可真是个二逼啊"
},
"highlight" : {
"content" : [
"zcx你可真是个<font color=\"RED\">二</font>逼啊"
]
}
}
]
}
}
自定义分词
例如“进击的二滑大魔王”:将“的”这类无用的词单独切成一个分词,或者把“二滑”这样出名的人物拆分成两个词,都是不正确的。可以通过自定义词典将“的”忽略,并把“二滑”设置为一个词。
增加忽略配置文件
# vi ./elasticsearch/plugins/ik/config/custom_ignore.dic
的
是
啊
吗
增加自定义分词配置文件
# vi ./elasticsearch/plugins/ik/config/custom.dic
二滑
加载自定义配置
# vi ./elasticsearch/plugins/ik/config/IKAnalyzer.cfg.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer 扩展配置</comment>
<!--用户可以在这里配置自己的扩展字典 -->
<entry key="ext_dict">custom.dic</entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">custom_ignore.dic</entry>
<!--用户可以在这里配置远程扩展字典 -->
<!-- <entry key="remote_ext_dict">words_location</entry> -->
<!--用户可以在这里配置远程扩展停止词字典-->
<!-- <entry key="remote_ext_stopwords">words_location</entry> -->
</properties>
修改完配置后需要重启服务生效
测试
GET _analyze
{
"text": "进击的二滑大魔王",
"analyzer": "ik_max_word"
}
返回结果
{
"tokens" : [
{
"token" : "进击",
"start_offset" : 0,
"end_offset" : 2,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "二滑",
"start_offset" : 3,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "二",
"start_offset" : 3,
"end_offset" : 4,
"type" : "TYPE_CNUM",
"position" : 2
},
{
"token" : "滑",
"start_offset" : 4,
"end_offset" : 5,
"type" : "CN_CHAR",
"position" : 3
},
{
"token" : "大",
"start_offset" : 5,
"end_offset" : 6,
"type" : "CN_CHAR",
"position" : 4
},
{
"token" : "魔王",
"start_offset" : 6,
"end_offset" : 8,
"type" : "CN_WORD",
"position" : 5
}
]
}