Elasticsearch IK分词器介绍-阿里云开发者社区

一、IK 分词简介

1.1 概念：

分词：指把一段中文文本切分成一个个独立的关键词。搜索时，系统会对输入的搜索内容进行分词；数据库中的数据在建立索引时同样会被分词。将两者的分词结果进行匹配，匹配上的数据就会被召回。

1.2 开源Ik分词器的下载安装：

开源：

下载地址：(尽量保持和自建ES版本一致): https://github.com/medcl/elasticsearch-analysis-ik

安装方法：在自建ES的安装目录下有一个plugins的目录，下载ik分词器包后解压到 plugins/analysis-ik中（analysis-ik是需要手动创建的，名称符合规范即可）

阿里云：购买阿里云ES后默认已安装该插件，且不支持卸载

1.3 Ik分词器的分词方式：

IK分词器是一款对中文具有良好支持的分词器，分词方式包括ik_max_word（最细粒度拆分）和ik_smart（最粗粒度拆分）两种形式

默认分词效果(方便与IK分词器做对比)

POST /_analyze
{"text":"原世间美好与您环环相扣"}

{
"tokens" : [
    {
"token" : "原",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0    },
    {
"token" : "世",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1    },
    {
"token" : "间",
"start_offset" : 2,
"end_offset" : 3,
"type" : "<IDEOGRAPHIC>",
"position" : 2    },
    {
"token" : "美",
"start_offset" : 3,
"end_offset" : 4,
"type" : "<IDEOGRAPHIC>",
"position" : 3    },
    {
"token" : "好",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<IDEOGRAPHIC>",
"position" : 4    },
    {
"token" : "与",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<IDEOGRAPHIC>",
"position" : 5    },
    {
"token" : "您",
"start_offset" : 6,
"end_offset" : 7,
"type" : "<IDEOGRAPHIC>",
"position" : 6    },
    {
"token" : "环",
"start_offset" : 7,
"end_offset" : 8,
"type" : "<IDEOGRAPHIC>",
"position" : 7    },
    {
"token" : "环",
"start_offset" : 8,
"end_offset" : 9,
"type" : "<IDEOGRAPHIC>",
"position" : 8    },
    {
"token" : "相",
"start_offset" : 9,
"end_offset" : 10,
"type" : "<IDEOGRAPHIC>",
"position" : 9    },
    {
"token" : "扣",
"start_offset" : 10,
"end_offset" : 11,
"type" : "<IDEOGRAPHIC>",
"position" : 10    }
  ]
}

ik_max_word 方式分词效果展示

POST /_analyze
{"text":"原世间美好与您环环相扣","analyzer":"ik_max_word"}

{
"tokens" : [
    {
"token" : "原",
"start_offset" : 0,
"end_offset" : 1,
"type" : "CN_CHAR",
"position" : 0    },
    {
"token" : "世间",
"start_offset" : 1,
"end_offset" : 3,
"type" : "CN_WORD",
"position" : 1    },
    {
"token" : "美好",
"start_offset" : 3,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 2    },
    {
"token" : "与",
"start_offset" : 5,
"end_offset" : 6,
"type" : "CN_CHAR",
"position" : 3    },
    {
"token" : "您",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 4    },
    {
"token" : "环环相扣",
"start_offset" : 7,
"end_offset" : 11,
"type" : "CN_WORD",
"position" : 5    },
    {
"token" : "环环",
"start_offset" : 7,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 6    },
    {
"token" : "相",
"start_offset" : 9,
"end_offset" : 10,
"type" : "CN_CHAR",
"position" : 7    },
    {
"token" : "扣",
"start_offset" : 10,
"end_offset" : 11,
"type" : "CN_CHAR",
"position" : 8    }
  ]
}

ik_smart 方式分词效果展示

POST /_analyze
{"text":"原世间美好与您环环相扣","analyzer":"ik_smart"}

{
"tokens" : [
    {
"token" : "原",
"start_offset" : 0,
"end_offset" : 1,
"type" : "CN_CHAR",
"position" : 0    },
    {
"token" : "世间",
"start_offset" : 1,
"end_offset" : 3,
"type" : "CN_WORD",
"position" : 1    },
    {
"token" : "美好",
"start_offset" : 3,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 2    },
    {
"token" : "与",
"start_offset" : 5,
"end_offset" : 6,
"type" : "CN_CHAR",
"position" : 3    },
    {
"token" : "您",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 4    },
    {
"token" : "环环相扣",
"start_offset" : 7,
"end_offset" : 11,
"type" : "CN_WORD",
"position" : 5    }
  ]
}

二、IK分词在索引中的应用

2.1、创建索引指定字段分词和搜索方式

DELETE /heqiang
PUT /heqiang
{
"settings":{
"index":{
"number_of_shards":1,
"number_of_replicas":2     } 
    }
}
POST /heqiang/_mapping
{
"properties": {
"name": {
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"type": "text"          },
"age": {
"type": "keyword"          },
"description": {
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"type": "text"          }
     }
}
POST /heqiang/_doc/1
{
"name":"张三",
"age":"21",
"description": "他的理想是一位伟大的工程师,从小和老师李四一起学习研究关于机械设计的理论知识"}
POST /heqiang/_doc/2
{
"name":"李四",
"age":"22",
"description": "他是一名伟大的老师,教书育人无数"}
POST /heqiang/_doc/3
{
"name":"王五",
"age":"22",
"description": "他年少习武,年纪轻轻就获取区联武术比赛赛第一名"}
POST /heqiang/_doc/4
{
"name":"赵六",
"age":"23",
"description": "他饱读诗书,琴棋书画,样样精通,和李四在同一所大学教书"}
GET /heqiang/_doc/_search
#名称匹配
GET /heqiang/_doc/_search
{
"query": {
"match": {
"name":"张三"        }
    }
}
#附带分页
GET /heqiang/_doc/_search
{
"from" : 0, "size" : 1,
"query": {
"match_all": {}
    },
"_source" : ["name","age"]
}
#批量id查询
GET /heqiang/_doc/_search
{
"query": {
"ids" : {
"type" : "_doc",
"values" : ["1", "2"]
        }
    }
}
#matchQuery即全文检索(表示只要有一个词匹配上就得分)
GET /heqiang/_doc/_search
{
"query": {
"match" : {
"description" : {
"query" : "赵六样样精通",
"operator" : "or"            }
        }
    }
}
#MultiQuery匹配多个字段
POST /heqiang/_doc/_search
{
"query": {
"multi_match" : {
"query" : "王五",
"minimum_should_match": "50%",
"fields": [ "name", "description" ]
        }
    }
}
#布尔查询
POST /heqiang/_doc/_search
{
"_source" : [ "name", "age", "description"],
"from" : 0, "size" : 10,
"query": {
"bool" : {
"must":[
                    {
"multi_match" : {
"query" : "张三",
"minimum_should_match": "50%",
"fields": [ "name^10", "description" ]
                }
            },
            {
"term":{
"age" : "21"                    }
                }
            ]
        }
    }
}
#过滤器(注意：range和term一次只能对一个Field设置范围过滤)
POST /heqiang/_doc/_search
{
"_source" : [ "name", "studymodel", "description","price"],
"query": {
"bool" : {
"must":[
                    {
"multi_match" : {
"query" : "他的理想是伟大的工程师",
"minimum_should_match": "50%",
"fields": [ "name^10", "description" ]
                        }
                    }
                ],
"filter": [
                { "term": { "name": "张三" }},
                { "range": { "age": { "gte": 20 ,"lte" : 25}}}
            ]
        }
    }
}
#根据年龄降序排列
POST /heqiang/_doc/_search
{
"_source" : [ "name", "description","age"],
"query": {
"match_all": {}
    },
"sort" : [
        {
"age" : "desc"        }
    ]
}
#查询全部，分页，倒序排列
POST /heqiang/_doc/_search
{
"from" : 0, "size" : 1,
"query": {
"match_all": {}
  }
  , "sort": [
    {
"age": {
"order": "desc"      }
    }
  ]
}
#高亮显示
POST /heqiang/_doc/_search
{
"_source": [
"name",
"age",
"description"     ],
"query": {
"bool": {
"must": [
                    {
"multi_match": {
"query": "教书育人",
"minimum_should_match": "50%",
"fields": [
"name^10",
"description"                              ],
"type": "best_fields"                         }
                    }
               ],
"filter": [
                    {
"range": {
"age": {
"gte": 20,
"lte": 23                              }
                         }
                    }
               ]
          }
     },
"sort": [
          {
"age": "desc"          }
     ],
"highlight": {
"pre_tags": [
"<tag1>"          ],
"post_tags": [
"</tag2>"          ],
"fields": {
"name": {},
"description": {}
          }
     }
}

三、阿里云ES集群的IK分词插件(analysis-ik)

3.1 简介

IK分词插件（英文名为analysis-ik）是阿里云Elasticsearch的扩展插件，默认不能卸载。该插件在开源插件的基础上，扩展支持了对象存储服务OSS（Object Storage Service）词典文件的动态加载，可以实现IK词典的冷更新和热更新。

3.2 使用方式兼容开源，可以参考创建索引指定字段分词和搜索方式

注意事项：对于已经配置了IK分词的索引，在IK词典冷更新或热更新操作完成后将只对新数据（包含新增数据和更新后的数据）生效。如果您希望对全部数据生效，需要重建索引。

四、分词器对应词典的使用

4.1 开源IK分词器中词典使用方式

下载后解压内容如下

配置词典需要修改config目录下的配置文件：vi IKAnalyzer.cfg.xml

词典文件与该配置文件位于同级目录时，直接在配置文件中填写词典文件名称即可

4.2 阿里云IK分词器中词典使用方式(如附件位置)

阿里云官方文档已有详细介绍，此处不做过多说明：https://help.aliyun.com/document_detail/137928.html#ik

五、IK分词常见问题

5.1 词典配置不生效常见原因

词典中有特殊字符
词典编码不是utf-8
更新词典文件后没有重建索引

Elasticsearch IK分词器介绍

一、IK 分词简介

1.1 概念：

1.2 开源Ik分词器的下载安装：

1.3 Ik分词器的分词方式：

二、IK分词在索引中的应用

2.1、创建索引指定字段分词和搜索方式

三、阿里云ES集群的IK分词插件(analysis-ik)

3.1 简介

3.2 使用方式兼容开源，可以参考创建索引指定字段分词和搜索方式

四、分词器对应词典的使用

4.1 开源IK分词器中词典使用方式

4.2 阿里云IK分词器中词典使用方式(如附件位置)

五、IK分词常见问题

5.1 词典配置不生效常见原因

阿里云支持与服务

热门文章

最新文章

相关电子书

热门

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

阿里云MVP

话题

直播

下载

镜像站

技术资料

插件

Elasticsearch IK分词器介绍

一、IK 分词简介

1.1 概念：

1.2 开源Ik分词器的下载安装：

1.3 Ik分词器的分词方式：

二、IK分词在索引中的应用

2.1、创建索引指定字段分词和搜索方式

三、阿里云ES集群的IK分词插件(analysis-ik)

3.1 简介

3.2 使用方式兼容开源，可以参考 创建索引指定字段分词和搜索方式

四、分词器对应词典的使用

4.1 开源IK分词器中词典使用方式

4.2 阿里云IK分词器中词典使用方式(如附件位置)

五、IK分词常见问题

5.1 词典配置不生效常见原因

阿里云支持与服务

热门文章

最新文章

相关电子书

3.2 使用方式兼容开源，可以参考创建索引指定字段分词和搜索方式