1 语料库映射OpenAPI
环境准备:
- 先下载ik分词插件和pinyin分词插件(插件版本需与ES版本一致),分别解压到ES安装目录下plugins中对应的子目录,然后重启ES
请求kibana:GET /_cat/plugins?v&s=component&h=name,component,version,description
结果
name component version description WIN-A5KARTU1A65 analysis-ik 7.10.1 IK Analyzer for Elasticsearch WPhvS8c analysis-pinyin 7.10.1 Pinyin Analysis for Elasticsearch
- 定义ik分词后的pinyin分词器,即定义一个自定义分词器ik_pinyin_analyzer
PUT test_index { "settings":{ "number_of_shards":"1", "index.refresh_interval":"15s", "index":{ "analysis":{ "analyzer":{ "ik_pinyin_analyzer":{ "type":"custom", "tokenizer":"ik_smart", "filter":"pinyin_filter" } }, "filter":{ "pinyin_filter":{ "type":"pinyin", "keep_first_letter": false } } } } } }
下面的目的就是用API实现这种效果
这里tokenizer使用ik分词,分词之后将分词结果通过pinyin再filter一次,这样就可以了。
测试一下
POST test_index/_analyze { "analyzer": "ik_pinyin_analyzer", "text":"测试" }
结果
{ "tokens": [ { "token": "ce", "start_offset": 0, "end_offset": 2, "type": "CN_WORD", "position": 0 }, { "token": "shi", "start_offset": 0, "end_offset": 2, "type": "CN_WORD", "position": 1 } ] }
这样,当我们建立index的mapping的时候,就可以像使用ik_smart分词器一样使用ik_pinyin_analyzer
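比如lawbasis字段的mapping可以是这样的(注意:ES 7.x 默认不允许在_mapping请求中指定type,需要在请求后追加 ?include_type_name=true,或者去掉type直接使用 PUT test_index/_mapping)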
PUT test_index/_mapping/test_type { "properties": { "lawbasis":{ "type": "text", "analyzer": "ik_smart", "search_analyzer": "ik_smart", "fields": { "my_pinyin":{ "type":"text", "analyzer": "ik_pinyin_analyzer", "search_analyzer": "ik_pinyin_analyzer" } } } } }
其中fields(multi-fields,多字段)用于出于不同的目的、以不同的方式为同一个字段编制索引。也就是说,lawbasis这个字段既会以中文ik_smart分词编制索引,又会通过子字段lawbasis.my_pinyin以"ik分词后再转拼音"的方式编制索引,从而同时支持中文搜索和拼音搜索。
- 测试一下
加入两条数据
POST test_index/test_type { "lawbasis":"测试一下" } POST test_index/test_type { "lawbasis":"测试东西" }
使用拼音搜索
GET test_index/test_type/_search { "query":{ "match": { "lawbasis.my_pinyin": "ceshi" } } }
可以看到有两条结果
1.1 定义索引(映射)接口
package com.oldlu.service;

import com.oldlu.commons.pojo.CommonEntity;
import org.elasticsearch.rest.RestStatus;

import java.util.List;
import java.util.Map;

/**
 * Contract for index-level operations.
 *
 * @Class: ElasticsearchIndexService
 * @Package com.oldlu.service
 * @Company: oldlu
 */
public interface ElasticsearchIndexService {

    /**
     * Creates an index together with its mapping.
     *
     * @param commonEntity request payload describing the index to create
     * @return the boolean outcome reported by the implementation
     * @throws Exception if the implementation fails to create the index
     */
    boolean addIndexAndMapping(CommonEntity commonEntity) throws Exception;
}
1.2 定义索引(映射)实现
/** * @Class: ElasticsearchIndexServiceImpl * @Package com.oldlu.service.impl * @Description: 索引操作实现类 * @Company: oldlu */ @Service("ElasticsearchIndexServiceImpl") public class ElasticsearchIndexServiceImpl implements ElasticsearchIndexService { @Resource private RestHighLevelClient client; private static final int START_OFFSET = 0; private static final int MAX_COUNT = 5; /* * @Description: 新增索引+setting+映射+自定义分词器pinyin * setting可以为空(自定义分词器pinyin在setting中) * 映射可以为空 * @Method: addIndexAndMapping * @Param: [commonEntity] * @Update: * @since: 1.0.0 * @Return: boolean * */ public boolean addIndexAndMapping(CommonEntity commonEntity) throws Exception { //设置setting的map Map<String, Object> settingMap = new HashMap<String, Object>(); //创建索引请求 CreateIndexRequest request = new CreateIndexRequest(commonEntity.getIndexName()); //获取前端参数 Map<String, Object> map = commonEntity.getMap(); //循环外层的settings和mapping for (Map.Entry<String, Object> entry : map.entrySet()) { if ("settings".equals(entry.getKey())) { if (entry.getValue() instanceof Map && ((Map) entry.getValue()).size() > 0) { request.settings((Map<String, Object>) entry.getValue()); } } if ("mapping".equals(entry.getKey())) { if (entry.getValue() instanceof Map && ((Map) entry.getValue()).size() > 0) { request.mapping((Map<String, Object>) entry.getValue()); } } } //创建索引操作客户端 IndicesClient indices = client.indices(); //创建响应对象 CreateIndexResponse response = indices.create(request, RequestOptions.DEFAULT); //得到响应结果 return response.isAcknowledged(); } }
1.3 新增控制器
package com.oldlu.controller; import com.oldlu.commons.enums.ResultEnum; import com.oldlu.commons.enums.TipsEnum; import com.oldlu.commons.pojo.CommonEntity; import com.oldlu.commons.result.ResponseData; import com.oldlu.service.ElasticsearchIndexService; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.web.bind.annotation.*; /** * @Class: ElasticsearchIndexController * @Package com.oldlu.controller * @Description: 索引操作控制器 * @Company: oldlu */ @RestController @RequestMapping("v1/indices") public class ElasticsearchIndexController { private static final Logger logger = LoggerFactory .getLogger(ElasticsearchIndexController.class); @Autowired ElasticsearchIndexService elasticsearchIndexService; /* * @Description: 新增索引、映射 * @Method: addIndex * @Param: [commonEntity] * @Update: * @since: 1.0.0 * @Return: com.oldlu.commons.result.ResponseData * */ @PostMapping(value = "/add") public ResponseData addIndexAndMapping(@RequestBody CommonEntity commonEntity) { //构造返回数据 ResponseData rData = new ResponseData(); if (StringUtils.isEmpty(commonEntity.getIndexName())) { rData.setResultEnum(ResultEnum.PARAM_ISNULL); return rData; } //增加索引是否成功 boolean isSuccess = false; try { //通过高阶API调用增加索引方法 isSuccess = elasticsearchIndexService.addIndexAndMapping(commonEntity ); //构建返回信息通过类型推断自动装箱(多个参数取交集) rData.setResultEnum(isSuccess, ResultEnum.SUCCESS, 1); //日志记录 logger.info(TipsEnum.CREATE_INDEX_SUCCESS.getMessage()); } catch (Exception e) { //日志记录 logger.error(TipsEnum.CREATE_INDEX_FAIL.getMessage(), e); //构建错误返回信息 rData.setResultEnum(ResultEnum.ERROR); } return rData; } }
1.4 开始新增映射
http://172.17.0.225:8888/v1/indices/add
或者
http://127.0.0.1:8888/v1/indices/add
参数
自定义分词器ik_pinyin_analyzer(ik和pinyin组合分词器)
提示:在创建映射前,需要先安装ik分词插件和pinyin(拼音)分词插件,否则自定义分词器ik_pinyin_analyzer无法创建
{ "indexName": "product_completion_index", "map": { "settings": { "number_of_shards": 1, "number_of_replicas": 2, "analysis": { "analyzer": { "ik_pinyin_analyzer": { "type": "custom", "tokenizer": "ik_smart", "filter": "pinyin_filter" } }, "filter": { "pinyin_filter": { "type": "pinyin", "keep_first_letter": true, "keep_separate_first_letter": false, "keep_full_pinyin": true, "keep_original": true, "limit_first_letter_length": 16, "lowercase": true, "remove_duplicated_term": true } } } }, "mapping": { "properties": { "name": { "type": "keyword" }, "searchkey": { "type": "completion", "analyzer": "ik_pinyin_analyzer" } } } } }
settings下面的为索引的设置信息,动态设置参数,遵循DSL写法
mapping下为映射的字段信息,动态设置参数,遵循DSL写法
返回
{ "code": "200", "desc": "操作成功!", "data": true }
2 语料库文档OpenAPI
2.1 定义批量新增文档接口
package com.oldlu.service;

import com.oldlu.commons.pojo.CommonEntity;
import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.suggest.completion.CompletionSuggestion;

import java.util.List;
import java.util.Map;

/**
 * Contract for document-level operations.
 *
 * @Class: ElasticsearchDocumentService
 * @Package com.oldlu.service
 * @Company:
 */
public interface ElasticsearchDocumentService {

    /**
     * Adds a batch of documents.
     *
     * @param commonEntity request payload describing the target index and the documents
     * @return the REST status reported by the implementation
     * @throws Exception if the implementation fails to execute the batch
     */
    RestStatus bulkAddDoc(CommonEntity commonEntity) throws Exception;
}
2.2 定义批量新增文档实现
/* * @Description: 批量新增文档,可自动创建索引、自动创建映射 * @Method: bulkAddDoc * @Param: [indexName, map] * @Update: * @since: 1.0.0 * @Return: org.elasticsearch.rest.RestStatus * */ @Override public RestStatus bulkAddDoc(CommonEntity commonEntity) throws Exception { //通过索引构建批量请求对象 BulkRequest bulkRequest = new BulkRequest(commonEntity.getIndexName()); //循环前台list文档数据 for (int i = 0; i < commonEntity.getList().size(); i++) { bulkRequest.add(new IndexRequest().source(XContentType.JSON, SearchTools.mapToObjectGroup(commonEntity.getList().get(i)))); } //执行批量新增 BulkResponse bulkResponse = client.bulk(bulkRequest, RequestOptions.DEFAULT); return bulkResponse.status(); }
官方文档介绍:https://www.elastic.co/guide/en/elasticsearch/client/java-rest/7.4/java-rest-high-document-bulk.html
如上图,需要定义成箭头中的形式
所以上面SearchTools.mapToObjectGroup将map转成了数组
2.3 定义批量新增文档控制器
/* * @Description: 批量新增文档,可自动创建索引、自动创建映射 * @Method: bulkAddDoc * @Param: [indexName, map] * @Update: * @since: 1.0.0 * @Return: org.elasticsearch.rest.RestStatus * */ @PostMapping(value = "/batch") public ResponseData bulkAddDoc(@RequestBody CommonEntity commonEntity) { //构造返回数据 ResponseData rData = new ResponseData(); if (StringUtils.isEmpty(commonEntity.getIndexName()) || CollectionUtils.isEmpty(commonEntity.getList())) { rData.setResultEnum(ResultEnum.PARAM_ISNULL); return rData; } //批量新增操作返回结果 RestStatus result = null; try { //通过高阶API调用批量新增操作方法 result = elasticsearchDocumentService.bulkAddDoc(commonEntity); //通过类型推断自动装箱(多个参数取交集) rData.setResultEnum(result, ResultEnum.SUCCESS, null); //日志记录 logger.info(TipsEnum.BATCH_CREATE_DOC_SUCCESS.getMessage()); } catch (Exception e) { //日志记录 logger.info(TipsEnum.BATCH_CREATE_DOC_FAIL.getMessage(), e); //构建错误返回信息 rData.setResultEnum(ResultEnum.ERROR); } return rData; }
2.4 开始批量新增调用
http://172.17.0.225:8888/v1/docs/batch
或者
http://127.0.0.1:8888/v1/docs/batch
参数
定义23个suggest词库(定义了两个小米手机,验证是否去重)
{ "indexName": "product_completion_index", "list": [ { "searchkey": "小米手机", "name": "小米(MI)" }, { "searchkey": "小米10", "name": "小米(MI)" }, { "searchkey": "小米电视", "name": "小米(MI)" }, { "searchkey": "小米路由器", "name": "小米(MI)" }, { "searchkey": "小米9", "name": "小米(MI)" }, { "searchkey": "小米手机", "name": "小米(MI)" }, { "searchkey": "小米耳环", "name": "小米(MI)" }, { "searchkey": "小米8", "name": "小米(MI)" }, { "searchkey": "小米10Pro", "name": "小米(MI)" }, { "searchkey": "小米笔记本", "name": "小米(MI)" }, { "searchkey": "小米摄像头", "name": "小米(MI)" }, { "searchkey": "小米电饭煲", "name": "小米(MI)" }, { "searchkey": "小米充电宝", "name": "小米(MI)" }, { "searchkey": "adidas男鞋", "name": "adidas男鞋" }, { "searchkey": "adidas女鞋", "name": "adidas女鞋" }, { "searchkey": "adidas外套", "name": "adidas外套" }, { "searchkey": "adidas裤子", "name": "adidas裤子" }, { "searchkey": "adidas官方旗舰店", "name": "adidas官方旗舰店" }, { "searchkey": "阿迪达斯袜子", "name": "阿迪达斯袜子" }, { "searchkey": "阿迪达斯外套", "name": "阿迪达斯外套" }, { "searchkey": "阿迪达斯运动鞋", "name": "阿迪达斯运动鞋" }, { "searchkey": "耐克外套", "name": "耐克外套" }, { "searchkey": "耐克运动鞋", "name": "耐克运动鞋" } ] }
返回
{ "code": "200", "desc": "操作成功!", "data": "OK" }
查看GET product_completion_index/_search