在使用ES做查询的时候,为了获得更加准确的匹配结果,需要自定义与业务相关的分词词典,比如汽车行业的一些专业用词:奔驰AMG、宝马X5……
假如不自定义分词的话,默认“奔驰AMG”是会被切分成“奔驰”和“AMG”的,那么意味着所有和“奔驰”或“AMG”相关的数据都会被查询出来,显然所有的“奔驰”并不是我们的目标数据。
根据以上问题描述,我们需要添加自定义分词。通过查看IK文档,发现可以做成热更新:
1.数据表设计
-- Extension-word dictionary for the IK analyzer.
--   * `word` is unique; the utf8mb4_bin collation compares words byte-by-byte.
--   * `update_time` is refreshed by MySQL on every insert/update of a row.
CREATE TABLE `brahma_tab_ik_ext_word` (
  `id`          int(11)      NOT NULL AUTO_INCREMENT COMMENT 'id',
  `word`        varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '扩展词',
  `update_time` timestamp    NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改时间',
  PRIMARY KEY (`id`),
  UNIQUE KEY `UNIQUE_WORD` (`word`)
) ENGINE=InnoDB
  DEFAULT CHARSET=utf8mb4
  COLLATE=utf8mb4_bin
  COMMENT='ik分词扩展词典';
2.编码
jpa
import com.yx.brahma.persistence.entity.BrahmaTabIkExtWordEntity; import org.springframework.data.domain.Page; import org.springframework.data.domain.Pageable; import org.springframework.data.jpa.domain.Specification; import org.springframework.data.jpa.repository.JpaRepository; import org.springframework.data.repository.PagingAndSortingRepository; import org.springframework.stereotype.Repository; /** @author zouwei */ @Repository public interface IkExtWordRepository extends JpaRepository<BrahmaTabIkExtWordEntity, Integer>, PagingAndSortingRepository<BrahmaTabIkExtWordEntity, Integer> { /** * 查询分词记录 * * @param word * @return */ BrahmaTabIkExtWordEntity findFirstByWord(String word); /** * 条件查询 * * @param spec * @param pageable * @return */ Page<BrahmaTabIkExtWordEntity> findAll( Specification<BrahmaTabIkExtWordEntity> spec, Pageable pageable); } 复制代码
service
import com.yx.brahma.persistence.entity.BrahmaTabIkExtWordEntity; import com.yx.brahma.persistence.repository.IkExtWordRepository; import com.yx.brahma.service.ServiceException; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.domain.Page; import org.springframework.data.domain.PageRequest; import org.springframework.data.domain.Pageable; import org.springframework.data.domain.Sort; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; import javax.persistence.criteria.CriteriaBuilder; import javax.persistence.criteria.CriteriaQuery; import javax.persistence.criteria.Predicate; import javax.persistence.criteria.Root; import java.sql.Timestamp; import java.util.List; import java.util.Objects; import java.util.Optional; /** @author zouwei */ @Slf4j @Service public class IKAnalyzerService { @Autowired private IkExtWordRepository ikExtWordRepository; /** * 获取全部分词 * * @return */ public List<BrahmaTabIkExtWordEntity> allWorld() { List<BrahmaTabIkExtWordEntity> list = ikExtWordRepository.findAll(Sort.by(Sort.Direction.DESC, "updateTime")); return list; } /** * 获取分词更新时间 * * @return */ public Timestamp getLastModifiedTime() { Page<BrahmaTabIkExtWordEntity> ikExtWordPage = ikExtWordRepository.findAll( PageRequest.of(0, 1, Sort.by(Sort.Direction.DESC, "updateTime"))); long total = ikExtWordPage.getTotalElements(); if (total > 0) { BrahmaTabIkExtWordEntity ikExtWord = ikExtWordPage.getContent().get(0); return ikExtWord.getUpdateTime(); } return new Timestamp(System.currentTimeMillis()); } /** * 添加分词 * * @param word * @return * @throws ServiceException */ @Transactional(rollbackFor = ServiceException.class) public BrahmaTabIkExtWordEntity addWord(String word) throws ServiceException { BrahmaTabIkExtWordEntity ikExtWordEntity = ikExtWordRepository.findFirstByWord(word); if 
(Objects.nonNull(ikExtWordEntity)) { throw new ServiceException("已经存在" + word); } return ikExtWordRepository.save(new BrahmaTabIkExtWordEntity(word)); } /** * 删除指定分词 * * @param id */ @Transactional(rollbackFor = ServiceException.class) public void deleteWord(Integer id) { ikExtWordRepository.deleteById(id); } /** * 查询指定分词 * * @param word * @return */ public BrahmaTabIkExtWordEntity findByWord(String word) { return ikExtWordRepository.findFirstByWord(word); } /** * 条件查询 * * @param search * @param page * @param size * @return */ public Page<BrahmaTabIkExtWordEntity> findBySearchKey(String search, int page, int size) { page = page - 1; if (page < 0) { page = 0; } if (size <= 0) { size = 10; } Pageable pageable = PageRequest.of(page, size, Sort.by(Sort.Direction.DESC, "updateTime")); return ikExtWordRepository.findAll( (Root<BrahmaTabIkExtWordEntity> root, CriteriaQuery<?> query, CriteriaBuilder criteriaBuilder) -> { if (StringUtils.isNoneBlank(search)) { String searchStr = "%" + search + "%"; Predicate result = criteriaBuilder.like(root.get("word"), searchStr); query.where(result); } return null; }, pageable); } /** * 更新指定分词 * * @param wordEntity * @return * @throws ServiceException */ @Transactional(rollbackFor = ServiceException.class) public BrahmaTabIkExtWordEntity updateExtWord(BrahmaTabIkExtWordEntity wordEntity) throws ServiceException { Integer id = wordEntity.getId(); String word = wordEntity.getWord(); if (Objects.isNull(id)) { throw new ServiceException("id不能为空"); } if (StringUtils.isBlank(word)) { throw new ServiceException("分词不能修改为空"); } Optional<BrahmaTabIkExtWordEntity> result = ikExtWordRepository.findById(id); if (result.isPresent()) { BrahmaTabIkExtWordEntity data = result.get(); data.setUpdateTime(new Timestamp(System.currentTimeMillis())); data.setWord(word); return ikExtWordRepository.save(data); } throw new ServiceException("不存在这个id:" + id); } } 复制代码
controller
import com.yx.brahma.persistence.entity.BrahmaTabIkExtWordEntity; import com.yx.brahma.service.ServiceException; import com.yx.brahma.service.elasticsearch.ik.IKAnalyzerService; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.util.CollectionUtils; import org.springframework.web.bind.annotation.*; import pers.roamer.boracay.helper.HttpResponseHelper; import javax.servlet.http.HttpServletResponse; import java.sql.Timestamp; import java.text.DateFormat; import java.util.Date; import java.util.List; import java.util.StringJoiner; /** @author zouwei */ @Slf4j @RestController public class IKAnalyzerController { @Autowired private IKAnalyzerService ikAnalyzerService; /** * 获取所有分词 * * @return */ @GetMapping( value = "/ik/search", produces = {"text/plain;charset=utf-8"}) public String searchByIKPlugin() { log.info("获取最新分词词典"); StringJoiner stringJoiner = new StringJoiner(StringUtils.LF); List<BrahmaTabIkExtWordEntity> list = ikAnalyzerService.allWorld(); if (CollectionUtils.isEmpty(list)) { return StringUtils.EMPTY; } list.forEach(e -> stringJoiner.add(e.getWord())); return stringJoiner.toString(); } /** * 检测是否需要请求分词 * * @param response */ @RequestMapping(value = "/ik/search", method = RequestMethod.HEAD) public void headAllHotWord(HttpServletResponse response) { log.info("检测是否需要更新分词词典"); Timestamp lastModifiedTime = ikAnalyzerService.getLastModifiedTime(); response.setHeader( "Last-Modified", DateFormat.getInstance().format(new Date(lastModifiedTime.getTime()))); } /** * 添加分词 * * @param word * @return * @throws ControllerException */ @PostMapping("/extDict/add") public String addWord(@RequestParam("word") String word) throws ControllerException { try { return HttpResponseHelper.successInfoInbox(ikAnalyzerService.addWord(word)); } catch (ServiceException e) { throw new ControllerException(e.getMessage()); } } /** * 删除指定分词 * * @param id */ 
@DeleteMapping("/extDict/delete/{id}") public void deleteWord(@PathVariable("id") Integer id) { ikAnalyzerService.deleteWord(id); } /** * 条件查询 * * @param search * @param page * @param size * @return */ @GetMapping("/extDict/search") public String findBySearch(String search, int page, int size) { return HttpResponseHelper.successInfoInbox( ikAnalyzerService.findBySearchKey(search, page, size)); } /** * 更新指定分词 * * @param wordEntity * @return * @throws ControllerException */ @PutMapping("/extDict/update") public String updateWord(@RequestBody BrahmaTabIkExtWordEntity wordEntity) throws ControllerException { try { return HttpResponseHelper.successInfoInbox(ikAnalyzerService.updateExtWord(wordEntity)); } catch (ServiceException e) { throw new ControllerException(e.getMessage()); } } } 复制代码
使用ngrok做内网穿透进行测试,修改一下IK的IKAnalyzer.cfg.xml配置文件:
<entry key="remote_ext_dict">http://d75c5cc4.ngrok.io/ik/search</entry>
注意:该配置项在默认的IKAnalyzer.cfg.xml中是被注释掉的,要把外层的注释标记(<!-- 和 -->)删掉,配置才会生效。