SpringBoot2.3.x整合ElasticSearch7.6.2 实现PDF,WORD全文检索

2022-06-02 1615

版权

本文内容由阿里云实名注册用户自发贡献，版权归原作者所有，阿里云开发者社区不拥有其著作权，亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容，填写侵权投诉表单进行举报，一经查实，本社区将立刻删除涉嫌侵权内容。

简介： 本文使用SpringBoot2.3.x + ElasticSearch7.6.2 实现对PDF,WORD进行全文检索实现了对文件内容快速搜索

1、下载安装，只下载elasticSearch、Kibana即可

下载安装参考Springboot/Springcloud整合ELK平台，（Filebeat方式）日志采集及管理（Elasticsearch+Logstash+Filebeat+Kibana）
elastic中文社区下载地址

这里我使用7.6.2的elasticsearch版本，因为项目使用的springboot2.3.x,避免低版本客户端，高版本索引库·，这里我先退回使用低版本索引库

插件安装

ik 分词器

在这里插入图片描述

ingest-attachment 这里将链接修改为自己的版本即可

插件下载完成之后，将压缩包解压到 elasticsearch的plugins目录，之后重启elasticsearch

定义文本抽取管道

PUT /_ingest/pipeline/attachment
{
 "description" : "Extract attachment information",
 "processors":[
 {
    "attachment":{

        "field":"data",

        "indexed_chars" : -1,

        "ignore_missing":true
     }
 },
 {
     "remove":{"field":"data"}
 }]}

2、SpringBoot整合ElasticSearch

<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.58</version>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.20</version>
    </dependency>
</dependencies>

application.yml

server:
  port: 9090
spring:
  application:
    name: elasticsearch-service
  elasticsearch:
    rest:
      uris: http://127.0.0.1:9200

实体类

package top.fate.entity;

import lombok.Data;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;

/**
 * @auther:Wangxl
 * @Emile:18335844494@163.com
 * @Time:2020/11/2 14:15
 */
@Data
@Document(indexName = "filedata")
public class FileData {

    @Field(type = FieldType.Keyword)
    private String filePk;
    @Field(type = FieldType.Keyword)
    private String fileName;
    @Field(type = FieldType.Keyword)
    private Integer page;
    @Field(type = FieldType.Keyword)
    private String departmentId;
    @Field(type = FieldType.Keyword)
    private String ljdm;
    @Field(type = FieldType.Text, analyzer = "ik_max_word")
    private String data;
    @Field(type = FieldType.Keyword)
    private String realName;
    @Field(type = FieldType.Keyword)
    private String url;
    @Field(type = FieldType.Keyword)
    private String type;
}

接口类

package top.fate.controller;

import com.alibaba.fastjson.JSON;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.elasticsearch.core.ElasticsearchRestTemplate;
import org.springframework.data.elasticsearch.core.IndexOperations;
import org.springframework.data.elasticsearch.core.document.Document;
import org.springframework.data.elasticsearch.core.mapping.IndexCoordinates;
import org.springframework.util.Base64Utils;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import top.fate.entity.FileData;

import java.io.File;
import java.io.FileInputStream;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * @auther:Wangxl
 * @Emile:18335844494@163.com
 * @Time:2022/6/1 16:33
 */
@RestController
@RequestMapping(value = "fullTextSearch")
public class FullTextSearchController {
    @Autowired
    private ElasticsearchRestTemplate elasticsearchRestTemplate;
    @Autowired
    private RestHighLevelClient restHighLevelClient;

    @GetMapping("createIndex")
    public void add() {

        IndexOperations indexOperations = elasticsearchRestTemplate.indexOps(IndexCoordinates.of("testindex"));
        indexOperations.create();
        Document mapping = indexOperations.createMapping(FileData.class);
        indexOperations.putMapping(mapping);
    }

    @GetMapping("deleteIndex")
    public void deleteIndex() {
        IndexOperations indexOperations = elasticsearchRestTemplate.indexOps(FileData.class);
        indexOperations.delete();
    }

    @GetMapping("uploadFileToEs")
    public void uploadFileToEs() {

        try {
//            File file = new File("D:\\desktop\\Java开发工程师-4年-王晓龙-2022-05.pdf");
            File file = new File("D:\\desktop\\Java开发工程师-4年-王晓龙-2022-05.docx");
            FileInputStream inputFile = new FileInputStream(file);
            byte[] buffer = new byte[(int)file.length()];
            inputFile.read(buffer);
            inputFile.close();
            //将文件转成base64编码
            String fileString = Base64Utils.encodeToString(buffer);

            FileData fileData = new FileData();
            fileData.setFileName(file.getName());
            fileData.setFilePk(file.getName());
            fileData.setData(fileString);

            IndexRequest indexRequest = new IndexRequest("testindex").id(fileData.getFilePk());
            indexRequest.source(JSON.toJSONString(fileData),XContentType.JSON);
            indexRequest.setPipeline("attachment");

            IndexResponse index = restHighLevelClient.index(indexRequest, RequestOptions.DEFAULT);

            return;

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @GetMapping("search")
    public Object search(@RequestParam("txt") String txt) {
        List list = new ArrayList();
        try {
            SearchRequest searchRequest = new SearchRequest("testindex");

            SearchSourceBuilder builder = new SearchSourceBuilder();

            builder.query(QueryBuilders.matchQuery("attachment.content",txt).analyzer("ik_max_word"));

            searchRequest.source(builder);


            // 返回实际命中数
            builder.trackTotalHits(true);
            //高亮
            HighlightBuilder highlightBuilder = new HighlightBuilder();
            highlightBuilder.field("attachment.content");
            highlightBuilder.requireFieldMatch(false);//多个高亮关闭
            highlightBuilder.preTags("<span style='color:red'>");
            highlightBuilder.postTags("</span>");
            builder.highlighter(highlightBuilder);

            SearchResponse search = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

            if (search.getHits() != null) {

                for (SearchHit documentFields : search.getHits().getHits()) {
                    Map<String, HighlightField> highlightFields = documentFields.getHighlightFields();
                    HighlightField title = highlightFields.get("attachment.content");
                    Map<String, Object> sourceAsMap = documentFields.getSourceAsMap();
                    if (title != null) {
                        Text[] fragments = title.fragments();
                        String n_title = "";
                        for (Text fragment : fragments) {
                            n_title += fragment;
                        }
                        sourceAsMap.put("data", n_title);
                    }
                    list.add(dealObject(sourceAsMap,  FileData.class));
                }

            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return list;
    }
    /*public static void ignoreSource(Map<String, Object> map) {
        for (String key : IGNORE_KEY) {
            map.remove(key);
        }
    }*/

    public static <T> T dealObject(Map<String, Object> sourceAsMap, Class<T> clazz) {
        try {
//            ignoreSource(sourceAsMap);
            Iterator<String> keyIterator = sourceAsMap.keySet().iterator();
            T t = clazz.newInstance();
            while (keyIterator.hasNext()) {
                String key = keyIterator.next();
                String replaceKey = key.replaceFirst(key.substring(0, 1), key.substring(0, 1).toUpperCase());
                Method method = null;
                try {
                    method = clazz.getMethod("set" + replaceKey, sourceAsMap.get(key).getClass());
                } catch (NoSuchMethodException e) {
                    continue;
                }
                method.invoke(t, sourceAsMap.get(key));
            }
            return t;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}

测试

创建索引

 localhost:9090/fullTextSearch/createIndex

在这里插入图片描述

上传文档

localhost:9090/fullTextSearch/uploadFileToEs

在这里插入图片描述

搜索

localhost:9090/fullTextSearch/search?txt=索引库

在这里插入图片描述

SpringBoot2.3.x整合ElasticSearch7.6.2 实现PDF,WORD全文检索

1、下载安装，只下载elasticSearch、Kibana即可

插件安装

定义文本抽取管道

2、SpringBoot整合ElasticSearch

application.yml

实体类

接口类

测试

创建索引

上传文档

搜索

热门文章

最新文章

相关课程

相关电子书

探索云世界

热门

云计算

大数据

云原生

人工智能

数据库

开发与运维

活动广场

任务中心

训练营

直播

乘风者计划

下载

镜像站

技术资料

SpringBoot2.3.x整合ElasticSearch7.6.2 实现PDF,WORD全文检索

1、下载安装，只下载elasticSearch、Kibana即可

插件安装

定义文本抽取管道

2、SpringBoot整合ElasticSearch

application.yml

实体类

接口类

测试

创建索引

上传文档

搜索

热门文章

最新文章

相关课程

相关电子书