(一)、搭建环境
0.启动ElasticSearch、head和Kibana
(1).启动ElasticSearch (9200)
(2).启动Es-head (9101)
(3).启动 Kibana (5602)
1.项目依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.5.5</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.jsxs</groupId>
    <artifactId>Jsxs-es-JD</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>Jsxs-es-JD</name>
    <description>Demo project for Spring Boot</description>
    <properties>
        <java.version>1.8</java.version>
        <!-- Override the managed Elasticsearch client version so it matches the locally installed server -->
        <elasticsearch.version>7.6.2</elasticsearch.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Declared exactly once; the version is inherited from the parent so it stays aligned with Spring Boot 2.5.5 -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- JSON library -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>2.0.26</version>
        </dependency>
        <!-- Thymeleaf starter; no explicit version so the parent-managed one is used -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-thymeleaf</artifactId>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
2.启动测试
(二)、爬虫
1.数据从哪里获取
- 数据库获取。
- 消息队列中获取。
- 爬虫
2.导入爬虫的依赖
tika包用于解析电影;jsoup用于解析网页。
<!-- jsoup: parses web pages --> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </dependency>
3.编写爬虫工具类
(1).实体类
package com.jsxs.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Plain data holder with three string fields: title, image and price.
 *
 * <p>Lombok supplies the accessors, {@code equals}/{@code hashCode}/{@code toString},
 * a no-argument constructor and an all-arguments constructor. The all-arguments
 * constructor takes its parameters in field declaration order
 * ({@code title, img, price}), so the field order below must not be changed.
 *
 * @Author Jsxs
 * @Date 2023/6/30 13:06
 * @Version 1.0
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Content {

    /** Title text. */
    private String title;

    /** Image reference, kept as a raw string. */
    private String img;

    /** Price, kept as the raw text rather than a numeric value. */
    private String price;
}
(2).工具类编写 (已废弃⭐)
package com.jsxs.utils; import com.jsxs.pojo.Content; import org.elasticsearch.common.recycler.Recycler; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.springframework.stereotype.Component; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.List; /** * @Author Jsxs * @Date 2023/6/30 12:40 * @PackageName:com.jsxs.utils * @ClassName: HtmlParseUtil * @Description: TODO * @Version 1.0 */ @Component public class HtmlParseUtil { public List<Content> parseJD(String keywords) throws Exception { // 1.获得请求 String url = "https://search.jd.com/Search?keyword="+keywords; // 2.解析网页 返回的document对象就是浏览器的Document对象 Document document = Jsoup.parse(new URL(url), 3000); // 3.利用js的Document对象进行操作 ->获取商品整个html页面 Element element = document.getElementById("J_goodsList"); // 4.获取所有的li元素 是一个集合。 Elements elements = element.getElementsByTag("li"); // 创建一个链表,用于存放我们爬取到的信息 ArrayList<Content> contents = new ArrayList<>(); // 5.获取元素中的各个内容 for (Element li : elements) { // 获取图片 这里面加上attr目的是懒加载。 String img = li.getElementsByTag("img").eq(0).attr("data-lazy-img"); // 爬取懒加载的图片 // 获取价格 String price = li.getElementsByClass("p-price").eq(0).text(); // 获取上坪的价格 String title = li.getElementsByClass("p-name").eq(0).text(); // 存放我们爬取到的信息 contents.add(new Content(title,img,price)); } return contents; } public static void main(String[] args) throws Exception { for (Content java : new HtmlParseUtil().parseJD("码出高效")) { System.out.println(java); } } }