(3).工具类编写 - 解决京东防护
package com.jsxs.utils;

import com.jsxs.pojo.Content;
import org.elasticsearch.common.recycler.Recycler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * @Author Jsxs
 * @Date 2023/6/30 12:40
 * @PackageName:com.jsxs.utils
 * @ClassName: HtmlParseUtil
 * @Description: Fetches a JD search result page and extracts title, image and price of each listed product.
 * @Version 1.0
 */
@Component
public class HtmlParseUtil {

    /** Search endpoint; the URL-encoded keyword is appended to it. */
    private static final String SEARCH_URL = "https://search.jd.com/Search?keyword=";

    /** Name of the cookie sent with every search request. */
    private static final String SESSION_COOKIE_NAME = "thor";

    /**
     * Value of the cookie sent with every search request.
     * NOTE(review): this looks like a session token copied from a logged-in browser; it will
     * presumably expire and should be moved to external configuration instead of source code.
     */
    private static final String SESSION_COOKIE_VALUE = "35C0A430DD191386DC5C6605461B820975545DB4E7B5F6CD3717B58D8F3B4CF548ED5F724A0CFF52528BCC4C1382E38FDD39F7714D356D73C80DBC98E351588E74A77B0CB8B5348847042F8AB08B9D4BC87539F45579E34614217BFD76FCEEBEC829173EEA7B4D51FAA162DD62B98376375C46B24B2FAAC96C7C733BC0F3B6165DB89F97C62170FD0838A7F72212B95CD38FC61DEF2B38C36A1F8C252C2809C8";

    /**
     * Searches JD for the given keyword and parses the products of the result page.
     *
     * @param keywords search keyword; must not be null. It is URL-encoded before being sent,
     *                 so non-ASCII text and reserved characters are safe to pass.
     * @return one {@link Content} per {@code li} element found inside the {@code J_goodsList}
     *         element; an empty list when the page contains no such element
     * @throws Exception if the page cannot be fetched
     */
    public List<Content> parseJD(String keywords) throws Exception {
        // 1. Build the request URL. The keyword is encoded so that it cannot break the query string.
        String url = SEARCH_URL + URLEncoder.encode(keywords, "UTF-8");
        System.out.println(url);

        Map<String, String> cookies = new HashMap<>();
        cookies.put(SESSION_COOKIE_NAME, SESSION_COOKIE_VALUE);

        // 2. Fetch and parse the page.
        Document document = Jsoup.connect(url).cookies(cookies).get();

        // 3. Locate the element that wraps the product list.
        Element element = document.getElementById("J_goodsList");
        System.out.println("***************" + element);

        List<Content> contents = new ArrayList<>();
        if (element == null) {
            // The page has no product list (for example a different page was served);
            // report "nothing found" instead of failing with a NullPointerException.
            return contents;
        }

        // 4. Every product is one <li>; 5. extract the fields of each product.
        Elements elements = element.getElementsByTag("li");
        for (Element li : elements) {
            // Images are lazy-loaded, so the address is read from data-lazy-img rather than src.
            String img = li.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String price = li.getElementsByClass("p-price").eq(0).text();
            String title = li.getElementsByClass("p-name").eq(0).text();
            contents.add(new Content(title, img, price));
        }
        return contents;
    }

    /** Manual smoke test: prints every product found for the keyword "java". */
    public static void main(String[] args) throws Exception {
        for (Content content : new HtmlParseUtil().parseJD("java")) {
            System.out.println(content);
        }
    }
}
4.导入配置类
package com.jsxs.config;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

/**
 * @Author Jsxs
 * @Date 2023/6/30 14:13
 * @PackageName:com.jsxs.config
 * @ClassName: ElasticSearchClientConfig
 * @Description: Registers the Elasticsearch high-level REST client as a Spring bean.
 * @Version 1.0
 */
@Configuration
public class ElasticSearchClientConfig {

    /**
     * Creates the client used to talk to Elasticsearch.
     *
     * @return a client connected to {@code http://localhost:9200}
     */
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        HttpHost node = new HttpHost("localhost", 9200, "http");
        return new RestHighLevelClient(RestClient.builder(node));
    }
}
(三)、将爬取到的数据存放到ES
1.创建Service层
ContentService.java
package com.jsxs.service;

import com.alibaba.fastjson2.JSON;
import com.jsxs.pojo.Content;
import com.jsxs.utils.HtmlParseUtil;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.client.indices.GetIndexRequest;
import org.elasticsearch.common.xcontent.XContentType;
import org.springframework.stereotype.Service;

import javax.annotation.Resource;
import java.io.IOException;
import java.util.List;

/**
 * @Author Jsxs
 * @Date 2023/6/30 14:08
 * @PackageName:com.jsxs.service
 * @ClassName: ContentService
 * @Description: Crawls JD search results and stores them in Elasticsearch.
 * @Version 1.0
 */
@Service
public class ContentService {

    /** Name of the index that receives the crawled products. */
    private static final String INDEX_NAME = "jd_goods";

    @Resource
    RestHighLevelClient client;

    /**
     * Crawls the JD search result for the keyword and bulk-indexes every product.
     *
     * @param keywords search keyword handed to the crawler
     * @return {@code true} when all products were indexed without failures; {@code false} when
     *         the crawl returned nothing or at least one bulk item failed
     * @throws Exception if crawling fails or Elasticsearch cannot be reached
     */
    public Boolean parseContent(String keywords) throws Exception {
        // 1. Crawl the products.
        List<Content> list = new HtmlParseUtil().parseJD(keywords);
        if (list.isEmpty()) {
            // A bulk request without any action is rejected by the client, so stop here.
            return false;
        }

        // 2. Create the index only when it is missing, so the method can be run repeatedly.
        ensureIndexExists();

        // 3. Collect all products in one bulk request with a 2 second timeout.
        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout("2s");
        for (int i = 0; i < list.size(); i++) {
            bulkRequest.add(new IndexRequest(INDEX_NAME)
                    // Ids are the 1-based list position ("1", "2", ...). String.valueOf avoids the
                    // accidental string concatenation "" + i + 1, which produced "01", "11", ...
                    // A later run reuses the same ids and therefore replaces those documents.
                    .id(String.valueOf(i + 1))
                    .source(JSON.toJSONString(list.get(i)), XContentType.JSON));
        }

        // 4. Execute and report whether every item succeeded.
        BulkResponse bulk = client.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulk.hasFailures();
    }

    /** Creates the target index when it does not exist yet. */
    private void ensureIndexExists() throws IOException {
        boolean exists = client.indices().exists(new GetIndexRequest(INDEX_NAME), RequestOptions.DEFAULT);
        if (!exists) {
            client.indices().create(new CreateIndexRequest(INDEX_NAME), RequestOptions.DEFAULT);
        }
    }
}
2.进行测试 (ES是否存放成功)
package com.jsxs;

import com.jsxs.service.ContentService;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;

import javax.annotation.Resource;

/**
 * Spring Boot test that runs the crawl-and-store step once for the keyword "java".
 */
@SpringBootTest
class JsxsEsJdApplicationTests {

    @Resource
    ContentService contentService;

    /** Calls {@code parseContent("java")} and prints the returned flag. */
    @Test
    void contextLoads() throws Exception {
        Boolean stored = contentService.parseContent("java");
        System.out.println(stored);
    }
}
(四)、从ES中分页读取数据 (关键字不能为中文)
切记:我们只能查询到已经存放在 ES 中的数据;如果查询的关键字对应的数据尚未存入 ES,得到的将是空结果。
1.从ES中读取数据
(1).ContentService 层
package com.jsxs.service;

import com.alibaba.fastjson2.JSON;
import com.jsxs.pojo.Content;
import com.jsxs.utils.HtmlParseUtil;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.springframework.stereotype.Service;

import javax.annotation.Resource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * @Author Jsxs
 * @Date 2023/6/30 14:08
 * @PackageName:com.jsxs.service
 * @ClassName: ContentService
 * @Description: Stores crawled JD products in Elasticsearch and reads them back page by page.
 * @Version 1.0
 */
@Service
public class ContentService {

    /** Name of the index that holds the crawled products. */
    private static final String INDEX_NAME = "jd_goods";

    @Resource
    RestHighLevelClient client;

    /**
     * Crawls the JD search result for the keyword and bulk-indexes every product.
     *
     * @param keywords search keyword handed to the crawler
     * @return {@code true} when all products were indexed without failures; {@code false} when
     *         the crawl returned nothing or at least one bulk item failed
     * @throws Exception if crawling fails or Elasticsearch cannot be reached
     */
    public Boolean parseContent(String keywords) throws Exception {
        // 1. Crawl the products.
        List<Content> list = new HtmlParseUtil().parseJD(keywords);
        if (list.isEmpty()) {
            // A bulk request without any action is rejected by the client, so stop here.
            return false;
        }

        // 2. Collect all products in one bulk request with a 2 second timeout.
        // The index is not created here. NOTE(review): this relies on the index already existing
        // or on Elasticsearch creating it automatically on first write - confirm cluster settings.
        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout("2s");
        for (int i = 0; i < list.size(); i++) {
            bulkRequest.add(new IndexRequest(INDEX_NAME)
                    // Ids are the 1-based list position, so a later run replaces documents with the same id.
                    .id(String.valueOf(i + 1))
                    .source(JSON.toJSONString(list.get(i)), XContentType.JSON));
        }

        // 3. Execute and report whether every item succeeded.
        BulkResponse bulk = client.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulk.hasFailures();
    }

    /**
     * Runs a paged term query on the {@code title} field of the product index.
     *
     * @param keywords term to look for in {@code title}
     * @param pageNo   1-based page number; values below 1 are treated as 1
     * @param pageSize number of hits per page
     * @return the source of every hit on the requested page as a map; empty when nothing matches
     * @throws IOException if the search request fails
     */
    public List<Map<String, Object>> searchesPage(String keywords, int pageNo, int pageSize) throws IOException {
        int page = Math.max(pageNo, 1);

        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();

        // Paging: "from" is the offset of the first hit, not a page number, so page 1 starts at 0.
        searchSourceBuilder.from((page - 1) * pageSize);
        searchSourceBuilder.size(pageSize);

        // Term query: the keyword is compared as-is with the indexed terms of "title".
        // NOTE(review): if "title" is an analysed text field, keywords that differ from the stored
        // tokens (mixed case, multi-character Chinese words) will probably not match - confirm
        // against the index mapping.
        TermQueryBuilder query = QueryBuilders.termQuery("title", keywords);
        searchSourceBuilder.query(query);
        searchSourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

        SearchRequest request = new SearchRequest(INDEX_NAME);
        request.source(searchSourceBuilder);
        SearchResponse searchResponse = client.search(request, RequestOptions.DEFAULT);

        // The response wraps an array of hits; return the source document of each one.
        SearchHits hits = searchResponse.getHits();
        List<Map<String, Object>> list = new ArrayList<>();
        for (SearchHit hit : hits.getHits()) {
            list.add(hit.getSourceAsMap());
        }
        System.out.println(list);
        return list;
    }
}
(2).ContentController 控制层
package com.jsxs.controller;

import com.jsxs.service.ContentService;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RestController;

import javax.annotation.Resource;
import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * @Author Jsxs
 * @Date 2023/6/30 14:08
 * @PackageName:com.jsxs.controller
 * @ClassName: ContentController
 * @Description: REST endpoints that delegate to ContentService.
 * @Version 1.0
 */
@RestController
public class ContentController {

    @Resource
    private ContentService contentService;

    /**
     * Passes the keyword to {@code ContentService.parseContent}.
     *
     * @param keywords keyword taken from the request path
     * @return the flag returned by the service
     */
    @GetMapping("/parse/{keywords}")
    public Boolean parse(@PathVariable("keywords") String keywords) throws Exception {
        Boolean stored = contentService.parseContent(keywords);
        return stored;
    }

    /**
     * Passes keyword, page number and page size to {@code ContentService.searchesPage}.
     *
     * @param keyword  keyword taken from the request path
     * @param pageNo   page number taken from the request path
     * @param pageSize page size taken from the request path
     * @return the list returned by the service
     */
    @GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
    public List<Map<String, Object>> search(@PathVariable("keyword") String keyword,
                                            @PathVariable("pageNo") int pageNo,
                                            @PathVariable("pageSize") int pageSize) throws IOException {
        List<Map<String, Object>> page = contentService.searchesPage(keyword, pageNo, pageSize);
        return page;
    }
}
2.错误演示 (读取es中没有的数据)
1. 我们在ES中存放的关键字是 java 而我们读取的关键字是 夏装
2. 读取不到夏装的数据