打开gitee看看最有价值的开源项目,然后发现排序是这样的。
这我怎么知道哪个是最火的?还好我是程序员,主力语言是 Java,必须自己爬一下。
一、导入依赖库
<!-- 爬虫库--> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.14.3</version> </dependency> <!-- 导出excel--> <dependency> <groupId>com.alibaba</groupId> <artifactId>easyexcel</artifactId> <version>3.0.5</version> </dependency>
二、编写业务代码
导出excel对象
package com.example.demo.util; import com.alibaba.excel.annotation.ExcelProperty; import com.alibaba.excel.annotation.write.style.ColumnWidth; public class Model implements Comparable<Model>{ @ExcelProperty("项目语言") @ColumnWidth(20) String projectLabels; @ExcelProperty(value = "项目标题") @ColumnWidth(40) String title; @ExcelProperty("项目简介") @ColumnWidth(60) String projectDescription; //获取项目starts @ExcelProperty("star") @ColumnWidth(10) Double star; //获取项目分支数量 @ExcelProperty("Fork") @ColumnWidth(10) Double fork; @ExcelProperty("项目图片") String imageSrc; public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getImageSrc() { return imageSrc; } public void setImageSrc(String imageSrc) { this.imageSrc = imageSrc; } public String getProjectDescription() { return projectDescription; } public void setProjectDescription(String projectDescription) { this.projectDescription = projectDescription; } public String getProjectLabels() { return projectLabels; } public void setProjectLabels(String projectLabels) { this.projectLabels = projectLabels; } public Double getStar() { return star; } public void setStar(Double star) { this.star = star; } public Double getFork() { return fork; } public void setFork(Double fork) { this.fork = fork; } @Override public int compareTo(Model o) { if(o.star>this.star){ return 1; }else { return -1; } } }
爬取网页,获取各元素的值
package com.example.demo.util;

import com.alibaba.excel.EasyExcel;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;

/**
 * Fetches the project list page at https://gitee.com/gvp/all, extracts one {@link Model} per
 * project card, sorts the models by star count (highest first) and writes them to an Excel file
 * named after the current timestamp.
 */
public class JsoupTestGitee {

    /**
     * Runs the scrape-and-export.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        // Target page.
        String url = "https://gitee.com/gvp/all";
        // Browser-like User-Agent header sent with the request.
        String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36";
        // Fetch and parse the page DOM.
        Document document = Jsoup.connect(url).userAgent(userAgent).get();

        // Collect every project card.
        // NOTE(review): a space-separated value is passed as a single class name here (and in
        // toModel); this presumably only matches elements whose class attribute is exactly this
        // string. Confirm against the jsoup documentation; a CSS selector via select(...) would
        // be less brittle.
        String parentClass = "ui fluid card project-card categorical-project-card";
        Elements cards = document.getElementsByClass(parentClass);

        List<Model> res = new ArrayList<>(cards.size());
        for (Element card : cards) {
            res.add(toModel(card));
        }

        // Output file name.
        String fileName = System.currentTimeMillis() + ".xlsx";
        // Sort by star count, highest first. An explicit comparator is used so that equal star
        // counts compare as 0, which the sort algorithm requires.
        res.sort(Comparator.comparing(Model::getStar, Comparator.reverseOrder()));
        // Write the rows to the Excel file.
        EasyExcel.write(fileName, Model.class).sheet("项目").doWrite(res);
    }

    /** Builds a {@link Model} from the elements found inside a single project card. */
    private static Model toModel(Element card) {
        Model mod = new Model();
        // Project name.
        mod.setTitle(card.getElementsByClass("project-name linkable").eq(0).text());
        // Project image URL.
        mod.setImageSrc(card.getElementsByClass("ui avatar").eq(0).attr("src"));
        // Project description.
        mod.setProjectDescription(card.getElementsByClass("project-description").eq(0).text());
        // Project labels.
        mod.setProjectLabels(card.getElementsByClass("project-labels").eq(0).text());
        // The first matching counter is read as the star count, the second as the fork count.
        Elements counters = card.getElementsByClass("linkable meta");
        mod.setStar(toDouble(counters.eq(0).text()));
        mod.setFork(toDouble(counters.eq(1).text()));
        return mod;
    }

    /**
     * Parses a counter as displayed on the page, expanding a trailing {@code K}/{@code k}
     * (thousands) suffix, e.g. {@code "1.1K"} becomes {@code 1100.0} and {@code "987"} becomes
     * {@code 987.0}.
     *
     * @param parm the displayed counter text; may be {@code null} or blank
     * @return the numeric value, or {@code 0.0} if {@code parm} is {@code null} or blank
     * @throws NumberFormatException if the text is not a number with an optional K suffix
     */
    private static Double toDouble(String parm) {
        if (parm == null) {
            return 0.0;
        }
        String text = parm.trim();
        if (text.isEmpty()) {
            return 0.0;
        }
        boolean thousands = text.endsWith("K") || text.endsWith("k");
        String number = thousands ? text.substring(0, text.length() - 1).trim() : text;
        BigDecimal value = new BigDecimal(number);
        // Scale in decimal arithmetic: 1.1 * 1000 in binary floating point yields
        // 1100.0000000000002 rather than 1100.
        return (thousands ? value.movePointRight(3) : value).doubleValue();
    }
}
三、运行程序
就是它lengleng / pig