正文
四、webmagic
介绍
WebMagic是参考业界最优秀的爬虫框架Scrapy实现的，使用了HttpClient、Jsoup等Java世界最成熟的工具。
架构
WebMagic的结构分为Downloader(下载)、PageProcessor(处理)、Scheduler(管理)、Pipeline(持久化)四个组件，并由Spider(容器)将它们彼此组织起来，使它们可以互相交互、流程化地执行。总体架构图如下：
组件
Downloader
负责从网络上下载页面,以便后续处理,webmagic默认使用httpclient
PageProcessor
负责解析页面,抽取有用信息,以及发现新的链接,使用Jsoup来解析HTML
Scheduler
负责管理待抓取URL,以及一些去重工作。webmagic默认使用JDK自带的内存队列来管理URL,用集合去重,支持redis分布式管理
Pipeline
负责抽取结果的处理,包括计算、持久化到文件、数据库等
XSoup
基于Jsoup开发的一款XPath解析器
五、微服务集成
数据库表设计
-- ----------------------------
-- Table structure for boot_link
-- Stores the article URLs to crawl, together with the site type of each URL.
-- ----------------------------
DROP TABLE IF EXISTS `boot_link`;
CREATE TABLE `boot_link` (
  `id` bigint(20) NOT NULL COMMENT 'id',
  `uri` varchar(400) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '文章链接',
  `type` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '网站类型',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;

-- ----------------------------
-- Records of boot_link
-- ----------------------------
INSERT INTO `boot_link` VALUES ('11', 'https://www.cnblogs.com/koushenhai/p/12595630.html', 'bky');
INSERT INTO `boot_link` VALUES ('12', 'https://kcloud.blog.csdn.net/article/details/118633942', 'csdn');
INSERT INTO `boot_link` VALUES ('20', 'https://kcloud.blog.csdn.net/article/details/121491124', 'csdn');
INSERT INTO `boot_link` VALUES ('33', 'https://kcloud.blog.csdn.net/article/details/82109656', 'csdn');
INSERT INTO `boot_link` VALUES ('41', 'https://kcloud.blog.csdn.net/article/details/117769662', 'csdn');
INSERT INTO `boot_link` VALUES ('49', 'https://kcloud.blog.csdn.net/article/details/118660073', 'csdn');
INSERT INTO `boot_link` VALUES ('57', 'https://kcloud.blog.csdn.net/article/details/119720174', 'csdn');
INSERT INTO `boot_link` VALUES ('65', 'https://kcloud.blog.csdn.net/article/details/123179670', 'csdn');
INSERT INTO `boot_link` VALUES ('66', 'https://kcloud.blog.csdn.net/article/details/117635759', 'csdn');
INSERT INTO `boot_link` VALUES ('74', 'https://kcloud.blog.csdn.net/article/details/117771583', 'csdn');
INSERT INTO `boot_link` VALUES ('78', 'https://kcloud.blog.csdn.net/article/details/123039609', 'csdn');
INSERT INTO `boot_link` VALUES ('79', 'https://kcloud.blog.csdn.net/article/details/82588914', 'csdn');
INSERT INTO `boot_link` VALUES ('96', 'https://kcloud.blog.csdn.net/article/details/108021143', 'csdn');
INSERT INTO `boot_link` VALUES ('118', 'https://kcloud.blog.csdn.net/article/details/121305244', 'csdn');
INSERT INTO `boot_link` VALUES ('128', 'https://kcloud.blog.csdn.net/article/details/82110125', 'csdn');
INSERT INTO `boot_link` VALUES ('129', 'https://kcloud.blog.csdn.net/article/details/123630814', 'csdn');
INSERT INTO `boot_link` VALUES ('130', 'https://kcloud.blog.csdn.net/article/details/116420798', 'csdn');
INSERT INTO `boot_link` VALUES ('131', 'https://kcloud.blog.csdn.net/article/details/123484520', 'csdn');
INSERT INTO `boot_link` VALUES ('132', 'https://kcloud.blog.csdn.net/article/details/123013305', 'csdn');
INSERT INTO `boot_link` VALUES ('133', 'https://kcloud.blog.csdn.net/article/details/123390833', 'csdn');
INSERT INTO `boot_link` VALUES ('134', 'https://kcloud.blog.csdn.net/article/details/123311487', 'csdn');
INSERT INTO `boot_link` VALUES ('135', 'https://kcloud.blog.csdn.net/article/details/123292276', 'csdn');
INSERT INTO `boot_link` VALUES ('136', 'https://kcloud.blog.csdn.net/article/details/123123229', 'csdn');
INSERT INTO `boot_link` VALUES ('137', 'https://kcloud.blog.csdn.net/article/details/116704223', 'csdn');
INSERT INTO `boot_link` VALUES ('145', 'https://kcloud.blog.csdn.net/article/details/123739314', 'csdn');
INSERT INTO `boot_link` VALUES ('146', 'https://kcloud.blog.csdn.net/article/details/123688809', 'csdn');
INSERT INTO `boot_link` VALUES ('147', 'https://kcloud.blog.csdn.net/article/details/123673741', 'csdn');
INSERT INTO `boot_link` VALUES ('148', 'https://kcloud.blog.csdn.net/article/details/123628721', 'csdn');
INSERT INTO `boot_link` VALUES ('149', 'https://kcloud.blog.csdn.net/article/details/123599384', 'csdn');
INSERT INTO `boot_link` VALUES ('150', 'https://kcloud.blog.csdn.net/article/details/122181814', 'csdn');
INSERT INTO `boot_link` VALUES ('151', 'https://kcloud.blog.csdn.net/article/details/121557788', 'csdn');
INSERT INTO `boot_link` VALUES ('159', 'https://kcloud.blog.csdn.net/article/details/116449621', 'csdn');
INSERT INTO `boot_link` VALUES ('160', 'https://kcloud.blog.csdn.net/article/details/83623118', 'csdn');
INSERT INTO `boot_link` VALUES ('161', 'https://kcloud.blog.csdn.net/article/details/84777724', 'csdn');
INSERT INTO `boot_link` VALUES ('162', 'https://kcloud.blog.csdn.net/article/details/105587614', 'csdn');
INSERT INTO `boot_link` VALUES ('163', 'https://kcloud.blog.csdn.net/article/details/83515122', 'csdn');
INSERT INTO `boot_link` VALUES ('164', 'https://kcloud.blog.csdn.net/article/details/83451040', 'csdn');
INSERT INTO `boot_link` VALUES ('165', 'https://kcloud.blog.csdn.net/article/details/117252826', 'csdn');
INSERT INTO `boot_link` VALUES ('166', 'https://kcloud.blog.csdn.net/article/details/84826176', 'csdn');
INSERT INTO `boot_link` VALUES ('167', 'https://kcloud.blog.csdn.net/article/details/120031600', 'csdn');
INSERT INTO `boot_link` VALUES ('168', 'https://kcloud.blog.csdn.net/article/details/119685953', 'csdn');
INSERT INTO `boot_link` VALUES ('169', 'https://kcloud.blog.csdn.net/article/details/120147123', 'csdn');
INSERT INTO `boot_link` VALUES ('170', 'https://kcloud.blog.csdn.net/article/details/120245035', 'csdn');
INSERT INTO `boot_link` VALUES ('171', 'https://kcloud.blog.csdn.net/article/details/120190383', 'csdn');
INSERT INTO `boot_link` VALUES ('179', 'https://kcloud.blog.csdn.net/article/details/94590629', 'csdn');
INSERT INTO `boot_link` VALUES ('187', 'https://kcloud.blog.csdn.net/article/details/116949872', 'csdn');
INSERT INTO `boot_link` VALUES ('192', 'https://kcloud.blog.csdn.net/article/details/123789292', 'csdn');
INSERT INTO `boot_link` VALUES ('193', 'https://kcloud.blog.csdn.net/article/details/123780832', 'csdn');
INSERT INTO `boot_link` VALUES ('194', 'https://kcloud.blog.csdn.net/article/details/123771040', 'csdn');
INSERT INTO `boot_link` VALUES ('195', 'https://kcloud.blog.csdn.net/article/details/122522290', 'csdn');
INSERT INTO `boot_link` VALUES ('196', 'https://kcloud.blog.csdn.net/article/details/123833614', 'csdn');

-- ----------------------------
-- Table structure for boot_article
-- Stores the title and content of each crawled article.
-- The column comments previously repeated the boot_link ones ("article link" /
-- "site type"); they now describe the actual columns.
-- ----------------------------
DROP TABLE IF EXISTS `boot_article`;
CREATE TABLE `boot_article` (
  `id` bigint(20) NOT NULL COMMENT 'id',
  `title` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '文章标题',
  `content` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '文章内容',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;
微服务
引入依赖
<!-- WebMagic core; the log4j12 SLF4J binding and Spring Boot's logging starter are excluded. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
        <exclusion>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-logging</artifactId>
        </exclusion>
    </exclusions>
</dependency>

<!-- WebMagic extension module; same exclusions as above, plus slf4j-api. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </exclusion>
        <exclusion>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-logging</artifactId>
        </exclusion>
    </exclusions>
</dependency>

<!-- ReflectASM, pinned to 1.11.7. -->
<dependency>
    <groupId>com.esotericsoftware</groupId>
    <artifactId>reflectasm</artifactId>
    <version>1.11.7</version>
</dependency>

<!-- spring-aspects; no version is given here, so it is presumably managed by the parent POM / BOM. -->
<dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-aspects</artifactId>
</dependency>
代码架构
核心代码(以采集csdn为例)
创建CsdnArticleSpider类
/** * 爬虫默认实现 * @author Kou Shenhai * @version 1.0 * @date 2020/11/15 0015 下午 4:40 */ @Configuration @Slf4j public class CsdnArticleSpider implements PageProcessor { private ProcessStrategy processStrategy; private static final int SLEEP_TIME = 3000; private static final int TIMEOUT = 3000; private static final int RETRY_TIMES = 10; private static final int RETRY_SLEEP_TIME = 3000; private static final String CHARSET = "utf-8"; private static final String DOMAIN = "csdn.net"; private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"; /** * * @param site 抓取网站的相关配置,包括编码、重试次数、抓取间隔 */ private Site site = Site .me() .setRetryTimes(RETRY_TIMES) .setRetrySleepTime(RETRY_SLEEP_TIME) .setDomain(DOMAIN) .setSleepTime(SLEEP_TIME) .setTimeOut(TIMEOUT) .setCharset(CHARSET) .setUserAgent(USER_AGENT) .addHeader("Cookie",""); public void setProcessStrategy(ProcessStrategy processStrategy) { this.processStrategy = processStrategy; } /** * * @param page process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 */ @Override public void process(Page page) { if (processStrategy == null) { throw new NullPointerException(); } /** * 开始 */ preProcess(page); //策略模式 processStrategy.process(page); /** * 结束 */ afterProcess(page); } @Override public Site getSite() { return site; } public Spider getSpider() { return Spider.create(this); } /** * 下面两个方法用于扩展自定义的process方法,比如加入迭代url等等,主要逻辑放在processStategy */ protected void preProcess(Page page) { log.info("开始爬取..."); } protected void afterProcess(Page page) { log.info("完成爬取..."); } }
创建CsdnArticleHandler
/** * @author Kou Shenhai */ @Component public class CsdnArticleHandler extends AbstractArticleHandler{ @Autowired private CsdnArticleSpider csdnArticleSpider; @Autowired private ArticlePipeline articlePipeline; @Override protected ArticleTypeEnum getArticleTypeEnum() { return ArticleTypeEnum.CSDN; } @Autowired private PipelineObserver pipelineObserver; @Override @Async protected void articlePull(String[] uris) { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); articlePipeline.addObserver(pipelineObserver); csdnArticleSpider.setProcessStrategy(new IteratorProcess(new CsdnArticleProcess())); csdnArticleSpider.getSpider().addUrl(uris) .setDownloader(httpClientDownloader) // 开启线程抓取 .thread(2 * Runtime.getRuntime().availableProcessors()) .addPipeline(articlePipeline) //启动爬虫 .start(); } }
创建ArticlePipeline
/**
 * Pipeline that forwards every crawl result to its registered observers.
 *
 * <p>All observer-management methods and {@link #notifyObservers(Object)} are
 * {@code synchronized} on this instance.
 */
public class ArticlePipeline implements CallablePipeline {

    /** Registered observers, without duplicates. */
    private Vector<Observer> obs = new Vector<>(1);

    /**
     * Passes all fields of the result to the observers.
     *
     * @param resultItems extraction result of one page
     * @param task        the task that produced the result (unused here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        notifyObservers(resultItems.getAll());
    }

    /**
     * Registers an observer unless it is already registered.
     *
     * @param o observer to add
     * @throws NullPointerException if {@code o} is {@code null}
     */
    @Override
    public synchronized void addObserver(Observer o) {
        if (o == null) {
            throw new NullPointerException();
        }
        if (obs.contains(o)) {
            return;
        }
        obs.add(o);
    }

    /**
     * Notifies every registered observer, most recently added first.
     *
     * <p>NOTE(review): observers are invoked while this object's monitor is held, so
     * notifications from concurrent callers are serialized - confirm this is intended.
     *
     * @param arg argument handed to each observer's {@code update}
     */
    @Override
    public synchronized void notifyObservers(Object arg) {
        // Snapshot first so the loop works on a stable copy of the observer list.
        Observer[] snapshot = obs.toArray(new Observer[0]);
        int idx = snapshot.length;
        while (--idx >= 0) {
            snapshot[idx].update(this, arg);
        }
    }

    /**
     * Removes an observer; does nothing if it was not registered.
     *
     * @param o observer to remove
     */
    @Override
    public synchronized void deleteObserver(Observer o) {
        obs.remove(o);
    }
}
创建CsdnArticleProcess
/**
 * Extraction strategy for a CSDN article page: reads the article body and the title via
 * XPath and stores them on the page as the {@code content} and {@code title} fields.
 *
 * @author Kou Shenhai
 * @version 1.0
 * @date 2021/4/24 0024 4:05 PM
 */
public class CsdnArticleProcess implements ProcessStrategy {

    /** XPath of the article element. */
    private static final String CONTENT_XPATH = "//*[@id='mainBox']/main/div[1]/article";

    /** XPath of the title text. */
    private static final String TITLE_XPATH = "//*[@id='articleContentId']/text()";

    private static final String CONTENT_FIELD = "content";
    private static final String TITLE_FIELD = "title";

    /**
     * Extracts content and title from the page and records them as result fields.
     *
     * @param page the downloaded article page
     */
    @Override
    public void process(Page page) {
        final String articleHtml = page.getHtml().xpath(CONTENT_XPATH).get();
        final String articleTitle = page.getHtml().xpath(TITLE_XPATH).get();

        // Same insertion order as before: content first, then title.
        page.putField(CONTENT_FIELD, articleHtml);
        page.putField(TITLE_FIELD, articleTitle);
    }
}
六、测试
参考教程:菜鸟教程-设计模式
参考教程:webmagic文档
本项目仅作为技术学习研究使用,禁止用于任何商业用途,禁止任何损害网站利益的行为
本项目仅作为技术学习研究使用,禁止用于任何商业用途,禁止任何损害网站利益的行为
本项目仅作为技术学习研究使用,禁止用于任何商业用途,禁止任何损害网站利益的行为