一、运行环境
pycharm 2020 社区版
python 3.7
beautifulsoup4 4.11.1
二、实际代码
import os
import re

import requests
from bs4 import BeautifulSoup

from access.sprider.SpriderAccess import SpriderAccess
from base.BaseConfig import BaseConfig
from base.BaseFrame import BaseFrame
from object.entity.SpriderEntity import SpriderEntity
from plugin.Tools import Tools


class Netbian:
    """Scraper that walks the columns of http://pic.netbian.com and downloads
    every wallpaper image exactly once (previously-seen images are skipped via
    the SpriderAccess persistence layer)."""

    page_count = 1                     # current page index within a column (reset per column)
    base_url = "http://pic.netbian.com"  # site root; e.g. http://pic.netbian.com/e/search/result/?searchid=2543
    save_path = BaseConfig().CORPUS_ROOT + os.sep + "Netbian"  # e.g. "/Users/zhangyu/Pictures/Wallpaper/"
    second_url = ("4kmeinv", "4kfengjing", "4kbeijing", "4kyouxi")  # column slugs to crawl

    def __init__(self):
        pass

    def sprider_wall_paper(self):
        """Crawl every configured column page by page, downloading each image.

        For each column: fetch the first list page, read the last-page number
        from the pager, then iterate pages 1..N, delegating per-page work to
        ``_sprider_list_page``.
        """
        BaseFrame.__log__("开始采集首页彼岸图网网站的图片...")
        for column in self.second_url:
            url = self.base_url + "/" + column + "/"
            response = requests.get(url)
            response.encoding = 'gbk'  # site serves GBK-encoded pages
            soup = BeautifulSoup(response.text, "html5lib")
            try:
                # The pager's "..." span ("slh") is immediately followed by the
                # link to the last page; its text is the total page count.
                page_end_url = soup.find('span', attrs={"class": 'slh'}).find_next_siblings()
                page_end_num = page_end_url[0].text
                # BUG FIX: reset the counter for every column. The shared class
                # attribute was never reset, so after the first column finished
                # the while-condition was immediately false and every remaining
                # column was silently skipped.
                self.page_count = 1
                while self.page_count <= int(page_end_num):  # stop once all pages are done
                    if self.page_count == 1:
                        # First page of the column: reuse the soup fetched above.
                        self._sprider_list_page(soup, url, column)
                    else:
                        next_pager_url = self.base_url + "/" + column + "/index_" + str(self.page_count) + ".html"
                        try:
                            response = requests.get(next_pager_url)
                            response.encoding = 'gbk'
                            pager_soup = BeautifulSoup(response.text, "html5lib")
                            self._sprider_list_page(pager_soup, url, column)
                        except Exception as e:
                            # BUG FIX: the original concatenated a str with the
                            # exception object (TypeError), and its `continue`
                            # skipped the increment below, retrying the same
                            # page forever on a persistent error.
                            BaseFrame.__log__("请求站点过程发生错误..." + str(e))
                    self.page_count = self.page_count + 1  # advance past the page just handled
            except Exception as e:
                BaseFrame.__err__(str(e))
                continue

    def _sprider_list_page(self, soup, url, column):
        """Process one list page: find every detail-page link, fetch it, and
        download its image unless it was downloaded before.

        :param soup:   BeautifulSoup of the list page.
        :param url:    column URL, stored on the entity and used in log lines.
        :param column: column slug, used as the on-disk subdirectory name.
        """
        list_url = soup.find_all('a', attrs={"target": '_blank'})
        regx = r"tupian/\d{1,5}\.html"  # detail pages look like tupian/12345.html
        images_url = re.findall(regx, str(list_url))
        for iurl in images_url:
            image_full_url = self.base_url + "/" + iurl
            response = requests.get(image_full_url)
            response.encoding = 'gbk'
            detail_soup = BeautifulSoup(response.text, "html5lib")
            # The <a id="img"> anchor wraps the full-size <img> element.
            for image_obj in detail_soup.find('a', attrs={"id": 'img'}).children:
                pic_url = self.base_url + image_obj.get("src")
                pic_title = image_obj.get("alt")
                BaseFrame.__log__("采集" + pic_title + "的图片..." + url)
                mmEntity = SpriderEntity()  # one record per image; used to skip duplicates
                mmEntity.sprider_base_url = self.base_url
                mmEntity.create_datetime = Tools.get_current_datetime()
                mmEntity.sprider_url = url
                mmEntity.sprider_pic_title = pic_title
                mmEntity.sprider_pic_index = str(self.page_count)
                if SpriderAccess().query_sprider_entity_by_urlandtitle(pic_url, pic_title) is None:
                    SpriderAccess().save_sprider(mmEntity)
                    self.down_pic(pic_url, pic_title, column)
                else:
                    BaseFrame.__log__("下载过已经跳过。")

    # region download image
    def down_pic(self, pic_url, pic_title, second_path):
        """Download one image to ``save_path/second_path/<pic_title>.jpg``.

        :param pic_url:     absolute URL of the image file.
        :param pic_title:   alt text of the image; becomes the file name.
        :param second_path: column slug; becomes the subdirectory name.
        """
        try:
            # Referer header is required; the site rejects hot-link requests.
            headers = {"Referer": pic_url,
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                                     '(KHTML, like Gecko)Chrome/62.0.3202.94 Safari/537.36'}
            content = requests.get(pic_url, headers=headers)
            real_path = self.save_path + os.sep + second_path + os.sep
            os.makedirs(real_path, exist_ok=True)  # replaces the exists() pre-check
            if content.status_code == 200:
                pic_cun = real_path + pic_title + '.jpg'
                # `with` guarantees the handle is closed even if write() raises
                # (the original leaked the handle on error).
                with open(pic_cun, 'wb') as fp:
                    fp.write(content.content)
        except Exception as e:
            BaseFrame.__err__("下载图片出现错误" + str(e))
    # endregion


if __name__ == '__main__':
    Netbian().sprider_wall_paper()