To scrape data from 中国船舶网 (cnshipnet.com), the following core packages are used:
requests: 2.10.0
beautifulsoup4: 4.7.1
openpyxl: 2.6.2
html5lib (any recent release; it is the parser BeautifulSoup is told to use in the code below, so it must be installed as well)
A few notes before the code:
1、BaseFrame.__log__("开始采集中国船舶网的数据...") — BaseFrame.__log__() is just my own logging wrapper; replacing it with print works fine (a minimal stand-in is sketched after this list).
2、response = requests.get(self.base_url, timeout=30, headers=UserAgent().get_random_header(self.base_url)) — UserAgent().get_random_header(self.base_url) is a wrapper that returns a freshly randomized browser header for every request, so the site is less likely to flag the traffic as a bot (see the sketch after this list).
3、filepath = BaseConfig().CORPUS_ROOT + os.sep + "equipment_info.xlsx" — this builds the output file path; replace BaseConfig().CORPUS_ROOT with a directory of your own (a stand-in follows the list).
4、mmEntity = SpriderEntity() — an entity class that records every item already collected, so the program can be run repeatedly without scraping the same data twice; SpriderAccess does the lookup and the save. This is the de-duplication strategy (stand-ins for both are sketched after this list).
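The next few snippets sketch one possible stand-in for each of the helper classes above, so the spider can run without my project skeleton; if you paste them in, drop the matching from ... import lines from the script. They are my assumptions, not the original classes. First, BaseFrame simply forwards to print:

class BaseFrame:
    # Minimal stand-in for my logging wrapper: just print the message.
    # Only __log__ and __err__ are used by the spider below.
    @staticmethod
    def __log__(msg):
        print("[LOG] " + str(msg))

    @staticmethod
    def __err__(msg):
        print("[ERR] " + str(msg))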
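UserAgent can be as simple as picking a random User-Agent string per request; the strings below are examples I chose, not the ones my own class ships with:

import random


class UserAgent:
    # Minimal stand-in: return a random browser-like header for every request.
    _agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/13.0 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0",
    ]

    def get_random_header(self, referer):
        # The Referer makes the request look a little more like a real browser session.
        return {
            "User-Agent": random.choice(self._agents),
            "Referer": referer,
        }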
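BaseConfig only needs to expose CORPUS_ROOT; the current working directory below is just my placeholder, point it wherever you want the Excel file to live:

import os


class BaseConfig:
    # Minimal stand-in for my config class: the directory that holds equipment_info.xlsx.
    CORPUS_ROOT = os.getcwd()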
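SpriderEntity, SpriderAccess and Tools round out the de-duplication part. My real SpriderAccess persists to a database; the sketch below only keeps the seen records in memory, so it prevents duplicates within a single run but not across runs — swap in real persistence if you need the multi-run behaviour described above:

import datetime


class SpriderEntity:
    # Plain record of one collected item.
    def __init__(self):
        self.sprider_base_url = ""
        self.sprider_url = ""
        self.sprider_pic_title = ""
        self.sprider_pic_index = ""
        self.create_datetime = ""


class SpriderAccess:
    # Stand-in persistence layer: an in-memory set of (url, title) pairs.
    _seen = set()

    def query_sprider_entity_by_urlandtitle(self, url, title):
        # Return something non-None when the item was already collected.
        return (url, title) if (url, title) in self._seen else None

    def save_sprider(self, entity):
        self._seen.add((entity.sprider_url, entity.sprider_pic_title))


class Tools:
    @staticmethod
    def get_current_datetime():
        # Timestamp used to fill SpriderEntity.create_datetime.
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")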
Enough preamble; here is the spider itself:
import os

import requests
from bs4 import BeautifulSoup
from openpyxl import load_workbook

from access.sprider.SpriderAccess import SpriderAccess
from base.BaseConfig import BaseConfig
from base.BaseFrame import BaseFrame
from business.sprider.UserAgent import UserAgent
from object.entity.SpriderEntity import SpriderEntity
from plugin.Tools import Tools


class CnShipNet:
    base_url = "http://www.cnshipnet.com/equipment/"  # the URL being scraped
    page_count = 1  # page counter inside the category currently being crawled

    def __init__(self):
        pass

    def _sprider_equipment_(self):
        BaseFrame.__log__("开始采集中国船舶网的数据...")
        response = requests.get(self.base_url, timeout=30,
                                headers=UserAgent().get_random_header(self.base_url))
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, "html5lib")
        list_dl = soup.find_all('dl', attrs={"class": 'sub_list'})  # category blocks
        for dd in list_dl:
            for a in dd.find("dd").find_all("a"):
                url = a.get('href')  # first page of the category, e.g. sell_list_2.html
                response = requests.get(url, timeout=30,
                                        headers=UserAgent().get_random_header(url))
                response.encoding = 'utf-8'
                soup = BeautifulSoup(response.text, "html5lib")
                self.page_count = 1  # reset per category, otherwise only the first category gets paginated
                if soup.find("cite") is not None:
                    # the <cite> text holds the paging info; the total page count sits after the "/"
                    page_end_number = str(soup.find("cite").text).split("/")[1].replace("页", "")
                    while self.page_count <= int(page_end_number):  # stop after the last page
                        if self.page_count == 1:
                            self.get_content(soup)  # first page, already fetched above
                        else:
                            next_url = url + "/sell_list_" + str(self.page_count) + ".html"
                            response = requests.get(next_url)
                            response.encoding = 'utf-8'
                            soup = BeautifulSoup(response.text, "html5lib")
                            self.get_content(soup)
                        self.page_count = self.page_count + 1
                else:
                    self.get_content(soup)  # category has only one page

    def get_content(self, soup):
        for a in soup.find_all("a", attrs={"class": 'comtitle'}):
            url = a.get('href')  # URL of the detail page
            title = a.get('title')
            company = ""
            models = ""
            mmEntity = SpriderEntity()  # record of this item, used for de-duplication
            mmEntity.sprider_base_url = self.base_url
            mmEntity.create_datetime = Tools.get_current_datetime()
            mmEntity.sprider_url = url
            mmEntity.sprider_pic_title = title
            mmEntity.sprider_pic_index = str(1)
            if SpriderAccess().query_sprider_entity_by_urlandtitle(url, title) is None:
                try:
                    response = requests.get(url, timeout=30,
                                            headers=UserAgent().get_random_header(url))
                    response.encoding = 'utf-8'
                    soup = BeautifulSoup(response.text, "html5lib")
                    BaseFrame.__log__("采集" + title + "的信息..." + url)
                    content = soup.find("div", attrs={"id": 'content'}).text.strip()
                    ul = soup.find("ul", attrs={"class": 'fl'})
                    for li in ul.find_all("li"):
                        span = li.find("span", attrs={"class": 'black'})
                        if span is not None:
                            if span.text == "品牌:":  # the page's "brand" label
                                company = li.contents[1]
                            if span.text == "型号:":  # the page's "model" label
                                models = li.contents[1]
                except Exception as e:
                    BaseFrame.__err__("采集信息失败请重试" + str(e))
                # BaseFrame.__log__("准备数据库信息")
                try:
                    company = company.replace("'", "")
                    filepath = BaseConfig().CORPUS_ROOT + os.sep + "equipment_info.xlsx"
                    xl = load_workbook(filepath)
                    # get all sheet names (deprecated in newer openpyxl, still fine in 2.6.2)
                    xl_sheet_names = xl.get_sheet_names()
                    # locate the first sheet
                    xl_sheet = xl.get_sheet_by_name(xl_sheet_names[0])
                    # number of rows already written
                    row = xl_sheet.max_row
                    # append the new record
                    xl_sheet.cell(row=row + 1, column=1, value=title)
                    xl_sheet.cell(row=row + 1, column=2, value=models)
                    xl_sheet.cell(row=row + 1, column=3, value=company)
                    xl_sheet.cell(row=row + 1, column=4, value=content)
                    # save the workbook
                    xl.save(filepath)
                    SpriderAccess().save_sprider(mmEntity)
                except Exception as e:
                    BaseFrame.__log__("保存Excel失败...跳过本次存储" + str(e))
            else:
                BaseFrame.__log__(title + "数据采集过已经跳过...")


if __name__ == '__main__':
    CnShipNet()._sprider_equipment_()
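One caveat: load_workbook() only opens a workbook that already exists, so equipment_info.xlsx has to be created once before the first run. A minimal sketch (the header names are my own choice; the four columns simply mirror what get_content() writes):

import os

from openpyxl import Workbook

from base.BaseConfig import BaseConfig  # or the stand-in sketched earlier

filepath = BaseConfig().CORPUS_ROOT + os.sep + "equipment_info.xlsx"
if not os.path.exists(filepath):
    wb = Workbook()
    ws = wb.active
    # one header row; the spider appends title, model, brand and detail text below it
    ws.append(["title", "models", "company", "content"])
    wb.save(filepath)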
Apologies if the code is a bit rough around the edges... here is a screenshot of the result: