Python 爬虫示例:获取主页面链接;次级页面链接通过主页面元素获取,从而避免 JS 生成的变动值;保存数据时分批写入,避免数据丢失。

简介: Python 爬虫示例:获取主页面链接;次级页面链接通过主页面元素获取,从而避免 JS 生成的变动值;保存数据时分批写入,避免数据丢失。
# -*- coding: utf-8 -*-
import re
from math import ceil

import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook, load_workbook


class CveDetailSpider2():
    """Crawler for www.cvedetails.com (plain requests/BeautifulSoup version).

    Walks the per-year vulnerability listing pages, follows every CVE
    detail link found in the listing table, and appends one row per CVE to
    an Excel workbook.  Detail URLs are taken from anchors in the listing
    page (instead of being rebuilt from JS-generated values), and each row
    is saved to disk immediately so a crash loses at most one record.
    """

    # Running count of listing rows seen so far; drives the skip/stop
    # thresholds in parse1 (crude resume/batching support).
    result_num = 0
    name = 'cve_detail'
    allowed_domains = ['https://www.cvedetails.com']
    start_urls = [
        "https://www.cvedetails.com/vulnerability-list/year-" + str(i) + "/vulnerabilities.html" for i in range(1999, 2000)
    ]
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                             ' AppleWebKit/537.36 (KHTML, like Gecko)'
                             ' Chrome/58.0.3029.110 Safari/537.3'}

    def get_url(self, page, year, trc):
        """Return the paginated vulnerability-list URL for page/year/trc."""
        return "https://www.cvedetails.com/vulnerability-list.php?vendor_id=0&product_id=0&version_id=0&page={}&hasexp=0&opdos=0&opec=0&opov=0&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0&cvssscoremax=0&year={}&month=0&cweid=0&order=1&trc={}&sha=b87d72f681722fd5f26c1153b2202a4f05acfff1".format(
            page, year, trc)

    def parse(self, start_y=1999, end_y=2000):
        """Crawl the year index pages from start_y to end_y inclusive.

        For each year, the paging <div id="pagingb"> of the year's index
        page yields one anchor per listing page; each listing page is then
        handed to parse1, whose return value steers the paging loop.
        """
        for year in range(start_y, end_y + 1):
            response = requests.get(
                "https://www.cvedetails.com/vulnerability-list/year-" + str(year) + "/vulnerabilities.html",
                headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            # Paging block: one <a> per listing page of this year.
            rows = soup.find_all('div', {'id': 'pagingb'})
            for row in rows[0].find_all('a', href=True):
                newurl = "https://www.cvedetails.com" + row['href']
                tag = self.parse1(newurl, year)
                if tag == 0:
                    # Still below the skip offset; try the next page.
                    print('continue!---', year)
                    continue
                if tag == 1:
                    # Batch limit reached or year exhausted.
                    print('break!---', year)
                    break

    def parse1(self, url, year):
        """Scrape one listing page and dispatch its detail links.

        Returns 0 to skip this page (below the resume offset), 1 to stop
        paging for this year, or None (implicit) to keep paging.
        """
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        # One <tr class="srrowns"> per CVE in the listing table.
        rows = soup.find_all('tr', {'class': 'srrowns'})
        self.result_num += len(rows)
        # Skip the first N records (resume after a previous run).
        if self.result_num < 6000:
            print("跳过 result_num:%d" % self.result_num)
            return 0
        # Stop once this batch has processed enough records.
        if self.result_num > 10000:
            print("停止 result_num:%d" % self.result_num)
            self.result_num = 0
            return 1
        print('self.result_num:', self.result_num)
        for row in rows:
            durl = "https://www.cvedetails.com" + row.find_all('a', href=True)[0]['href']
            print(durl)
            # Detail URLs end in .../cve/CVE-<year>-<id>/ ; once the year
            # embedded in the URL stops matching, this year is complete.
            if durl.split('/')[-2].split('-')[1] != str(year):
                print('{0}年数据已完成!!跳到下一年'.format(year))
                self.result_num = 0
                return 1
            # Known-problematic detail pages are skipped outright.
            if durl not in ['https://www.cvedetails.com/cve/CVE-2010-4609/',
                            'https://www.cvedetails.com/cve/CVE-2010-2812/',
                            'https://www.cvedetails.com/cve/CVE-2014-0629/',
                            'https://www.cvedetails.com/cve/CVE-2017-14064/',
                            'https://www.cvedetails.com/cve/CVE-2017-11027/',
                            'https://www.cvedetails.com/cve/CVE-2017-8760/']:
                self.parse2(durl)

    def parse2(self, path2):
        """Scrape one CVE detail page and append the record to the workbook.

        Every field defaults to an empty string/list so a partially filled
        page still produces a row.  The row is written to
        'output(2015).xlsx' immediately (workbook is created with a header
        row on first use).
        """
        response = requests.get(path2, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.find_all('h1')
        cveid = ""
        if len(rows) > 0:
            print(rows[0].find_all('a', href=True)[0].text)
            cveid = rows[0].find_all('a', href=True)[0].text
        # NOTE(review): exist_cv1 is a module-level list built in the
        # __main__ block; this method requires the script entry point to
        # have populated it first.
        if cveid in exist_cv1:
            print('{0}已存在,跳过!!'.format(cveid))
            return
        rows = soup.find_all('div', {'class': 'cvedetailssummary'})
        describe = ''
        if len(rows) > 0:
            describe_str = rows[0].text
            describe_str = describe_str.strip()
            describe = describe_str.split('\t')[0]
        score = ''
        # NOTE(review): this guard (and the vulntype one below) tests the
        # result of the PREVIOUS find_all, not its own — preserved as-is
        # to keep behavior identical; confirm whether this is intended.
        if len(rows) > 0:
            rows = soup.find_all('div', {'class': 'cvssbox'})
            if len(rows) > 0:
                print(rows[0].text)
                score = rows[0].text
        vulntype = ""
        if len(rows) > 0:
            rows = soup.find_all('span', {'class': 'vt_dos'})
            if len(rows) > 0:
                print(rows[0].text)
                vulntype = rows[0].text
        producttype = []
        vendor = []
        product = []
        version = []
        # Affected-products table: skip the header row, read one product
        # per remaining <tr>.
        rows_table = soup.find_all('table', {'class': 'listtable', 'id': 'vulnprodstable'})
        if len(rows_table) > 0:
            rows_tr = rows_table[0].find_all('tr')
            tr_num = len(rows_tr)
            if tr_num > 1:
                for i in range(1, tr_num):
                    rows_td = rows_tr[i].find_all('td')
                    if len(rows_td) > 1:
                        producttype.append(rows_td[1].text.strip())
                        vendor.append(rows_td[2].text.strip())
                        product.append(rows_td[3].text.strip())
                        version.append(rows_td[4].text.strip())
        item = {}
        # sorted() makes the '|'-joined columns deterministic (a bare
        # set() iterates in arbitrary order between runs).
        item['cveid'], item['describe'], item['score'], item['vulntype'], item['producttype'], item['vendor'], item['product'], item[
            'version'] = cveid, describe, score, vulntype, "|".join(sorted(set(producttype))), "|".join(sorted(set(vendor))), "|".join(sorted(set(product))), "|".join(sorted(set(version)))
        print(item)
        filename = 'output(2015).xlsx'
        # Fix: the original appended/saved inside a `finally:` clause, so
        # any exception other than FileNotFoundError left `worksheet`
        # unbound and raised a masking NameError.  Now the row is written
        # after the handler; unexpected errors simply propagate.
        try:
            workbook = load_workbook(filename)
            worksheet = workbook.active
        except FileNotFoundError:
            # First run: create the workbook and write the header row.
            workbook = Workbook()
            worksheet = workbook.active
            header = ['cveid', 'describe', 'score', 'vulntype', 'producttype', 'vendor', 'product', 'version']
            worksheet.append(header)
        values = [item['cveid'], item['describe'], item['score'], item['vulntype'], item['producttype'], item['vendor'], item['product'], item[
            'version']]
        worksheet.append(values)
        # Save per record: a crash loses at most the current CVE.
        workbook.save(filename)
        workbook.close()
def get_cve_data(start_page: int, num_records: int) -> None:
    """Fetch one vulnerability-list page and print each CVE's parsed fields.

    Requests `num_records` rows starting at `start_page`, then follows the
    first anchor of every listing row to its detail page and prints a dict
    with cveid/score/vulntype/producttype/vendor/product/version.  This is
    the simple, print-only variant; it does not persist anything.
    """
    url = f'https://www.cvedetails.com/vulnerability-list.php?' \
          f'vendor_id=0&product_id=0&version_id=0&page={start_page}' \
          f'&numrows={num_records}&hasexp=0&opdos=0&opec=0&opov=0' \
          f'&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0' \
          f'&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0' \
          f'&cvssscoremax=0&year=0&month=0&cweid=0&order=1&trc=0&sha=' \
          f'8a181058fa3202146b2bbf6c9a982505c6d25cc3'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
               ' AppleWebKit/537.36 (KHTML, like Gecko)'
               ' Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    # One <tr class="srrowns"> per CVE in the listing table.
    rows = soup.find_all('tr', {'class': 'srrowns'})
    for row in rows:
        path2 = "https://www.cvedetails.com" + row.find_all('a', href=True)[0]['href']
        # Fix: a leftover debug assignment used to overwrite path2 with a
        # hard-coded CVE-1999-1567 URL here, so every iteration fetched
        # the same page instead of the row's own detail page.
        print(path2)
        # Distinct names for the detail page so the outer listing
        # soup/rows bindings are not clobbered.
        detail_resp = requests.get(path2, headers=headers)
        detail_soup = BeautifulSoup(detail_resp.content, 'html.parser')
        h1 = detail_soup.find_all('h1')
        print(h1[0].find_all('a', href=True)[0].text)
        cveid = h1[0].find_all('a', href=True)[0].text
        boxes = detail_soup.find_all('div', {'class': 'cvssbox'})
        print(boxes[0].text)
        score = boxes[0].text
        spans = detail_soup.find_all('span', {'class': 'vt_dos'})
        print(spans[0].text)
        vulntype = spans[0].text
        # First data row of the affected-products table.
        tables = detail_soup.find_all('table', {'class': 'listtable', 'id': 'vulnprodstable'})
        tds = tables[0].find_all('td')
        print(tds[1].text.strip())
        producttype = tds[1].text.strip()
        vendor = tds[2].text.strip()
        product = tds[3].text.strip()
        version = tds[4].text.strip()
        item = {}
        item['cveid'], item['score'], item['vulntype'], item['producttype'], item['vendor'], item['product'], item[
            'version'] = cveid, score, vulntype, producttype, vendor, product, version
        print(item)
if __name__ == '__main__':
    # Build the module-level exist_cv1 list of already-crawled CVE ids
    # (parse2 reads this name to skip duplicates) from the two existing
    # result workbooks, then crawl a single year.
    d1 = pd.read_excel('all_spyder(中文).xlsx')
    exist_cv1 = d1['cve'].tolist()
    d1 = pd.read_excel('output(2015).xlsx')
    exist_cv2 = d1['cveid'].tolist()
    # Merge both id lists and drop duplicates.
    exist_cv1 = list(set(exist_cv1 + exist_cv2))
    print(exist_cv1[:10])
    crawler_tool = CveDetailSpider2()
    crawler_tool.parse(start_y=2017, end_y=2017)
目录
相关文章
|
6天前
|
JSON 数据处理 数据格式
Python中批量提取[]括号内第一个元素的四种方法
Python中批量提取[]括号内第一个元素的四种方法
23 1
|
6天前
|
存储 运维 数据挖掘
Python列表中每个元素前面连续重复次数的数列统计
Python列表中每个元素前面连续重复次数的数列统计
13 1
|
4天前
|
数据采集 数据挖掘 Python
使用Python构建简单网页爬虫的技术指南
【5月更文挑战第17天】使用Python构建简单网页爬虫的教程,涉及`requests`和`BeautifulSoup4`库。首先安装所需库,然后发送HTTP GET请求获取HTML内容。利用`BeautifulSoup`解析HTML,找到目标元素,如`&lt;h2&gt;`标签内的新闻标题。处理相对链接,将它们转化为绝对URL。添加异常处理以应对网络问题,同时遵循网站的`robots.txt`规则。此爬虫适用于数据分析和市场研究等场景。
|
6天前
|
存储 人工智能 测试技术
python自动化测试实战 —— CSDN的Web页面自动化测试
python自动化测试实战 —— CSDN的Web页面自动化测试
196 0
|
6天前
|
前端开发 JavaScript TensorFlow
如何将训练好的Python模型给JavaScript使用?
本文介绍了如何将TensorFlow模型转换为Web格式以实现浏览器中的实际应用。首先,简述了已有一个能够检测扑克牌的TensorFlow模型,目标是将其部署到Web上。接着,讲解了TensorFlow.js Converter的作用,它能将Python API创建的GraphDef模型转化为TensorFlow.js可读取的json格式,用于浏览器中的推理计算。然后,详细说明了Converter的安装、用法及不同输入输出格式,并提供了转换命令示例。最后,文中提到了模型转换后的实践步骤,包括找到导出的模型、执行转换命令以及在浏览器端部署模型的流程。
18 3
|
6天前
|
Web App开发 前端开发 测试技术
【如何学习Python自动化测试】—— 页面元素定位
【如何学习Python自动化测试】—— 页面元素定位
4 1
|
6天前
|
数据采集 Web App开发 数据处理
Lua vs. Python:哪个更适合构建稳定可靠的长期运行爬虫?
Lua vs. Python:哪个更适合构建稳定可靠的长期运行爬虫?
|
6天前
|
编解码 JavaScript 前端开发
python如何解决js逆向混淆?
python如何解决js逆向混淆?
8 0
|
6天前
|
Linux 数据安全/隐私保护 iOS开发
如何将python命令链接到Python3
如何将python命令链接到Python3
13 0
|
6天前
|
数据采集 Web App开发 Java
Python 爬虫:Spring Boot 反爬虫的成功案例
Python 爬虫:Spring Boot 反爬虫的成功案例