python爬虫示例:获取主页面链接;次级页面链接通过主页面元素获取,从而避免依赖 JS 生成的变动值;数据分批次保存,避免中途失败导致数据丢失。

简介: 本示例爬取 cvedetails.com 的漏洞列表:从主列表页提取分页与详情链接(不依赖 JS 动态生成的值),逐条抓取 CVE 详情并即时写入 Excel,防止数据丢失。
# -*- coding: utf-8 -*-# import scrapyimportpandasaspdfrommathimportceilimportreimportrequestsimportrefrombs4importBeautifulSoupfromopenpyxlimportWorkbookfromopenpyxlimportload_workbook# from cve_details.items import CveDetailsItem# class CveDetailSpider(scrapy.Spider):#     name = 'cve_detail'#     allowed_domains = ['https://www.cvedetails.com']#     start_urls = [#         "https://www.cvedetails.com/vulnerability-list/year-" + str(i) + "/vulnerabilities.html" for i in range(1999, 2021)#     ]##     def get_url(self, page, year, trc):#         return "https://www.cvedetails.com/vulnerability-list.php?vendor_id=0&product_id=0&version_id=0&page={}&hasexp=0&opdos=0&opec=0&opov=0&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0&cvssscoremax=0&year={}&month=0&cweid=0&order=1&trc={}&sha=ef7bb39664f094781e7b403da0e482830f5837d6".format \#             (page, year, trc)##     def parse(self, response):#         # 得到页数,生成url#         nums = response.selector.xpath('//div[@id="pagingb"]/b/text()').get()   # 获取cve的数量#         pages = ceil(int(nums ) /50)                                              # 算出页数#         for year in range(1999, 2021):#             for page in range(1, page s +1):#                 newurl = self.get_url(str(page), str(year), str(nums))#                 yield scrapy.Request(url=newurl, callback=self.parse1, dont_filter=True)##     def parse1(self, response):#         detailurls = response.selector.xpath \#             ('//div[@id="searchresults"]/table/tr[@class="srrowns"]/td[@nowrap]/a/@href').getall()#         for detailurl in detailurls:#             durl = "https://www.cvedetails.com" + detailurl#             yield scrapy.Request(url=durl, callback=self.parse2, dont_filter=True)##     def parse2(self, response):#         # CVE编号,危害等级,漏洞类型,供应商,型号,设备类型,固件版本号#         cveid = response.selector.xpath('//h1/a/text()').get()#         score = 
class CveDetailSpider2():
    """Crawler for www.cvedetails.com vulnerability listings.

    Walks the per-year listing pages via their server-rendered pagination
    links (so no JS-generated values are needed), follows each CVE detail
    link, and appends one row per CVE to an xlsx workbook immediately, so
    a crash loses at most one record.
    """

    # Running count of listing rows seen so far; drives the skip/stop window.
    result_num = 0
    name = 'cve_detail'
    allowed_domains = ['https://www.cvedetails.com']
    # range(1999, 2000) yields only 1999; parse() takes its own year range.
    start_urls = [
        "https://www.cvedetails.com/vulnerability-list/year-" + str(i) + "/vulnerabilities.html"
        for i in range(1999, 2000)
    ]
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                             ' AppleWebKit/537.36 (KHTML, like Gecko)'
                             ' Chrome/58.0.3029.110 Safari/537.3'}

    def get_url(self, page, year, trc):
        """Build a paginated vulnerability-list URL.

        page/year/trc are interpolated into the query string; trc is the
        site's total-row-count parameter.
        """
        return "https://www.cvedetails.com/vulnerability-list.php?vendor_id=0&product_id=0&version_id=0&page={}&hasexp=0&opdos=0&opec=0&opov=0&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0&cvssscoremax=0&year={}&month=0&cweid=0&order=1&trc={}&sha=b87d72f681722fd5f26c1153b2202a4f05acfff1".format(
            page, year, trc)

    def parse(self, start_y=1999, end_y=2000):
        """Crawl every listing page for each year in [start_y, end_y]."""
        for year in range(start_y, end_y + 1):
            response = requests.get(
                "https://www.cvedetails.com/vulnerability-list/year-" + str(year) + "/vulnerabilities.html",
                headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            # Pagination anchors live inside div#pagingb; following these
            # server-rendered hrefs avoids JS-generated query values.
            paging = soup.find_all('div', {'id': 'pagingb'})
            for anchor in paging[0].find_all('a', href=True):
                newurl = "https://www.cvedetails.com" + anchor['href']
                tag = self.parse1(newurl, year)
                if tag == 0:
                    print('continue!---', year)
                    continue
                if tag == 1:
                    print('break!---', year)
                    break

    def parse1(self, url, year):
        """Parse one listing page.

        Returns 0 to skip ahead (still before the resume window), 1 to stop
        paging this year, or None when the page was processed normally.
        """
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.find_all('tr', {'class': 'srrowns'})
        self.result_num += len(rows)
        # Skip records before this window (crude resume support).
        if self.result_num < 6000:
            print("跳过 result_num:%d" % self.result_num)
            return 0
        # Stop once past the window's upper bound.
        if self.result_num > 10000:
            print("停止 result_num:%d" % self.result_num)
            self.result_num = 0
            return 1
        print('self.result_num:', self.result_num)
        for row in rows:
            durl = "https://www.cvedetails.com" + row.find_all('a', href=True)[0]['href']
            print(durl)
            # Listing pages can spill into the next year; detect via the
            # year embedded in the CVE id of the detail URL.
            if durl.split('/')[-2].split('-')[1] != str(year):
                print('{0}年数据已完成!!跳到下一年'.format(year))
                self.result_num = 0
                return 1
            # Known-problematic detail pages are skipped outright.
            if durl not in ['https://www.cvedetails.com/cve/CVE-2010-4609/',
                            'https://www.cvedetails.com/cve/CVE-2010-2812/',
                            'https://www.cvedetails.com/cve/CVE-2014-0629/',
                            "https://www.cvedetails.com/cve/CVE-2017-14064/",
                            "https://www.cvedetails.com/cve/CVE-2017-11027/",
                            "https://www.cvedetails.com/cve/CVE-2017-8760/"]:
                self.parse2(durl)

    def parse2(self, path2):
        """Scrape one CVE detail page and append the record to output(2015).xlsx."""
        response = requests.get(path2, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # CVE id from the page <h1>.
        rows = soup.find_all('h1')
        cveid = ""
        if len(rows) > 0:
            print(rows[0].find_all('a', href=True)[0].text)
            cveid = rows[0].find_all('a', href=True)[0].text
        # NOTE(review): exist_cv1 is a module-level list built in __main__;
        # this method requires it to be defined before crawling starts.
        if cveid in exist_cv1:
            print('{0}已存在,跳过!!'.format(cveid))
            return

        # Description: first tab-separated chunk of the summary text.
        rows = soup.find_all('div', {'class': 'cvedetailssummary'})
        describe = ''
        if len(rows) > 0:
            describe = rows[0].text.strip().split('\t')[0]

        # CVSS score.  Bug fix: this lookup used to be gated on the summary
        # div being present (the guard checked the *previous* selector's
        # rows), so pages without a summary silently lost their score.
        rows = soup.find_all('div', {'class': 'cvssbox'})
        score = ''
        if len(rows) > 0:
            print(rows[0].text)
            score = rows[0].text

        # Vulnerability type (same stale-guard fix as the score above).
        rows = soup.find_all('span', {'class': 'vt_dos'})
        vulntype = ""
        if len(rows) > 0:
            print(rows[0].text)
            vulntype = rows[0].text

        # Affected products table: one <tr> per (type, vendor, product, version).
        producttype = []
        vendor = []
        product = []
        version = []
        rows_table = soup.find_all('table', {'class': 'listtable', 'id': 'vulnprodstable'})
        if len(rows_table) > 0:
            rows_tr = rows_table[0].find_all('tr')
            for tr in rows_tr[1:]:  # first row is the header
                rows_td = tr.find_all('td')
                if len(rows_td) > 1:
                    producttype.append(rows_td[1].text.strip())
                    vendor.append(rows_td[2].text.strip())
                    product.append(rows_td[3].text.strip())
                    version.append(rows_td[4].text.strip())

        item = {
            'cveid': cveid,
            'describe': describe,
            'score': score,
            'vulntype': vulntype,
            'producttype': "|".join(set(producttype)),
            'vendor': "|".join(set(vendor)),
            'product': "|".join(set(product)),
            'version': "|".join(set(version)),
        }
        print(item)

        # Append to the workbook and save immediately (batch-of-one) so a
        # crash mid-run does not lose earlier records.
        filename = 'output(2015).xlsx'
        try:
            workbook = load_workbook(filename)
            worksheet = workbook.active
        except FileNotFoundError:
            # First record: create the workbook and write the header row.
            workbook = Workbook()
            worksheet = workbook.active
            header = ['cveid', 'describe', 'score', 'vulntype', 'producttype',
                      'vendor', 'product', 'version']
            worksheet.append(header)
        # Bug fix: the append/save used to sit in a `finally`, which raised a
        # confusing NameError (worksheet undefined) whenever load_workbook
        # failed with anything other than FileNotFoundError.
        values = [item['cveid'], item['describe'], item['score'], item['vulntype'],
                  item['producttype'], item['vendor'], item['product'], item['version']]
        worksheet.append(values)
        workbook.save(filename)
        workbook.close()
def get_cve_data(start_page: int, num_records: int) -> None:
    """Fetch one vulnerability-list page and print each CVE's detail fields.

    Requests listing page `start_page` (asking for `num_records` rows),
    follows every result row's detail link, and prints the extracted
    id/score/type/product fields.  Prints only; returns None.
    """
    url = f'https://www.cvedetails.com/vulnerability-list.php?' \
          f'vendor_id=0&product_id=0&version_id=0&page={start_page}' \
          f'&numrows={num_records}&hasexp=0&opdos=0&opec=0&opov=0' \
          f'&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0' \
          f'&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0' \
          f'&cvssscoremax=0&year=0&month=0&cweid=0&order=1&trc=0&sha=' \
          f'8a181058fa3202146b2bbf6c9a982505c6d25cc3'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                             ' AppleWebKit/537.36 (KHTML, like Gecko)'
                             ' Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    rows = soup.find_all('tr', {'class': 'srrowns'})
    for row in rows:
        # Bug fix: a debugging leftover overwrote path2 with a hard-coded
        # CVE-1999-1567 URL, so every row scraped the same detail page.
        path2 = "https://www.cvedetails.com" + row.find_all('a', href=True)[0]['href']
        print(path2)
        detail_resp = requests.get(path2, headers=headers)
        detail_soup = BeautifulSoup(detail_resp.content, 'html.parser')
        # CVE id from the detail page <h1>.
        h1_rows = detail_soup.find_all('h1')
        print(h1_rows[0].find_all('a', href=True)[0].text)
        cveid = h1_rows[0].find_all('a', href=True)[0].text
        # CVSS score box.
        score_rows = detail_soup.find_all('div', {'class': 'cvssbox'})
        print(score_rows[0].text)
        score = score_rows[0].text
        # Vulnerability type.
        vt_rows = detail_soup.find_all('span', {'class': 'vt_dos'})
        print(vt_rows[0].text)
        vulntype = vt_rows[0].text
        # First row of the affected-products table.
        table_rows = detail_soup.find_all('table', {'class': 'listtable', 'id': 'vulnprodstable'})
        tds = table_rows[0].find_all('td')
        print(tds[1].text.strip())
        item = {
            'cveid': cveid,
            'score': score,
            'vulntype': vulntype,
            'producttype': tds[1].text.strip(),
            'vendor': tds[2].text.strip(),
            'product': tds[3].text.strip(),
            'version': tds[4].text.strip(),
        }
        print(item)
if __name__ == '__main__':
    # Build the global list of CVE ids already collected, merging the ids
    # from both spreadsheets; parse2 reads exist_cv1 to skip duplicates.
    d1 = pd.read_excel('all_spyder(中文).xlsx')
    exist_cv1 = d1['cve'].tolist()
    d1 = pd.read_excel('output(2015).xlsx')
    exist_cv2 = d1['cveid'].tolist()
    exist_cv1.extend(exist_cv2)
    exist_cv1 = list(set(exist_cv1))
    print(exist_cv1[:10])
    # Crawl a single year (2017).
    crawler_tool = CveDetailSpider2()
    crawler_tool.parse(start_y=2017, end_y=2017)
目录
相关文章
|
2月前
|
Web App开发 数据采集 JavaScript
动态网页爬取:Python如何获取JS加载的数据?
动态网页爬取:Python如何获取JS加载的数据?
447 58
|
2月前
|
数据采集 存储 前端开发
Python爬虫自动化:批量抓取网页中的A链接
Python爬虫自动化:批量抓取网页中的A链接
|
2月前
|
数据采集 Web App开发 JavaScript
Python爬虫如何获取JavaScript动态渲染后的网页内容?
Python爬虫如何获取JavaScript动态渲染后的网页内容?
|
5月前
|
数据采集 JavaScript Android开发
【02】仿站技术之python技术,看完学会再也不用去购买收费工具了-本次找了小影-感觉页面很好看-本次是爬取vue需要用到Puppeteer库用node.js扒一个app下载落地页-包括安卓android下载(简单)-ios苹果plist下载(稍微麻烦一丢丢)-优雅草卓伊凡
【02】仿站技术之python技术,看完学会再也不用去购买收费工具了-本次找了小影-感觉页面很好看-本次是爬取vue需要用到Puppeteer库用node.js扒一个app下载落地页-包括安卓android下载(简单)-ios苹果plist下载(稍微麻烦一丢丢)-优雅草卓伊凡
159 7
【02】仿站技术之python技术,看完学会再也不用去购买收费工具了-本次找了小影-感觉页面很好看-本次是爬取vue需要用到Puppeteer库用node.js扒一个app下载落地页-包括安卓android下载(简单)-ios苹果plist下载(稍微麻烦一丢丢)-优雅草卓伊凡
|
4月前
|
数据采集 XML JavaScript
Python爬虫:从人民网提取视频链接的完整指南
Python爬虫:从人民网提取视频链接的完整指南
|
5月前
|
前端开发
【2025优雅草开源计划进行中01】-针对web前端开发初学者使用-优雅草科技官网-纯静态页面html+css+JavaScript可直接下载使用-开源-首页为优雅草吴银满工程师原创-优雅草卓伊凡发布
【2025优雅草开源计划进行中01】-针对web前端开发初学者使用-优雅草科技官网-纯静态页面html+css+JavaScript可直接下载使用-开源-首页为优雅草吴银满工程师原创-优雅草卓伊凡发布
141 1
【2025优雅草开源计划进行中01】-针对web前端开发初学者使用-优雅草科技官网-纯静态页面html+css+JavaScript可直接下载使用-开源-首页为优雅草吴银满工程师原创-优雅草卓伊凡发布
|
4月前
|
JavaScript 前端开发 API
纯js轻量级页面顶部Loading进度条插件
纯js轻量级页面顶部Loading进度条插件
|
4月前
|
机器学习/深度学习 存储 设计模式
Python 高级编程与实战:深入理解性能优化与调试技巧
本文深入探讨了Python的性能优化与调试技巧,涵盖profiling、caching、Cython等优化工具,以及pdb、logging、assert等调试方法。通过实战项目,如优化斐波那契数列计算和调试Web应用,帮助读者掌握这些技术,提升编程效率。附有进一步学习资源,助力读者深入学习。
|
1月前
|
Python
Python编程基石:整型、浮点、字符串与布尔值完全解读
本文介绍了Python中的四种基本数据类型:整型(int)、浮点型(float)、字符串(str)和布尔型(bool)。整型表示无大小限制的整数,支持各类运算;浮点型遵循IEEE 754标准,需注意精度问题;字符串是不可变序列,支持多种操作与方法;布尔型仅有True和False两个值,可与其他类型转换。掌握这些类型及其转换规则是Python编程的基础。
161 33
|
12天前
|
数据采集 分布式计算 大数据
不会Python,还敢说搞大数据?一文带你入门大数据编程的“硬核”真相
不会Python,还敢说搞大数据?一文带你入门大数据编程的“硬核”真相
33 1