Hands-On Practice
General crawler approach
Step 1: request the target URL and get the data the site returns.
Step 2: parse that data; here I use regular expressions combined with XPath.
Step 3: persist the results, i.e. save the data to disk. (A minimal sketch of this pipeline follows.)
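As a quick orientation, here is a minimal sketch of that three-step pipeline. The XPath expression and the output file name are placeholders for illustration only, not the actual selectors used in the full script further down.

# Minimal sketch: request -> parse -> save.
# The XPath and output file here are illustrative placeholders.
import requests
from lxml import etree

def demo_pipeline():
    # Step 1: request the page and get the response
    resp = requests.get('https://www.pearvideo.com/',
                        headers={'User-Agent': 'Mozilla/5.0'})
    # Step 2: parse the returned HTML with XPath
    tree = etree.HTML(resp.text)
    titles = tree.xpath('//li//div[contains(@class, "vervideo-title")]/text()')
    # Step 3: persist the extracted data
    with open('titles.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(titles))

if __name__ == '__main__':
    demo_pipeline()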
Source file overview
This is code I wrote quite a while ago; I tested it and it still works. You can follow along with it on your own. As usual, open the browser's F12 developer tools to capture the requests, analyze the page structure, and locate the data.
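The key thing the F12 analysis turns up, and what the script below relies on: each video page calls https://www.pearvideo.com/videoStatus.jsp?contId=<id>, whose JSON response contains a srcUrl. That srcUrl appears to embed a millisecond timestamp where the real content id should be, so it 404s as-is; replacing the timestamp segment with cont-<id> yields the downloadable address. Roughly like this (the sample URL below is made up for illustration; the full script does the same swap with a regex):

# Hypothetical srcUrl returned by videoStatus.jsp (timestamp segment is fake):
#   https://video.pearvideo.com/mp4/third/20210624/1624500000000-15693587-hd.mp4
src_url = 'https://video.pearvideo.com/mp4/third/20210624/1624500000000-15693587-hd.mp4'
cont_id = '1734406'  # hypothetical content id taken from the page URL
# Swap the timestamp segment for 'cont-<id>' to get the real download URL:
#   https://video.pearvideo.com/mp4/third/20210624/cont-1734406-15693587-hd.mp4
real_url = src_url.replace(src_url.split('/')[-1].split('-')[0], 'cont-' + cont_id)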
import os
import re
import time

import requests
from lxml import etree

# Video categories on pearvideo.com (keys match the site's categoryId values).
menu = {1: '旗帜', 2: '新知', 3: '旅行', 4: '体育', 5: '生活',
        6: '科技', 7: '娱乐', 8: '汽车', 9: '美食', 10: '音乐'}

def request(url, r_url='https://www.pearvideo.com/'):
    # GET with a browser User-Agent and a Referer; pearvideo rejects
    # video requests that lack a plausible Referer.
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64',
        'Referer': r_url}
    r = requests.get(url, headers=ua)
    return r

def analysis(r):
    # Parse a full page and hand both of its video lists to spider().
    soup = etree.HTML(r)
    list_1 = soup.xpath('//*[@id="listvideoListUl"]/li')
    list_2 = soup.xpath('//*[@id="categoryList"]/li')
    spider(list_1)
    spider(list_2)

def spider(video_list):
    # Walk the <li> nodes of a video list: extract the page URL, title and
    # content id, resolve the real .mp4 address, then download and save it.
    for i in video_list:
        r_url = 'https://www.pearvideo.com/' + i.xpath('./div/a/@href')[0]
        title = i.xpath('./div/a/div[2]/text()')[0]
        cont_id = str(i.xpath('./div/a/@href')[0]).replace('video_', '')
        video_url = ('https://www.pearvideo.com/videoStatus.jsp?contId='
                     + cont_id + '&mrd=0.27731227756239263')
        l = request(video_url, r_url).text
        try:
            time.sleep(1)  # be polite: at most one download per second
            url = re.findall('"srcUrl":"(.*?)"', l)[0]
            # The srcUrl embeds a fake timestamp segment; swap it for
            # 'cont-<id>' to get the downloadable address.
            url = url.replace(re.findall('/(162.*?)-', url)[0], 'cont-' + cont_id)
            video = request(url, r_url).content
            write(title, video)
            print(f'Crawling {title} ... done!')
        except (IndexError, requests.RequestException, OSError):
            print(f'Failed to fetch {title} ({video_url})')
            continue

def spider_2(num, page):
    # Page through a category via the site's Ajax endpoint;
    # each page returns 12 <li> items.
    for i in range(12, 12 * page + 1, 12):
        url = ('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId='
               + num + '&start=' + str(i) + '&mrd=0.9948502649054862')
        soup = etree.HTML(request(url).text)
        spider(soup.xpath('/html/body/li'))

def write(title, video):
    # Save the raw bytes as an .mp4 under the output directory.
    os.makedirs('梨_短视频', exist_ok=True)
    with open('梨_短视频/' + title + '.mp4', 'wb') as f:
        f.write(video)

if __name__ == '__main__':
    for key, value in menu.items():
        print(f'{key}:{value}', end=' ')
    num = input('\nChoose a category to crawl: ')
    page = int(input('Number of pages to crawl (12 videos per page): '))
    spider_2(num, page)
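To run it, just execute the script, pick a category number at the prompt, and enter a page count; the videos land as .mp4 files in the 梨_短视频 directory next to the script. One caveat: titles are used verbatim as file names, so a title containing characters your filesystem forbids (such as / or ?) will make the save fail and that video will be skipped by the except branch.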