一、parsel类库使用
parsel 这个库可以解析 HTML 和 XML,并支持使用 XPath 和 CSS 选择器对内容进行提取和修改,同时还融合了正则表达式的提取功能。parsel 灵活且强大,同时也是 Python 最流行的爬虫框架 Scrapy 的底层支持。
# coding=utf-8
"""
Author: gaojs
Purpose: download the wallpapers listed in the 1920x1080 category of
         netbian.com and store them as JPEG files under ``photo/``.
New features:
Date: 2022/3/25 19:35
"""
import os.path
import re

import requests
import parsel

# Characters that cannot appear in a file name on Windows (this also covers
# the POSIX path separator) plus control whitespace; titles are scraped text,
# so they must be cleaned before being used as part of a path.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

# Seconds to wait on any single HTTP request before giving up, so a stalled
# connection cannot hang the whole crawl.
_REQUEST_TIMEOUT = 10


def _safe_filename(title):
    """Return ``title`` with characters that are invalid in file names replaced by ``_``."""
    return _INVALID_FILENAME_CHARS.sub('_', title).strip()


def get_address():
    """
    Crawl list pages 2 through 10 of the 1920x1080 category and save the images.

    For every ``.list li`` entry that carries a ``<b>`` title, the linked
    detail page is fetched, the image URL is read from its ``.pic img``
    element, and the image bytes are written to ``photo/<title>.jpg``.
    Entries without a title, without a link, or whose detail page exposes no
    image URL are skipped. Progress is printed to stdout.

    :return: None
    :raises requests.RequestException: if an HTTP request fails or times out.
    :raises OSError: if the output directory or an image file cannot be written.
    """
    dirname = 'photo/'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dirname, exist_ok=True)

    # Browser-like User-Agent sent with the HTML page requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
    }

    # NOTE(review): pagination starts at index_2; presumably the first page is
    # served from http://www.netbian.com/1920x1080/ and is intentionally not
    # crawled here - confirm.
    for page in range(2, 11):
        print(f'=====================正在爬取第{page}页内容========================')
        url = f'http://www.netbian.com/1920x1080/index_{page}.htm'
        res = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        # Let requests guess the charset from the body so the titles decode
        # correctly.
        res.encoding = res.apparent_encoding
        selector = parsel.Selector(res.text)

        for lis in selector.css('.list li'):
            title = lis.css('b::text').get()
            # Entries without a <b> title are advertisement slots; skip them.
            if not title:
                continue

            detail_path = lis.css('a::attr(href)').get()
            if not detail_path:
                # No link to follow for this entry.
                continue
            list_url = 'http://www.netbian.com' + detail_path

            res1 = requests.get(list_url, headers=headers, timeout=_REQUEST_TIMEOUT)
            selector1 = parsel.Selector(res1.text)
            img_url = selector1.css('.pic img::attr(src)').get()
            if not img_url:
                # Detail page has no image element; nothing to download.
                continue

            # Save the image.
            img_content = requests.get(url=img_url, timeout=_REQUEST_TIMEOUT).content
            img_path = os.path.join(dirname, _safe_filename(title) + '.jpg')
            with open(img_path, 'wb') as f:
                f.write(img_content)
            print(title, img_url)


# Start the crawl as soon as the script is loaded.
get_address()