- 需求:
1.爬取贴吧名称 ,以海贼王为例
2.要进行翻页爬取(起始页,终止页)
3.把每一页的内容保存到本地
页面分析
分析url 翻页爬取的时候:大多数情况下是需要分析url的规律
找出海贼王贴吧前三页的url如下:
https://tieba.baidu.com/f?ie=utf-8&kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&fr=search(第一页)
https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&ie=utf-8&pn=50(第二页)
https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&ie=utf-8&pn=100(第三页)
经验证,第一页也可以写作:
https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&ie=utf-8&pn=0(第一页)
使用input(),从控制台输入想要查找的内容
代码一般写法
import urllib.request
import urllib.parse

name = input("请输入贴吧名称")

# Browser User-Agent header so the server does not reject the request as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}

# Start page
begin = int(input("请输入开始页"))
# End page
end = int(input("请输入结束页"))

# Percent-encode the (possibly Chinese) tieba name so it is URL-safe.
kw = {'kw': name}
result = urllib.parse.urlencode(kw)

# Build and fetch the URL for each requested page.
# range() is half-open, hence end + 1 to include the end page.
for i in range(begin, end + 1):
    # Each result page shows 50 posts: page 1 -> pn=0, page 2 -> pn=50, ...
    pn = (i - 1) * 50
    base_url = 'https://tieba.baidu.com/f?'
    url = base_url + result + '&pn=' + str(pn)

    # Send the request and read the response body.
    req = urllib.request.Request(url, headers=headers)
    # 'with' guarantees the HTTP response is closed even if decoding fails
    # (the original never closed it, leaking the connection).
    with urllib.request.urlopen(req) as res:
        # res.getcode() == 200 indicates success
        html = res.read().decode('utf-8')

    # Save this page to a local HTML file.
    filename = '第' + str(i) + '页.html'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
    # Fixed: the f-string was broken across two lines (a SyntaxError on Python < 3.12).
    print(f'正在爬取第{i}页')
函数写法
import urllib.request
import urllib.parse
# 分析功能:读取页面,写入文件,主函数
def readPage(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Raises urllib.error.URLError / HTTPError on network failure.
    """
    # Browser User-Agent so the server does not reject the request as a bot.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }
    req = urllib.request.Request(url, headers=headers)
    # Use a context manager so the HTTP response is always closed
    # (the original left it open, leaking the connection).
    with urllib.request.urlopen(req) as res:
        # res.getcode() == 200 indicates success
        html = res.read().decode('utf-8')
    return html
def writePage(filename, html):
    """Save *html* to a local file named *filename*, UTF-8 encoded."""
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.write(html)
def main():
    """Entry point: prompt the user, then crawl and save each requested page."""
    name = input("请输入贴吧名称")
    # Start page
    begin = int(input("请输入开始页"))
    end = int(input("请输入结束页"))

    # Percent-encode the (possibly Chinese) name for the URL query string.
    query = urllib.parse.urlencode({'kw': name})
    base_url = 'https://tieba.baidu.com/f?'

    # range() is half-open, so end + 1 makes the end page inclusive.
    for page in range(begin, end + 1):
        offset = (page - 1) * 50  # 50 posts per result page
        url = f'{base_url}{query}&pn={offset}'
        # Fetch the page, then persist it locally.
        html = readPage(url)
        writePage(f'第{page}页.html', html)


if __name__ == '__main__':  # program entry point
    main()
面向对象写法
import urllib.request
import urllib.parse
class BaiduSpider(object):
    """Crawl Baidu Tieba result pages and save each page as a local HTML file."""

    def __init__(self):
        # Browser User-Agent so the server does not reject the request as a bot.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
        }
        self.base_url = 'https://tieba.baidu.com/f?'

    def readPage(self, url):
        """Fetch *url* and return the response body decoded as UTF-8 text."""
        req = urllib.request.Request(url, headers=self.headers)
        # Context manager guarantees the HTTP response is closed
        # (the original never closed it, leaking the connection).
        with urllib.request.urlopen(req) as res:
            # res.getcode() == 200 indicates success
            html = res.read().decode('utf-8')
        return html

    def writePage(self, filename, html):
        """Save *html* to a local file named *filename*, UTF-8 encoded."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def main(self):
        """Prompt for the tieba name and page range, then crawl and save each page."""
        name = input("请输入贴吧名称")
        # Start page
        begin = int(input("请输入开始页"))
        # End page
        end = int(input("请输入结束页"))
        # Percent-encode the (possibly Chinese) name for the URL query string.
        result = urllib.parse.urlencode({'kw': name})
        # range() is half-open, hence end + 1 to include the end page.
        for i in range(begin, end + 1):
            pn = (i - 1) * 50  # 50 posts per result page
            url = self.base_url + result + '&pn=' + str(pn)
            html = self.readPage(url)
            filename = '第' + str(i) + '页.html'
            self.writePage(filename, html)


if __name__ == '__main__':
    spider = BaiduSpider()  # create the spider instance
    spider.main()