1. What is Urllib
Python's built-in HTTP request library, made up of four modules:
urllib.request: the request module
urllib.error: the exception handling module
urllib.parse: the URL parsing module
urllib.robotparser: the robots.txt parsing module (not covered again below; see the sketch after this list)
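A minimal sketch of urllib.robotparser, since it is the only module not demonstrated later. It fetches and parses a site's robots.txt and answers whether a given user agent may crawl a URL (Baidu's robots.txt is just an example target):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.baidu.com/robots.txt')
rp.read()  # download and parse the robots.txt file
# check whether the generic user agent '*' may fetch this URL
print(rp.can_fetch('*', 'http://www.baidu.com/index.html'))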
2. Changes from Python 2
Python 2:
import urllib2
response = urllib2.urlopen('http://www.baidu.com')
Python 3:
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')
3. How to use urllib
3.1 urlopen
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
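Note: cafile, capath, and cadefault are deprecated in current Python 3 releases; the supported way to customize HTTPS verification is to pass an ssl.SSLContext via the context parameter. A minimal sketch (httpbin.org is just an example target):

import ssl
import urllib.request

# the default context verifies server certificates against the system CA store
context = ssl.create_default_context()
response = urllib.request.urlopen('https://httpbin.org/get', timeout=5, context=context)
print(response.status)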
3.1.1 Making a GET request
import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))
3.1.2 Sending data with the request
import urllib.parse
import urllib.request

# passing a data payload makes urlopen issue a POST request
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())
3.1.3 Setting a request timeout
# set a timeout for the request
import urllib.request

response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())
3.1.4 Detecting a timed-out request
# detect a timed-out request
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
3.2 Responses
3.2.1 Response type
import urllib.request

response = urllib.request.urlopen('https://www.baidu.com')
print(type(response))  # <class 'http.client.HTTPResponse'>
3.2.2 Status code and response headers
import urllib.request

response = urllib.request.urlopen('https://www.baidu.com')
print(response.status)               # status code
print(response.getheaders())         # all response headers
print(response.getheader('Server'))  # a single header by name
import urllib.request

response = urllib.request.urlopen('https://www.baidu.com')
print(response.read().decode('utf-8'))  # the page's source code
3.3 Request
3.3.1 Requesting a page with a Request object
import urllib.request

request = urllib.request.Request('https://www.baidu.com')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
3.3.2 Requesting a page via Request constructor parameters
from urllib import request, parse

# request a page by setting the Request constructor parameters
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
params = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(params), encoding='utf8')  # form data
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
3.3.3 Requesting a page via the Request.add_header method
from urllib import request, parse

url = 'http://httpbin.org/post'
params = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(params), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
# add a header after constructing the Request
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
3.4 Handler
3.4.1 Proxy: using a proxy to disguise the IP address
# a handler is an extra tool for applying additional settings
import urllib.request

# use a proxy to disguise the IP address (127.0.0.1:9743 assumes a locally running proxy)
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())
3.4.2 Cookie: cookies maintain the login state
import http.cookiejar, urllib.request

# cookies maintain the login state
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + "=" + item.value)
import http.cookiejar, urllib.request

# save cookies to a text file in Mozilla format
filename = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
import http.cookiejar, urllib.request

# save cookies to a text file in LWP format
filename = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
import http.cookiejar, urllib.request

# load cookies back from the file (the jar class must match the saved format)
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
3.5 Exception handling
3.5.1 Reading the exception
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    # HTTPError is a subclass of URLError, so catch the more specific one first
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
3.6 URL parsing
3.6.1 urlparse: split a URL into a named tuple of components
urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
# parameters: the URL string, a default scheme (protocol), and whether to parse fragments (anchor links)
from urllib.parse import urlparse

# parse a URL into (scheme, netloc, path, params, query, fragment)
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
from urllib.parse import urlparse

# scheme only supplies a default when the URL itself carries none
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
from urllib.parse import urlparse

# the URL's own scheme (http) takes precedence over the scheme argument
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
from urllib.parse import urlparse

# with allow_fragments=False the fragment is folded into the preceding component (here the query)
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)
from urllib.parse import urlparse

# with no query present, the fragment is folded into the path instead
result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
3.6.2 urlunparse: the inverse of urlparse
from urllib.parse import urlunparse

# the inverse of urlparse: assemble a URL from its six components
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
3.6.3 urljoin: joining URLs
from urllib.parse import urljoin

# join a base URL with a relative or absolute URL; components present in the
# second argument take priority over those of the base
print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://baidu.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))
3.6.4 urlencode: converting a dict into GET request parameters
from urllib.parse import urlencode

# convert a dict into a GET query string
params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)