可参考:
Urllib库的基本使用
官方文档:https://docs.python.org/3/library/urllib.html
urllib库包含以下模块 urllib.request 请求模块 urllib.error 异常处理模块 urllib.parse url解析模块 urllib.robotparser robots.txt解析模块
py2 vs. py3
python2 urllib.urlopen() python3 urllib.request.urlopen()
用于http测试的网站:http://httpbin.org/
引入需要的模块
from urllib import request from urllib import parse from urllib import error from http import cookiejar import socket
request请求
请求url,请求参数, 请求数据, 请求头
urlopen urlopen(url, data=None, timeout, *, cafile=None, capath=None, cadefault=False, context=None)
# Send a plain GET request and print the decoded body.
def foo1():
    resp = request.urlopen("http://www.baidu.com")
    # bytes -> utf-8 decode -> str
    print(resp.read().decode("utf-8"))


# Send a POST request: urlencode a dict, encode to bytes, pass as data=.
def foo2():
    payload = bytes(parse.urlencode({"word": "hello"}), encoding="utf-8")
    resp = request.urlopen("http://httpbin.org/post", data=payload)
    print(resp.read())


# Use a tiny timeout and catch the resulting URLError;
# its .reason is a socket.timeout instance on timeout.
def foo3():
    try:
        resp = request.urlopen("http://httpbin.org/post", timeout=0.1)
        print(resp.read())
    except error.URLError as exc:
        print(type(exc.reason))  # <class 'socket.timeout'>
        if isinstance(exc.reason, socket.timeout):
            print("超时错误:", exc)
response响应
# Inspect the response object: its type, status code, and headers.
def foo4():
    resp = request.urlopen("http://www.baidu.com")
    print(type(resp))  # <class 'http.client.HTTPResponse'>
    print(resp.status)
    print(resp.getheaders())
    print(resp.getheader("Server"))
Request请求对象
# Wrap the URL in a Request object before opening it.
def foo5():
    req = request.Request("http://www.baidu.com")
    resp = request.urlopen(req)
    print(resp.read().decode("utf-8"))


# Browser-like request, variant 1: headers given to the constructor.
def foo6():
    url = "http://httpbin.org/post"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "Host": "httpbin.org",
    }
    form = {"name": "Tom"}
    body = bytes(parse.urlencode(form), encoding="utf-8")
    req = request.Request(url=url, data=body, headers=headers)
    resp = request.urlopen(req)
    print(resp.read().decode("utf-8"))


# Browser-like request, variant 2: header added afterwards via add_header().
def foo7():
    url = "http://httpbin.org/post"
    form = {"name": "Tom"}
    body = bytes(parse.urlencode(form), encoding="utf-8")
    req = request.Request(url=url, data=body, method="POST")
    req.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)")
    resp = request.urlopen(req)
    print(resp.read().decode("utf-8"))
代理
# Route traffic through HTTP/HTTPS proxies using a ProxyHandler-based opener.
def foo8():
    handler = request.ProxyHandler({
        "http": "http://183.159.94.185:18118",
        "https": "https://183.159.94.187:18118",
    })
    opener = request.build_opener(handler)
    resp = opener.open("http://www.baidu.com")
    print(resp.read())
cookie
# Collect cookies from a response into an in-memory CookieJar.
def foo9():
    jar = cookiejar.CookieJar()
    handler = request.HTTPCookieProcessor(jar)
    opener = request.build_opener(handler)
    resp = opener.open("http://www.baidu.com")
    print(resp.status)
    for item in jar:
        print(item.name, item.value)


# Persist cookies, variant 1: Mozilla (Netscape) file format.
def foo10():
    filename = "cookie.txt"
    jar = cookiejar.MozillaCookieJar(filename)
    handler = request.HTTPCookieProcessor(jar)
    opener = request.build_opener(handler)
    opener.open("http://www.baidu.com")
    jar.save(ignore_discard=True, ignore_expires=True)


# Persist cookies, variant 2: LWP (libwww-perl) file format.
def foo11():
    filename = "cookie1.txt"
    jar = cookiejar.LWPCookieJar(filename)
    handler = request.HTTPCookieProcessor(jar)
    opener = request.build_opener(handler)
    opener.open("http://www.baidu.com")
    jar.save(ignore_discard=True, ignore_expires=True)


# Load previously saved LWP-format cookies and reuse them for a request.
def foo12():
    filename = "cookie1.txt"
    jar = cookiejar.LWPCookieJar()
    jar.load(filename, ignore_discard=True, ignore_expires=True)
    handler = request.HTTPCookieProcessor(jar)
    opener = request.build_opener(handler)
    resp = opener.open("http://www.baidu.com")
    print(resp.read().decode("utf-8"))
异常处理
error主要有:'URLError'、'HTTPError'、'ContentTooShortError'
# Catch the more specific HTTPError first, then its parent URLError;
# the else branch runs only when no exception was raised.
def foo13():
    try:
        resp = request.urlopen("http://www.xxooxxooxox.com/xxx")
        print(resp.status)
    except error.HTTPError as exc:  # subclass exception
        print(exc.name, exc.reason, exc.code, exc.headers, sep="\n")
    except error.URLError as exc:  # parent exception
        print(exc.reason)
    else:
        print("successful")
parse 模块解析url
urlparse(url, scheme='', allow_fragments=True)
# urlparse splits a URL into scheme/netloc/path/params/query/fragment.
def foo14():
    result = parse.urlparse("http://www.baidu.com/xxx.html;user?id=5#comment")
    print(type(result), result, sep="\n")
    # <class 'urllib.parse.ParseResult'>
    # ParseResult(scheme='http', netloc='www.baidu.com', path='/xxx.html',
    #             params='user', query='id=5', fragment='comment')

    # scheme= only supplies a default; a scheme present in the URL wins.
    result = parse.urlparse("www.baidu.com", scheme="https")
    print(result)
    # ParseResult(scheme='https', netloc='', path='www.baidu.com',
    #             params='', query='', fragment='')

    result = parse.urlparse("http://www.baidu.com", scheme="https")
    print(result)
    # ParseResult(scheme='http', netloc='www.baidu.com', path='',
    #             params='', query='', fragment='')

    # allow_fragments decides whether '#...' becomes the fragment
    # or stays glued to the preceding component.
    result = parse.urlparse("http://www.baidu.com/xxx.html;user?id=5#comment",
                            allow_fragments=True)
    print(result)
    # fragment='comment'

    result = parse.urlparse("http://www.baidu.com/xxx.html;user?id=5#comment",
                            allow_fragments=False)
    print(result)
    # query='id=5#comment', fragment=''

    result = parse.urlparse("http://www.baidu.com/xxx.html;user#comment",
                            allow_fragments=False)
    print(result)
    # params='user#comment', fragment=''


# urlunparse assembles a URL from a six-item sequence — order matters.
def foo15():
    parts = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
    print(parse.urlunparse(parts))
    # http://www.baidu.com/index.html;user?a=6#comment


# urljoin resolves a URL against a base (like os.path.join);
# an absolute second argument wins outright.
def foo16():
    print(parse.urljoin("http://www.baidu.com", "index.html"))
    print(parse.urljoin("http://www.baidu.com", "http://www.qq.com/index.html"))
    print(parse.urljoin("http://www.baidu.com/index.html", "http://www.qq.com/?id=6"))
    # http://www.baidu.com/index.html
    # http://www.qq.com/index.html
    # http://www.qq.com/?id=6


# urlencode turns a dict into an application/x-www-form-urlencoded string.
def foo17():
    params = {
        "name": "Tom",
        "age": 18,
    }
    # NOTE: urljoin drops the trailing "?" here
    url = parse.urljoin("http://www.baidu.com/?", parse.urlencode(params))
    print(url)  # http://www.baidu.com/name=Tom&age=18
    url = "http://www.baidu.com/?" + parse.urlencode(params)
    print(url)  # http://www.baidu.com/?name=Tom&age=18