一、Requests库
参考 :http://www.python-requests.org/en/master/user/quickstart/#make-a-request
Requests是一个很实用的Python HTTP客户端库,编写爬虫和测试服务器响应数据时经常会用到。Requests 完全满足如今网络的需求。
安装方式一般采用 pip install requests
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
In [
1
]:
import
requests
In [
2
]: response
=
requests.get(
'https://api.github.com/events'
)
In [
3
]:
print
(response)
<Response [
200
]>
In [
4
]: response
=
requests.post(
'http://httpbin.org/post'
,data
=
{
'key1'
:
'values1'
})
#提交表单时使用
In [
5
]:
print
(response)
<Response [
200
]>
In [
7
]: response
=
requests.put(
'http://httpbin.org/put'
,data
=
{
'key1'
:
'values1'
})
In [
8
]:
print
(response)
<Response [
200
]>
In [
10
]: response
=
requests.delete(
'http://httpbin.org/delete'
)
In [
11
]:
print
(response)
<Response [
200
]>
In [
13
]: response
=
requests.head(
'http://httpbin.org/get'
)
In [
14
]:
print
(response)
<Response [
200
]>
In [
15
]: response
=
requests.options(
'http://httpbin.org/get'
)
In [
16
]:
print
(response)
<Response [
200
]>
In [
17
]: payload
=
{
'key1'
:
'value1'
,
'key2'
:
'value2'
}
In [
18
]: response
=
requests.get(
'http://httpbin.org/get'
,params
=
payload)
#携带参数发送get请求
In [
19
]:
print
(response)
<Response [
200
]>
In [
20
]:
print
(response.text)
{
"args"
: {
"key1"
:
"value1"
,
"key2"
:
"value2"
},
"headers"
: {
"Accept"
:
"*/*"
,
"Accept-Encoding"
:
"gzip, deflate"
,
"Connection"
:
"close"
,
"Host"
:
"httpbin.org"
,
"User-Agent"
:
"python-requests/2.18.4"
},
"origin"
:
"103.215.2.233"
,
"url"
:
"http://httpbin.org/get?key1=value1&key2=value2"
}
In [
22
]:
print
(response.url)
http://httpbin.org/get?key1=value1&key2=value2
In [
23
]: payload
=
{
'key1'
:
'value1'
,
'key2'
:[
'value2'
,
'value3'
]}
In [
24
]: response
=
requests.get(
'http://httpbin.org/get'
,params
=
payload)
In [
25
]:
print
(response.url)
http://httpbin.org/get?key1=value1&key2=value2&key2=value3
In [
27
]: response
=
requests.get(
'http://api.github.com/events'
)
In [
28
]: response.encoding
#字符集编码
Out[
28
]:
'utf-8'
In [
29
]:
print
(response.text)
#文本信息(按 response.encoding 解码后的字符串)
[{
"id"
:
"6850814749"
,
"type"
:
"CreateEvent"
,
"actor"
:{
"id"
:
679017
,
"login"
:......
In [
30
]:
print
(response.content)
#二进制格式信息
b'[{
"id"
:
"6850814749"
,
"type"
:
"CreateEvent"
,
"actor"
:{
"id"
:
679017
,
"login"
:".....
In [
34
]: response.json()
In [
36
]: response.status_code
#返回状态码
Out[
36
]:
200
In [38]: headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    ...:            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    ...:            'Accept-Encoding': 'gzip, deflate, br',
    ...:            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    ...:            'Connection': 'keep-alive'}
In [
39
]: response
=
requests.get(
'https://api.github.com/events'
,headers
=
headers)
In [
40
]:
print
(response.headers)
{
'Server'
:
'GitHub.com'
,
'Date'
:
'Tue, 14 Nov 2017 06:10:31 GMT'
,
'Content-Type'
:
'application/json; charset=utf-8'
,
'Transfer-Encoding'
:
'chunked'
,
'Status'
:
'200 OK'
,
'X-RateLimit-Limit'
:
'60'
,
'X-RateLimit-Remaining'
:
'58'
,
'X-RateLimit-Reset'
:
'1510642339'
,
'Cache-Control'
:
'public, max-age=60, s-maxage=60'
,
'Vary'
:
'Accept'
,
'ETag'
:
'W/"34b51a08c5a8f4fa2400dd5c0d89221b"'
,
'Last-Modified'
:
'Tue, 14 Nov 2017 06:10:31 GMT'
,
'X-Poll-Interval'
:
'60'
,
'X-GitHub-Media-Type'
:
'unknown, github.v3'
,
'Link'
:
'<https://api.github.com/events?page=2>; rel="next", <https://api.github.com/events?page=10>; rel="last"'
,
'Access-Control-Expose-Headers'
:
'ETag, Link, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval'
,
'Access-Control-Allow-Origin'
:
'*'
,
'Content-Security-Policy'
:
"default-src 'none'"
,
'Strict-Transport-Security'
:
'max-age=31536000; includeSubdomains; preload'
,
'X-Content-Type-Options'
:
'nosniff'
,
'X-Frame-Options'
:
'deny'
,
'X-XSS-Protection'
:
'1; mode=block'
,
'X-Runtime-rack'
:
'0.104190'
,
'Content-Encoding'
:
'gzip'
,
'X-GitHub-Request-Id'
:
'D528:C0F5:6BAAA:E4CB6:5A0A88D6'
}
In [
41
]:
In [
43
]:
print
(response.headers[
'Content-Type'
])
application/json; charset=utf-8
In [
44
]:
print
(response.headers.get(
'Content-Type'
))
application/json; charset=utf-8
In [
45
]: url
=
'http://www.baidu.com'
In [
46
]: response
=
requests.get(url,headers
=
headers)
#向baidu请求会有cookies返回,有些site没有cookies
In [
47
]:
print
(response.cookies)
#输出整个cookies
<RequestsCookieJar[<Cookie H_PS_PSSID=1425_21088_24880 for .baidu.com/>, <Cookie BDSVRTM=0 for www.baidu.com/>, <Cookie BD_HOME=0 for www.baidu.com/>]>
In [
48
]:
for
k,v
in
response.cookies.get_dict().items():
#遍历cookies内容
...:
print
(k,v)
...:
H_PS_PSSID
1425_21088_24880
BDSVRTM
0
BD_HOME
0
In [
49
]: cookies
=
{
'c1'
:
'v1'
,
'c2'
:
'v2'
}
In [
50
]: response
=
requests.get(
'http://httpbin.org/cookies'
,cookies
=
cookies)
#携带cookies发送请求
In [
52
]:
print
(response.text)
{
"cookies"
: {
"c1"
:
"v1"
,
"c2"
:
"v2"
}
}
In [
53
]: jar
=
requests.cookies.RequestsCookieJar()
In [
54
]: jar.
set
(
'tasty_cookie'
,
'yum'
, domain
=
'httpbin.org'
, path
=
'/cookies'
)
Out[
54
]: Cookie(version
=
0
, name
=
'tasty_cookie'
, value
=
'yum'
, port
=
None
, port_specified
=
False
, domain
=
'httpbin.org'
, domain_specified
=
True
, domain_initial_dot
=
False
, path
=
'/cookies'
, path_specified
=
True
, secure
=
False
, expires
=
None
, discard
=
True
, comment
=
None
, comment_url
=
None
, rest
=
{
'HttpOnly'
:
None
}, rfc2109
=
False
)
In [
55
]: jar.
set
(
'gross_cookie'
,
'blech'
, domain
=
'httpbin.org'
, path
=
'/elsewhere'
)
Out[
55
]: Cookie(version
=
0
, name
=
'gross_cookie'
, value
=
'blech'
, port
=
None
, port_specified
=
False
, domain
=
'httpbin.org'
, domain_specified
=
True
, domain_initial_dot
=
False
, path
=
'/elsewhere'
, path_specified
=
True
, secure
=
False
, expires
=
None
, discard
=
True
, comment
=
None
, comment_url
=
None
, rest
=
{
'HttpOnly'
:
None
}, rfc2109
=
False
)
In [
56
]: url
=
'http://httpbin.org/cookies'
In [
57
]: response
=
requests.get(url, cookies
=
jar)
In [
58
]:
print
(response.text)
{
"cookies"
: {
"tasty_cookie"
:
"yum"
}
}
|
Cookies are returned in a RequestsCookieJar, which acts like a dict but also offers a more complete interface, suitable for use over multiple domains or paths. Cookie jars can also be passed in to requests, as the example above does with cookies=jar.
1
2
3
4
5
6
7
8
9
10
11
12
|
In [
62
]: url
=
'http://github.com'
In [
64
]: response
=
requests.get(url,allow_redirects
=
True
)
In [
65
]:
print
(response.url)
https://github.com/
In [
66
]: response.history
Out[
66
]: [<Response [
301
]>]
In [
69
]: url
=
'http://httpbin.org/post'
In [
70
]: files
=
{
'file'
:
open
(
'test.txt'
,
'rb'
)}
In [
71
]: response
=
requests.post(url,files
=
files)
#post提交时携带文件
In [
72
]: response.text
Out[
72
]:
'...文件的内容...'
In [
73
]: response
=
requests.get(
'https://github.com'
, timeout
=
5
)
#关于请求超时
|
import json
import requests
from io import BytesIO
from PIL import Image
#1 处理图片
1
2
3
|
# Download the JPEG; the whole body is then available as bytes on r.content.
r = requests.get('http://img.jrjimg.cn/2013/11/20131105065502114.jpg')
# Image.open takes a file-like object, so wrap the raw body in BytesIO
# to build the image straight from the binary content.
raw_bytes = BytesIO(r.content)
image = Image.open(raw_bytes)
# Write the decoded image to 'mm.jpg' in the current working directory.
image.save('mm.jpg')
|
#2 Json 处理json
1
2
3
4
|
# Fetch the endpoint and show its body both as decoded JSON and as raw text.
# NOTE(review): this timeline URL may no longer serve JSON - confirm. If the
# body is not valid JSON, r.json() raises and nothing below is printed.
r = requests.get('https://github.com/timeline.json')
# Response.json is a method and must be called to obtain the decoded payload;
# referencing it without parentheses only yields the bound method object.
decoded = r.json()
print(type(decoded))
print(decoded)
print(r.text)
|
#3 org data 处理源数据
1
2
3
4
|
# Fetch the image and copy the response body to 'mm2.jpg' on disk.
r = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1508166336374&di=ef1073a52a7582f29ffa27c47e95e74e&imgtype=0&src=http%3A%2F%2Fp3.gexing.com%2FG1%2FM00%2F3F%2FDD%2FrBACE1MaezngiEoIAADSr3bccSw151.jpg')
with open('mm2.jpg', 'wb+') as out_file:
    # iter_content(1024) yields the body in pieces of up to 1024 bytes;
    # writelines writes each piece to the file in order.
    out_file.writelines(r.iter_content(1024))
|
#4 Form 处理表单
1
2
3
4
5
|
# Submit the same credentials twice to compare the two body encodings;
# httpbin echoes the request back, so each print shows what was sent.
form = dict(username='user', password='pwd')

# 1) A dict passed as `data` is sent form-encoded.
r = requests.post('http://httpbin.org/post', data=form)
print(r.text)

# 2) A pre-serialized JSON string passed as `data` is sent as the raw body.
form_as_json = json.dumps(form)
r = requests.post('http://httpbin.org/post', data=form_as_json)
print(r.text)
|
二、通过Requests抓取豆瓣电影列表及评分
所以抓取代码如下: