1. The Requests Library
Reference: http://www.python-requests.org/en/master/user/quickstart/#make-a-request
Requests is a very practical Python HTTP client library, used constantly when writing crawlers and when testing how a server responds. It covers essentially everything today's web work requires.
It is normally installed with pip install requests.
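As a quick sanity check after installing, here is a minimal sketch of the most common pattern (it assumes httpbin.org is reachable from your machine):

import requests

# Send a GET request with query parameters and inspect the response.
response = requests.get('http://httpbin.org/get', params={'q': 'test'}, timeout=5)
print(response.status_code)       # 200 on success
print(response.json()['args'])    # httpbin echoes the query parameters back as JSON

The IPython session below walks through the same ideas in more detail.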
In [1]: import requests

In [2]: response = requests.get('https://api.github.com/events')

In [3]: print(response)
<Response [200]>

In [4]: response = requests.post('http://httpbin.org/post', data={'key1': 'values1'})   # used when submitting a form

In [5]: print(response)
<Response [200]>

In [7]: response = requests.put('http://httpbin.org/put', data={'key1': 'values1'})

In [8]: print(response)
<Response [200]>

In [10]: response = requests.delete('http://httpbin.org/delete')

In [11]: print(response)
<Response [200]>

In [13]: response = requests.head('http://httpbin.org/get')

In [14]: print(response)
<Response [200]>

In [15]: response = requests.options('http://httpbin.org/get')

In [16]: print(response)
<Response [200]>

In [17]: payload = {'key1': 'value1', 'key2': 'value2'}

In [18]: response = requests.get('http://httpbin.org/get', params=payload)   # send a GET request carrying query parameters

In [19]: print(response)
<Response [200]>

In [20]: print(response.text)
{
  "args": {
    "key1": "value1",
    "key2": "value2"
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.18.4"
  },
  "origin": "103.215.2.233",
  "url": "http://httpbin.org/get?key1=value1&key2=value2"
}

In [22]: print(response.url)
http://httpbin.org/get?key1=value1&key2=value2

In [23]: payload = {'key1': 'value1', 'key2': ['value2', 'value3']}

In [24]: response = requests.get('http://httpbin.org/get', params=payload)

In [25]: print(response.url)
http://httpbin.org/get?key1=value1&key2=value2&key2=value3

In [27]: response = requests.get('http://api.github.com/events')

In [28]: response.encoding       # character-set encoding
Out[28]: 'utf-8'

In [29]: print(response.text)    # the response body as text
[{"id":"6850814749","type":"CreateEvent","actor":{"id":679017,"login":......

In [30]: print(response.content) # the response body as bytes
b'[{"id":"6850814749","type":"CreateEvent","actor":{"id":679017,"login":".....

In [34]: response.json()

In [36]: response.status_code    # the HTTP status code returned
Out[36]: 200

In [38]: headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    ...:            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    ...:            'Accept-Encoding': 'gzip, deflate, br',
    ...:            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    ...:            'Connection': 'keep-alive'}

In [39]: response = requests.get('https://api.github.com/events', headers=headers)

In [40]: print(response.headers)
{'Server': 'GitHub.com', 'Date': 'Tue, 14 Nov 2017 06:10:31 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Status': '200 OK', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '58', 'X-RateLimit-Reset': '1510642339', 'Cache-Control': 'public, max-age=60, s-maxage=60', 'Vary': 'Accept', 'ETag': 'W/"34b51a08c5a8f4fa2400dd5c0d89221b"', 'Last-Modified': 'Tue, 14 Nov 2017 06:10:31 GMT', 'X-Poll-Interval': '60', 'X-GitHub-Media-Type': 'unknown, github.v3', 'Link': '<https://api.github.com/events?page=2>; rel="next", <https://api.github.com/events?page=10>; rel="last"', 'Access-Control-Expose-Headers': 'ETag, Link, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval', 'Access-Control-Allow-Origin': '*', 'Content-Security-Policy': "default-src 'none'", 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'deny', 'X-XSS-Protection': '1; mode=block', 'X-Runtime-rack': '0.104190', 'Content-Encoding': 'gzip', 'X-GitHub-Request-Id': 'D528:C0F5:6BAAA:E4CB6:5A0A88D6'}

In [43]: print(response.headers['Content-Type'])
application/json; charset=utf-8

In [44]: print(response.headers.get('Content-Type'))
application/json; charset=utf-8

In [45]: url = 'http://www.baidu.com'

In [46]: response = requests.get(url, headers=headers)   # a request to baidu returns cookies; some sites set none

In [47]: print(response.cookies)                         # print the whole cookie jar
<RequestsCookieJar[<Cookie H_PS_PSSID=1425_21088_24880 for .baidu.com/>, <Cookie BDSVRTM=0 for www.baidu.com/>, <Cookie BD_HOME=0 for www.baidu.com/>]>

In [48]: for k, v in response.cookies.get_dict().items():   # iterate over the cookies
    ...:     print(k, v)
    ...:
H_PS_PSSID 1425_21088_24880
BDSVRTM 0
BD_HOME 0

In [49]: cookies = {'c1': 'v1', 'c2': 'v2'}

In [50]: response = requests.get('http://httpbin.org/cookies', cookies=cookies)   # send the request with cookies attached

In [52]: print(response.text)
{
  "cookies": {
    "c1": "v1",
    "c2": "v2"
  }
}

In [53]: jar = requests.cookies.RequestsCookieJar()

In [54]: jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
Out[54]: Cookie(version=0, name='tasty_cookie', value='yum', port=None, port_specified=False, domain='httpbin.org', domain_specified=True, domain_initial_dot=False, path='/cookies', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [55]: jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
Out[55]: Cookie(version=0, name='gross_cookie', value='blech', port=None, port_specified=False, domain='httpbin.org', domain_specified=True, domain_initial_dot=False, path='/elsewhere', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [56]: url = 'http://httpbin.org/cookies'

In [57]: response = requests.get(url, cookies=jar)

In [58]: print(response.text)
{
  "cookies": {
    "tasty_cookie": "yum"
  }
}
Cookies are returned in a RequestsCookieJar, which acts like a dict but also offers a more complete interface, suitable for use over multiple domains or paths. Cookie jars can also be passed in to requests.
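A related convenience, used in the scraping and login examples later in these notes, is requests.Session, which stores cookies set by the server and sends them back automatically on subsequent requests. A minimal sketch, using httpbin's cookie endpoints:

import requests

sess = requests.Session()
sess.get('http://httpbin.org/cookies/set/sessioncookie/123456789')   # the server sets a cookie; the session remembers it
r = sess.get('http://httpbin.org/cookies')                           # the cookie is sent back automatically
print(r.text)   # {"cookies": {"sessioncookie": "123456789"}}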
In [62]: url = 'http://github.com'

In [64]: response = requests.get(url, allow_redirects=True)

In [65]: print(response.url)
https://github.com/

In [66]: response.history
Out[66]: [<Response [301]>]

In [69]: url = 'http://httpbin.org/post'

In [70]: files = {'file': open('test.txt', 'rb')}

In [71]: response = requests.post(url, files=files)    # upload a file as part of a POST request

In [72]: response.text
Out[72]: '...the contents of the file...'

In [73]: response = requests.get('https://github.com', timeout=5)    # request timeout, in seconds
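A timeout is usually paired with exception handling: requests raises requests.exceptions.Timeout when the deadline is exceeded, and requests.exceptions.RequestException covers the other failure modes. A minimal sketch:

import requests

try:
    response = requests.get('https://github.com', timeout=5)
    response.raise_for_status()                     # raise an HTTPError for 4xx/5xx responses
except requests.exceptions.Timeout:
    print('the request timed out')
except requests.exceptions.RequestException as e:
    print('the request failed:', e)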
import json
import requests
from io import BytesIO
from PIL import Image

#1 Images

r = requests.get('http://img.jrjimg.cn/2013/11/20131105065502114.jpg')
image = Image.open(BytesIO(r.content))    # build an image object from the binary response body
image.save('mm.jpg')
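It can be worth confirming that the response really is an image before handing r.content to Pillow; a small sketch of that check, reusing the request above:

r = requests.get('http://img.jrjimg.cn/2013/11/20131105065502114.jpg')
if r.status_code == 200 and r.headers.get('Content-Type', '').startswith('image/'):
    Image.open(BytesIO(r.content)).save('mm.jpg')
else:
    print('not an image:', r.status_code, r.headers.get('Content-Type'))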
#2 JSON

r = requests.get('https://github.com/timeline.json')
print(type(r.json()))    # json() is a method, so it must be called to get the parsed data
print(r.json())
print(r.text)
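r.json() parses the body with a JSON decoder and raises a ValueError when the body is not valid JSON, so it is often wrapped defensively. Also note that github.com/timeline.json is a long-deprecated endpoint, so this sketch uses api.github.com/events instead:

r = requests.get('https://api.github.com/events')
try:
    data = r.json()
    print(type(data), len(data))    # a list of event dicts
except ValueError:
    print('response body was not valid JSON:', r.text[:100])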
#3 Raw data

r = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1508166336374&di=ef1073a52a7582f29ffa27c47e95e74e&imgtype=0&src=http%3A%2F%2Fp3.gexing.com%2FG1%2FM00%2F3F%2FDD%2FrBACE1MaezngiEoIAADSr3bccSw151.jpg')
with open('mm2.jpg', 'wb+') as f:
    for chunk in r.iter_content(1024):
        f.write(chunk)
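For large downloads, iter_content() is usually combined with stream=True so the whole body is not pulled into memory first. A sketch (httpbin's sample image endpoint is used here just as a placeholder):

r = requests.get('http://httpbin.org/image/jpeg', stream=True)    # the body is fetched lazily
with open('large_file.jpg', 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:                                                 # skip keep-alive chunks
            f.write(chunk)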
#4 Forms

form = {'username': 'user', 'password': 'pwd'}
r = requests.post('http://httpbin.org/post', data=form)                # sent as form-encoded fields
print(r.text)
r = requests.post('http://httpbin.org/post', data=json.dumps(form))    # sent as a raw JSON string in the request body
print(r.text)
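requests also accepts a json= keyword argument that serializes the dict and sets the Content-Type: application/json header for you, so the second call above can also be written as:

r = requests.post('http://httpbin.org/post', json=form)
print(r.json()['json'])    # httpbin echoes the parsed JSON body back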
2. Scraping the Douban Top 250 movie list and ratings with Requests
The scraping code is as follows:

import requests
from lxml import etree

sess = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
           'Accept-Encoding': 'gzip, deflate, br',
           'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
           'Connection': 'keep-alive'}

for id in range(0, 250, 25):                            # 10 pages, 25 movies per page
    url = 'https://movie.douban.com/top250/?start=' + str(id)
    r = sess.get(url, headers=headers)
    r.encoding = 'utf-8'
    #fname = "movie" + str(id) + ".txt"
    #with open(fname, "wb+") as f:
    #    f.write(r.content)
    root = etree.HTML(r.content)                        # parse the HTML document with the lxml parser
    items = root.xpath('//ol/li/div[@class="item"]')    # one <div class="item"> per movie
    for item in items:
        title = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        name = title[0].encode('gb2312', 'ignore').decode('gb2312')    # drop characters a GBK console cannot display
        rating = item.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
        print(name, rating)
3. BeautifulSoup
The BeautifulSoup module takes an HTML or XML string, parses it into a document tree, and then provides methods for quickly locating specific elements, which makes searching HTML or XML documents very easy. Beautiful Soup supports the HTML parser in the Python standard library as well as several third-party parsers; if no third-party parser is installed, the built-in one is used. The common parsers are lxml, html5lib and html.parser, of which lxml is the most capable and the fastest, and is the one recommended to install.
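The parser is chosen by the second argument to the BeautifulSoup constructor. A short sketch comparing the three common choices (all three parsers need to be installed for this to run):

from bs4 import BeautifulSoup

html = '<html><body><p>hello</p></body></html>'
soup_lxml = BeautifulSoup(html, 'lxml')           # fast and lenient, requires the lxml package
soup_html5 = BeautifulSoup(html, 'html5lib')      # parses the way a browser does, requires the html5lib package
soup_std = BeautifulSoup(html, 'html.parser')     # standard library, no extra dependency
print(soup_lxml.p.string, soup_html5.p.string, soup_std.p.string)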
from bs4 import BeautifulSoup

soup = BeautifulSoup(open('test.html'), 'lxml')    # this form is handy for parsing a local file
print(soup.prettify())                             # pretty-print the parsed document
#1 Tag

print(type(soup.title))
print(soup.title)
print(soup.title.name)
#2 String
print(type(soup.title.string))
print(soup.title.string)
#3 Comment
print(type(soup.a.string))
print(soup.a.string)
for item in soup.body.contents:
    print(item.name)
#4 CSS query
print(soup.select('.sister'))
print(soup.select('#link1'))
print(soup.select('head > title'))
a_s = soup.select('a')
for a in a_s:
    print(a)
Example:

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story总共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, features="lxml")
tag1 = soup.find(name='a')        # find the first <a> tag
tag2 = soup.find_all(name='a')    # find all <a> tags
tag3 = soup.select('#link2')      # find the tag with id="link2"
print(tag1.name)            # prints: a
print(tag1.attrs)           # prints the attribute dict: {'class': ['sister0'], 'id': 'link1'}
tag1.attrs['id'] = 'link01'
print(tag1.attrs)           # prints: {'class': ['sister0'], 'id': 'link01'}
print(tag1.has_attr('id'))  # prints: True
print(tag1.get_text('id'))  # prints: Elsidfidie (the argument is used as a separator between text fragments)
tag1.name = 'soup'          # rename the tag
print(tag2)                 # prints: [<a class="sister0" id="link1">Els<span>f</span>ie</a>, ......]
print(tag2[0].name)         # prints: soup (tag2[0] is the same object as tag1)
# decode() converts a tag to a string, including the tag itself; decode_contents() converts only what is inside the tag

print(tag2[1])                  # prints: <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
print(type(tag2[1]))            # prints: <class 'bs4.element.Tag'>
print(tag2[1].decode())         # prints: <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
print(type(tag2[1].decode()))   # prints: <class 'str'>
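And a quick sketch of decode_contents(), which leaves the surrounding tag out:

print(tag2[1].decode_contents())         # prints: Lacie
print(type(tag2[1].decode_contents()))   # prints: <class 'str'>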
# encode() converts a tag to bytes, including the tag itself; encode_contents() converts only what is inside the tag

print(tag2[1].encode())         # prints: b'<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>'
print(type(tag2[1].encode()))   # prints: <class 'bytes'>
print(tag2[1].get_text())       # prints: Lacie
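Correspondingly, encode_contents() gives the bytes without the surrounding tag:

print(tag2[1].encode_contents())         # prints: b'Lacie'
print(type(tag2[1].encode_contents()))   # prints: <class 'bytes'>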
body = soup.find(name='body')
childs = body.children          # all direct children of the tag
print(childs)                   # prints: <list_iterator object at 0x10349b9e8>
for tag in childs:
    print(tag)
body = soup.find(name='body')
descs = body.descendants        # all descendants, recursively
print(descs)                    # prints: <generator object descendants at 0x106327360>
for des in descs:
    print(des)
body = soup.find(name='body')
body.clear()                    # empty out everything inside the tag, keeping the tag itself
print(soup)
body = soup.find(name='body')
body.decompose()                # recursively remove the tag and everything inside it
print(soup)
body = soup.find(name='body')
d = body.extract()              # remove the tag and everything inside it, and return what was removed
print(soup)
print(d)
body = soup.find(name='body')
index = body.index(body.find('div'))
print(index)                    # prints: 1 ; the position of the given child within this tag's contents
br = soup.find(name='br')
test = br.is_empty_element      # True if the tag is one of the void elements: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'
print(test)                     # prints: True
span = soup.find('span')
print(span)                     # prints: <span>f</span>
print(span.string)              # prints: f
span.string = 'yeecall.com'     # set the tag's string
print(span.string)              # prints: yeecall.com
body = soup.find(name='body')
texts = body.stripped_strings   # recursively collect the text of every tag, with whitespace stripped
print(texts)                    # prints: <generator object stripped_strings at 0x107311360>
for text in texts:
    print(text)
# Examples of CSS selectors with select()

soup.select("title")
soup.select("p:nth-of-type(3)")
soup.select("body a")
soup.select("html head title")
tag = soup.select("span,a")
soup.select("head > title")
soup.select("p > a")
soup.select("p > a:nth-of-type(2)")
soup.select("p > #link1")
soup.select("body > a")
soup.select("#link1 ~ .sister")
soup.select("#link1 + .sister")
soup.select(".sister")
soup.select("[class~=sister]")
soup.select("#link1")
soup.select("a#link2")
soup.select('a[href]')
soup.select('a[href="http://example.com/elsie"]')
soup.select('a[href^="http://example.com/"]')
soup.select('a[href$="tillie"]')
soup.select('a[href*=".com/el"]')
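select() always returns a list of matching Tag objects (possibly empty); when only the first match is wanted, select_one() returns a single Tag or None. For example, against a freshly parsed copy of the html_doc above:

first_sister = soup.select_one('.sister')    # a single Tag, or None when nothing matches
all_sisters = soup.select('.sister')         # always a list
print(first_sister['href'] if first_sister else 'no match', len(all_sisters))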
4. Logging in to Douban with requests and BeautifulSoup
In the page source, the login form carries the form_email and form_password fields, and the captcha (when one is shown) is an <img> element with id="captcha_image"; these are the parts the code below targets. The login code is as follows:
import requests
import html5lib                  # not used directly, but BeautifulSoup's 'html5lib' parser needs it installed
import re
from bs4 import BeautifulSoup

sess = requests.Session()
url_login = 'https://accounts.douban.com/login'

formdata = {
    'redir': 'https://www.douban.com',
    'source': 'index_nav',
    'form_email': '******@*****.com',
    'form_password': '*********',
    'login': u'登录'             # the submit button's label ("Log in")
}
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

r = sess.post(url_login, data=formdata, headers=headers)
content = r.text
soup = BeautifulSoup(content, 'html5lib')
captcha = soup.find('img', id='captcha_image')      # a captcha is only shown some of the time
if captcha:
    print(captcha)
    captcha_url = captcha['src']
    #re_captcha_id = r'id="(.*?)"&'
    #captcha_id = re.findall(re_captcha_id, captcha)
    captcha_id = re.findall(r'(id=)(.*)(&)', captcha_url)
    captcha_id = captcha_id[0][1]
    print(captcha_url)
    print(captcha_id)
    captcha_text = input('Please input the captcha:')
    formdata['captcha-solution'] = captcha_text
    formdata['captcha-id'] = captcha_id
    print(formdata)
    r = sess.post(url_login, data=formdata, headers=headers)
with open('contacts.txt', 'w+', encoding='utf-8') as f:
    f.write(r.text)
The above are just my personal study notes; pointers and corrections from more experienced readers are very welcome.