Requests 与 BeautifulSoup 模块

简介:

一、Requests

参考 :http://www.python-requests.org/en/master/user/quickstart/#make-a-request

Requests是一个很实用的Python HTTP客户端库,编写爬虫和测试服务器响应数据时经常会用到。Requests 完全满足如今网络的需求。

安装方式一般采用 pip install requests

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
In [ 1 ]:  import  requests
In [ 2 ]: response = requests.get( 'https://api.github.com/events' )
In [ 3 ]:  print (response)
<Response [ 200 ]>
In [ 4 ]: response = requests.post( 'http://httpbin.org/post' ,data = { 'key1' : 'values1' })          #提交表单时使用
In [ 5 ]:  print (response)
<Response [ 200 ]>
In [ 7 ]: response = requests.put( 'http://httpbin.org/put' ,data = { 'key1' : 'values1' })
In [ 8 ]:  print (response)
<Response [ 200 ]>
In [ 10 ]: response = requests.delete( 'http://httpbin.org/delete' )
In [ 11 ]:  print (response)
<Response [ 200 ]>
In [ 13 ]: response = requests.head( 'http://httpbin.org/get' )
In [ 14 ]:  print (response)
<Response [ 200 ]>
In [ 15 ]: response = requests.options( 'http://httpbin.org/get' )  
In [ 16 ]:  print (response)
<Response [ 200 ]>
In [ 17 ]: payload = { 'key1' : 'value1' , 'key2' : 'value2' }
In [ 18 ]: response = requests.get( 'http://httpbin.org/get' ,params = payload)    #携带参数发送get请求
In [ 19 ]:  print (response)
<Response [ 200 ]>
In [ 20 ]:  print (response.text)
{
  "args": {
    "key1": "value1",
    "key2": "value2"
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.18.4"
  },
  "origin": "103.215.2.233",
  "url": "http://httpbin.org/get?key1=value1&key2=value2"
}
In [ 22 ]:  print (response.url)
http://httpbin.org/get?key1=value1&key2=value2
In [ 23 ]: payload = { 'key1' : 'value1' , 'key2' :[ 'value2' , 'value3' ]}
In [ 24 ]: response = requests.get( 'http://httpbin.org/get' ,params = payload)
In [ 25 ]:  print (response.url)
http://httpbin.org/get?key1=value1&key2=value2&key2=value3
In [ 27 ]: response = requests.get( 'http://api.github.com/events' )
In [ 28 ]: response.encoding               #字符集编码
Out[ 28 ]:  'utf-8'
In [ 29 ]:  print (response.text)   #文件信息
[{ "id" : "6850814749" , "type" : "CreateEvent" , "actor" :{ "id" : 679017 , "login" :......
In [ 30 ]:  print (response.content)         #二进制格式信息
b'[{ "id" : "6850814749" , "type" : "CreateEvent" , "actor" :{ "id" : 679017 , "login" :".....
In [ 34 ]: response.json()
In [ 36 ]: response.status_code            #返回状态码
Out[ 36 ]:  200
In [ 38 ]: headers = { 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36' ,
     ...:  'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' ,
     ...:  'Accept-Encoding' : 'gzip, deflate, br' , 'Accept-Language' : 'zh-CN,zh;q=0.9,en;q=0.8' , 'Connection' : 'keep-alive' }
In [ 39 ]: response = requests.get( 'https://api.github.com/events' ,headers = headers)
In [ 40 ]:  print (response.headers)
{'Server': 'GitHub.com', 'Date': 'Tue, 14 Nov 2017 06:10:31 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Status': '200 OK', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '58', 'X-RateLimit-Reset': '1510642339', 'Cache-Control': 'public, max-age=60, s-maxage=60', 'Vary': 'Accept', 'ETag': 'W/"34b51a08c5a8f4fa2400dd5c0d89221b"', 'Last-Modified': 'Tue, 14 Nov 2017 06:10:31 GMT', 'X-Poll-Interval': '60', 'X-GitHub-Media-Type': 'unknown, github.v3', 'Link': '<https://api.github.com/events?page=2>; rel="next", <https://api.github.com/events?page=10>; rel="last"', 'Access-Control-Expose-Headers': 'ETag, Link, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval', 'Access-Control-Allow-Origin': '*', 'Content-Security-Policy': "default-src 'none'", 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'deny', 'X-XSS-Protection': '1; mode=block', 'X-Runtime-rack': '0.104190', 'Content-Encoding': 'gzip', 'X-GitHub-Request-Id': 'D528:C0F5:6BAAA:E4CB6:5A0A88D6'}
In [ 41 ]:
In [ 43 ]:  print (response.headers[ 'Content-Type' ])
application/json; charset=utf-8
In [ 44 ]:  print (response.headers.get( 'Content-Type' ))
application/json; charset=utf-8
In [ 45 ]: url = 'http://www.baidu.com'
In [ 46 ]: response = requests.get(url,headers = headers)            #向baidu请求会有cookies返回,有些site没有cookies
In [ 47 ]:  print (response.cookies)                               #输出整个cookies
<RequestsCookieJar[<Cookie H_PS_PSSID=1425_21088_24880 for .baidu.com/>, <Cookie BDSVRTM=0 for www.baidu.com/>, <Cookie BD_HOME=0 for www.baidu.com/>]>
In [ 48 ]:  for  k,v  in  response.cookies.get_dict().items():       #遍历cookies内容
     ...:      print (k,v)
     ...:
H_PS_PSSID  1425_21088_24880
BDSVRTM  0
BD_HOME  0
In [ 49 ]: cookies = { 'c1' : 'v1' , 'c2' : 'v2' }
In [ 50 ]: response = requests.get( 'http://httpbin.org/cookies' ,cookies = cookies)   #携带cookies发送请求
In [ 52 ]:  print (response.text)
{
  "cookies": {
    "c1": "v1",
    "c2": "v2"
  }
}
In [ 53 ]: jar  =  requests.cookies.RequestsCookieJar()
In [ 54 ]: jar.set( 'tasty_cookie' , 'yum' , domain = 'httpbin.org' , path = '/cookies' )
Out[ 54 ]: Cookie(version=0, name='tasty_cookie', value='yum', port=None, port_specified=False, domain='httpbin.org', domain_specified=True, domain_initial_dot=False, path='/cookies', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
In [ 55 ]: jar.set( 'gross_cookie' , 'blech' , domain = 'httpbin.org' , path = '/elsewhere' )
Out[ 55 ]: Cookie(version=0, name='gross_cookie', value='blech', port=None, port_specified=False, domain='httpbin.org', domain_specified=True, domain_initial_dot=False, path='/elsewhere', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
In [ 56 ]: url  =  'http://httpbin.org/cookies'
In [ 57 ]: response  =  requests.get(url, cookies = jar)
In [ 58 ]:  print (response.text)
{
  "cookies": {
    "tasty_cookie": "yum"
  }
}

Cookies are returned in a RequestsCookieJar, which acts like a dict but also offers a more complete interface, suitable for use over multiple domains or paths. Cookie jars can also be passed in to requests.

1
2
3
4
5
6
7
8
9
10
11
12
In [ 62 ]: url = 'http://github.com'
In [ 64 ]: response = requests.get(url,allow_redirects = True )
In [ 65 ]:  print (response.url)
https://github.com/
In [ 66 ]: response.history
Out[ 66 ]: [<Response [ 301 ]>]
In [ 69 ]: url  =  'http://httpbin.org/post'
In [ 70 ]: files  =  { 'file' : open ( 'test.txt' , 'rb' )}
In [ 71 ]: response = requests.post(url,files = files)                  #post提交时携带文件
In [ 72 ]: response.text
Out[ 72 ]:  '...文件的内容...'
In [ 73 ]: response = requests.get( 'https://github.com' , timeout = 5 )    #关于请求超时


import json

import requests

from io import BytesIO

from PIL import Image

#1 处理图片

1
2
3
# Download the picture; the response body holds the raw image bytes.
r = requests.get('http://img.jrjimg.cn/2013/11/20131105065502114.jpg')
# Wrap the binary body in an in-memory file object so Pillow can decode it.
raw_bytes = BytesIO(r.content)
image = Image.open(raw_bytes)
image.save('mm.jpg')

#2 Json 处理json

1
2
3
4
# Fetch a document and decode its JSON body.
r = requests.get('https://github.com/timeline.json')
# `json` is a method of the response object and has to be called; without the
# parentheses the code only prints the bound method, not the decoded data.
# Note: the call raises if the body is not valid JSON.
decoded = r.json()
print(type(decoded))
print(decoded)
# The undecoded body text, for comparison with the parsed structure above.
print(r.text)

#3 org data 处理源数据

1
2
3
4
# Save the response body to disk chunk by chunk.
# stream=True defers downloading the body until iter_content() is consumed;
# without it the whole body is already buffered in memory before the loop.
r = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1508166336374&di=ef1073a52a7582f29ffa27c47e95e74e&imgtype=0&src=http%3A%2F%2Fp3.gexing.com%2FG1%2FM00%2F3F%2FDD%2FrBACE1MaezngiEoIAADSr3bccSw151.jpg', stream=True)
# Plain 'wb' suffices: the file is only written here, never read back.
with open('mm2.jpg', 'wb') as f:
    for chunk in r.iter_content(1024):
        f.write(chunk)

#4 Form 处理表单

1
2
3
4
5
# Send the same credentials twice to compare the two ways of passing `data`.
form = dict(username='user', password='pwd')
post_url = 'http://httpbin.org/post'

# First request: the dict itself is passed as `data`.
r = requests.post(post_url, data=form)
print(r.text)

# Second request: the dict is serialised to a JSON string before being passed.
serialised = json.dumps(form)
r = requests.post(post_url, data=serialised)
print(r.text)

二、通过Requests抓取豆瓣电影列表及评分

a5a453ee720a9c6b8131ca5a78fdc9fb.jpg

所以抓取代码如下: