1.1 Introduction
Scraping works by filtering and analyzing HTML source to retrieve resources such as files and images. The usual tools are:
the urllib and urllib2 modules
regular expressions (the re module)
the requests module
the Scrapy framework
The urllib/urllib2 libraries can:
1) fetch web pages
2) authenticate against a remote HTTP server
3) submit forms (GET and POST)
4) handle exceptions (urllib2.URLError)
5) speak non-HTTP protocols (e.g. FTP)
Fetching a page:
urllib2.urlopen(url, data, timeout)
Building a Request:
request = urllib2.Request(url, data, headers={})
response = urllib2.urlopen(request)
response.read()
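Putting those calls together, a minimal sketch (the URL is a placeholder; any page works):

#!/usr/bin/env python
import urllib2

url = "http://www.example.com"    # placeholder target

# Simple form: open the URL directly
html = urllib2.urlopen(url, timeout=10).read()

# Request form: lets you attach headers (e.g. a User-Agent) before opening
request = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
response = urllib2.urlopen(request)
print response.read()[:200]       # first 200 bytes of the page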
1.2 Hands-on 1: Scraping Images
Source: http://tieba.baidu.com/p/4229162765 (a Baidu Tieba thread)
1) Search the page source for the data, to work out how to extract the image URLs.
The URL of one image we want to download:
<img class="BDE_Image" src="http://imgsrc.baidu.com/forum/w%3D580/sign=d51025efb5fb43161a1f7a7210a54642/3887e950352ac65cf8452357fcf2b21193138a56.jpg" size="76453" width="400" height="600" style="cursor: url("http://tb2.bdstatic.com/tb/static-pb/img/cur_zin.cur"), pointer;">
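The src attribute is what the script below captures. A quick demonstration of why the non-greedy (.*?) matters (the tag is shortened to an illustrative sample):

import re

# Shortened, illustrative tag; the real one carries more attributes
sample = '<img class="BDE_Image" src="http://imgsrc.baidu.com/forum/example.jpg" size="76453" width="400">'

print re.findall(r'src="(.*?)"', sample)
# ['http://imgsrc.baidu.com/forum/example.jpg']  -- non-greedy stops at the nearest quote

print re.findall(r'src="(.*)"', sample)
# ['http://imgsrc.baidu.com/forum/example.jpg" size="76453" width="400']  -- greedy runs to the last quote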
2) The script
#! /usr/bin/env python
import urllib, urllib2
import re

def getHtml(url):
    page = urllib2.urlopen(url)
    return page.read()

def getImage(html):
    re_img = re.compile(r'<img class="BDE_Image" src="(.*?)".*?')
    # The ? in (.*?) makes the match non-greedy: it stops at the nearest
    # closing double quote instead of running on to the last one
    img_list = re_img.findall(html)
    i = 1
    for imgurl in img_list:
        print imgurl
        urllib.urlretrieve(imgurl, filename="%s.jpg" % i)
        # urllib.urlretrieve downloads the file; here each image is saved
        # in the current directory as 1.jpg, 2.jpg, ...
        i = i + 1

if __name__ == "__main__":
    url = "http://tieba.baidu.com/p/4229162765"
    html = getHtml(url)
    getImage(html)
Output:
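A side note: urllib.urlretrieve also accepts a reporthook callback if you want download progress. A minimal sketch (the URL and filename are placeholders):

import urllib

def report(blocknum, blocksize, totalsize):
    # Called after each block: blocks fetched so far, block size, total bytes
    if totalsize > 0:
        percent = min(100, blocknum * blocksize * 100 / totalsize)
        print "downloaded %d%%" % percent

urllib.urlretrieve("http://www.example.com/some.jpg",   # placeholder URL
                   filename="some.jpg", reporthook=report)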
1.3 Hands-on 2: Scraping Text
Source: https://www.qiushibaike.com/ (trending jokes on Qiushibaike)
Page 1: https://www.qiushibaike.com/
Page 2: https://www.qiushibaike.com/8hr/page/2/
Page 3: https://www.qiushibaike.com/8hr/page/3/
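The page number simply slots into the path. A hypothetical helper capturing the pattern (the scripts below build the URL inline the same way, and assume /8hr/page/1 serves the same listing as the front page):

def pageUrl(page_num=1):
    # page_num slots straight into the path
    return "https://www.qiushibaike.com/8hr/page/" + str(page_num)

print pageUrl(2)   # https://www.qiushibaike.com/8hr/page/2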
1) Analyze the page source
Pick a joke at random, inspect its element, and locate the author, content, and vote count in the markup; those positions drive the regular expression below.
Author position: (screenshot)
Content and vote-count position: (screenshot)
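Since the screenshots are not reproduced here, a simplified, hypothetical fragment shows the shape of markup the regular expression expects and what its three groups capture:

import re

# Simplified, hypothetical fragment; the real page nests these more deeply
html = '''
<div class="author clearfix">
  <a href="/users/1/"><img src="a.jpg" alt="SomeUser"></a>
</div>
<div class="content"><span>First line<br/>second line</span></div>
<i class="number">1024</i>
'''
re_page = re.compile(r'<div class="author.*?<a.*?<img.*?alt="(.*?)">.*?'
                     r'<div.*?<span>(.*?)</span>.*?<i class="number">(\d+)</i>', re.S)
print re_page.findall(html)
# [('SomeUser', 'First line<br/>second line', '1024')]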
2) The code
>>>>>> Script version 1 <<<<<<<<<<
#!/usr/bin/env python
import urllib, urllib2
import re

page = 1
url = "https://www.qiushibaike.com/8hr/page/" + str(page)
headers = {"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
# Most regular sites expect a User-Agent header on requests
try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    html = response.read()
except urllib2.URLError, e:
    # e is the raised exception instance; print its code and reason
    # attributes if they exist
    if hasattr(e, "code"):
        print e.code
    if hasattr(e, "reason"):
        print e.reason

re_page = re.compile(r'<div class="author.*?<a.*?<img.*?alt="(.*?)">.*?<div.*?<span>(.*?)</span>.*?<i class="number">(\d+)</i>', re.S)
# re.S makes the dot match newlines as well
items = re_page.findall(html)
for item in items:
    for i in item:
        print i
Output:
>>>>>> Update: script version 2 <<<<<<<<<<
Replace the HTML line breaks (<br/>) inside the content, strip the surrounding whitespace, and default the displayed page to page 1.
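The two clean-up steps in isolation (a toy sketch; the full script follows):

import re

content = "  hello<br/>world  "
content = re.sub(r'<br/>', '\n', content)   # turn HTML breaks into real newlines
print content.strip()                       # strip() trims surrounding whitespace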
#!/usr/bin/env python
#coding:utf-8
import urllib, urllib2
import re

def getPage(page_num=1):
    url = "https://www.qiushibaike.com/8hr/page/" + str(page_num)
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        html = response.read()
        return html
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print("Failed to connect to the server, error code %s" % e.code)
            return None
        if hasattr(e, "reason"):
            print("Failed to connect to the server, reason: %s" % e.reason)
            return None

def getPageCoent(page_num=1):
    html = getPage(page_num)
    re_page = re.compile(r'<div class="author.*?<a.*?<img.*?alt="(.*?)">.*?<div.*?<span>(.*?)</span>.*?<i class="number">(\d+)</i>', re.S)
    items = re_page.findall(html)
    page_contents = []                           # empty list to hold the extracted entries
    replaceBR = re.compile(r'<br/>')
    for item in items:
        content = item[1]
        content = replaceBR.sub('\n', content)   # replace <br/> with real newlines
        page_contents.append([page_num,
                              item[0].strip(),   # strip surrounding whitespace
                              content.strip(),
                              item[2].strip()])
    return page_contents

if __name__ == "__main__":
    page_content = getPageCoent(1)
    for item in page_content:
        for i in item:
            print str(i) + "\n"
Output:
>>>>>> Further update: script version 3 <<<<<<<<<
Make the scraping interactive:
each press of the Enter key shows one joke, with the page number, author, content, and vote count.
#!/usr/bin/env python
#coding:utf-8
import urllib, urllib2
import re
import sys

def getPage(page_num=1):
    url = "https://www.qiushibaike.com/8hr/page/" + str(page_num)
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        html = response.read()
        return html
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print("Failed to connect to the server, error code %s" % e.code)
            return None
        if hasattr(e, "reason"):
            print("Failed to connect to the server, reason: %s" % e.reason)
            return None

def getPageCoent(page_num=1):
    html = getPage(page_num)
    re_page = re.compile(r'<div class="author.*?<a.*?<img.*?alt="(.*?)">.*?<div.*?<span>(.*?)</span>.*?<i class="number">(\d+)</i>', re.S)
    items = re_page.findall(html)
    page_contents = []
    replaceBR = re.compile(r'<br/>')
    for item in items:
        content = item[1]
        content = replaceBR.sub('\n', content)
        page_contents.append([page_num,
                              item[0].strip(),
                              content.strip(),
                              item[2].strip()])
    return page_contents

def getOneStory(page_contents):
    # Show one joke per Enter key press; q or Q quits
    for story in page_contents:
        user_input = raw_input()
        if user_input == "q" or user_input == "Q":
            sys.exit()
        print "Page %d\tPosted by: %s\tVotes: %s\n%s\n" % (story[0], story[1], story[3], story[2])

if __name__ == "__main__":
    print "Reading jokes. Press Enter for a new one, q or Q to quit."
    num = 1
    while True:
        page_contents = getPageCoent(num)
        getOneStory(page_contents)
        num += 1
Output:
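One caveat with version 3: once the site stops serving pages (or a request fails), getPage returns None and re.findall inside getPageCoent raises a TypeError. A minimal guard at the top of getPageCoent, sketched against the code above:

def getPageCoent(page_num=1):
    html = getPage(page_num)
    if html is None:       # request failed or the site ran out of pages
        return []          # empty list: getOneStory simply yields nothing instead of crashing
    # ... rest of the function unchanged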