The project needs more nicknames scraped~~~ code first:
#coding=utf-8
import urllib2
from bs4 import BeautifulSoup
import time
import sys

# Let Python 2 write the UTF-8 nicknames to file without UnicodeEncodeError
reload(sys)
sys.setdefaultencoding('utf-8')

# Fetch the raw HTML of a page (url can also be a urllib2.Request)
def getHtml(url):
    page = urllib2.urlopen(url)
    html = page.read()
    # print html
    return html

# Pick the nicknames out of the page and append them to a local file
# (the name getImg appears to be left over from an image-scraping script)
def getImg(html):
    soup = BeautifulSoup(html, 'html.parser')
    dls = soup.find_all('dl', attrs={'class': 'feed_list'})
    for index in range(len(dls)):
        p = dls[index].find_all('p')[0]
        print p.text
    f = open("nichengnan.txt", "a")
    for index in range(len(dls)):
        nicheng = dls[index].find_all('p')[0].text
        f.write(nicheng)
        f.write('\r\n')
    f.close()

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12'
headers = {"User-Agent": user_agent}
page = 201
while page < 231:
    url = 'http://www.qzone.cc/wangming/fav/list_' + str(page) + '.html'
    print "Crawling page " + str(page) + "......."
    request = urllib2.Request(url, headers=headers)
    html = getHtml(request)
    getImg(html)
    page = page + 1
    time.sleep(0.5)  # be polite: throttle requests a little

f = open('nichengnan.txt', 'r')
lines = f.readlines()
print "Collected " + str(len(lines)) + " nicknames in total"
f.close()
The URL being crawled:

http://www.qzone.cc/wangming/day/list_1.html
Basically the same as the previous post: once you spot the pattern, scrape directly with BeautifulSoup. The difference is that here the range of page numbers to crawl is typed in by hand, so there is no need to copy the URLs one by one like last time. Still, hand-editing the URL range is a bit clumsy. One idea for later: analyze how the "next page" link on the last page differs from the one on earlier pages, and use that to stop automatically; a rough sketch of the idea follows. I'll see later whether it pans out; this will do for now.
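As a starting point for that follow-up, here is a minimal sketch of the follow-the-next-page-link approach. The selector is an assumption: it presumes the pager renders the next-page link as an <a> tag whose text is 下一页 and that the link is simply absent on the last page; the real markup on www.qzone.cc would need to be checked in the browser's dev tools first. crawl_all is a hypothetical helper, and getImg is reused from the script above.

#coding=utf-8
import time
import urllib2
import urlparse
from bs4 import BeautifulSoup

def crawl_all(start_url, headers):
    url = start_url
    page_no = 1
    while url:
        print "Crawling page " + str(page_no) + ": " + url
        request = urllib2.Request(url, headers=headers)
        html = urllib2.urlopen(request).read()
        getImg(html)  # reuse the nickname-extraction function from above
        soup = BeautifulSoup(html, 'html.parser')
        # Assumed markup: an <a> whose text is u'下一页' points at the next
        # page and is missing on the last page. Verify before relying on it.
        nxt = soup.find('a', text=u'下一页')
        url = urlparse.urljoin(url, nxt['href']) if nxt else None
        page_no += 1
        time.sleep(0.5)  # same throttling as the main script

If that assumption holds, the hard-coded "while page < 231" loop could be replaced by a single call like crawl_all('http://www.qzone.cc/wangming/fav/list_1.html', headers).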
Reposted from 努力的C's 51CTO blog. Original link: http://blog.51cto.com/fulin0532/1750049