1. Environment: Windows 7 x64, Python 2.7.
2. Sample code:
#encoding: utf-8
#author: walker
#date: 2014-11-26
#summary: use BeautifulSoup to fetch URLs and their text
import sys, re, requests, urllib
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf8')

# Given a keyword, fetch the Baidu search results
def GetList(keyword):
    keyword = unicode(keyword, 'gb18030')
    dic = {'wd': keyword}
    urlwd = urllib.urlencode(dic)
    print(urlwd)
    sn = requests.Session()
    url = 'http://www.baidu.com/s?ie=utf-8&csq=1&pstg=22&mod=2&isbd=1&cqid=9c0f47b700036f17&istc=8560&ver=0ApvSgUI_ODaje7cp4DVye9X2LZqWiCPEIS&chk=54753dd5&isid=BD651248E4C31919&'
    url += urlwd
    url += '&ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&rsv_pq=b05765d70003b6c0&rsv_t=ce54Z5LOdER%2Fagxs%2FORKVsCT6cE0zvMTaYpqpgprhExMhsqDACiVefXOze4&_ck=145469.1.129.57.22.735.37'
    r = sn.get(url=url)
    soup = BeautifulSoup(r.content)  # r.text is likely to garble the Chinese text
    rtn = soup.find('div', id='content_left').find_all(name='a', href=re.compile('baidu.com'))
    for item in rtn:
        print(item.getText().encode('gb18030'))
        print(item['href'])

if __name__ == '__main__':
    keyword = '正则表达式'
    GetList(keyword)
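The long query string in the script above was captured from one particular Baidu session, so it will not work unchanged for other readers. As a minimal sketch under the same Python 2.7 environment (assuming a bare 'wd'/'ie' query is enough for the search itself; GetListSimple is a made-up name, not part of the original script), requests can build and encode the query string itself through the params argument of Session.get, which avoids hand-concatenating the URL:

# encoding: utf-8
# Minimal sketch (assumptions: a bare 'wd' query suffices, and the result page
# still has a div with id 'content_left' as in the script above).
import re, requests
from bs4 import BeautifulSoup

def GetListSimple(keyword):                  # hypothetical variant of GetList above
    sn = requests.Session()
    r = sn.get('http://www.baidu.com/s',
               params={'wd': keyword, 'ie': 'utf-8'})  # requests URL-encodes the dict itself
    soup = BeautifulSoup(r.content)          # parse raw bytes, as in the original script
    div = soup.find('div', id='content_left')
    if div is None:                          # Baidu's markup may have changed since 2014
        return
    for item in div.find_all('a', href=re.compile('baidu.com')):
        print(item.getText().encode('gb18030'))   # encode for the Windows console, as above
        print(item['href'])

if __name__ == '__main__':
    GetListSimple(u'正则表达式')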
3. Screenshot of the output:
Related reading:
1. bs4 official documentation.
2. lxml official site
3. BeautifulSoup depth-first traversal: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#descendants
4. BeautifulSoup breadth-first traversal: How to do a Breadth First Search easily with beautiful soup? (a small sketch of both traversal orders follows this list)
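For items 3 and 4, here is a rough sketch of the two traversal orders (the tiny HTML snippet is made up for illustration, and 'html.parser' is chosen only to keep the output predictable):

# encoding: utf-8
# Depth-first vs. breadth-first traversal of a BeautifulSoup tree (illustrative sketch).
from collections import deque
from bs4 import BeautifulSoup
from bs4.element import Tag

html = u'<div><p>a<b>b</b></p><span>c</span></div>'   # made-up example document
soup = BeautifulSoup(html, 'html.parser')

# Depth-first: .descendants yields every node in document order (pre-order DFS).
for node in soup.descendants:
    if isinstance(node, Tag):                # skip plain NavigableString text nodes
        print(node.name)                     # div, p, b, span

# Breadth-first: visit each level before going deeper, using a queue over .children.
queue = deque(soup.children)                 # start below the document root
while queue:
    node = queue.popleft()
    if isinstance(node, Tag):
        print(node.name)                     # div, p, span, b
        queue.extend(node.children)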
*** walker * 2014-11-26 ***
This article is reposted from walker snapshot's 51CTO blog. Original link: http://blog.51cto.com/walkerqt/1582703. For reprints, please contact the original author.