from
BeautifulSoup
import
BeautifulSoup
import
re
# Sample HTML markup, kept as separate fragments. Neither <p> element nor
# the <body> element is explicitly closed in this input.
doc = [
    '<html><head><title>Page title</title></head>',
    '<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',
    '<p id="secondpara" align="blah">This is paragraph <b>two</b>.',
    '</html>',
]

# Join the fragments into a single string and parse it into a tree.
soup = BeautifulSoup(''.join(doc))

# Show the parsed tree in its prettified form. The single parenthesized
# argument prints the same text under the Python 2 print statement.
print(soup.prettify())
|
运行结果为:
# Name of the first top-level node of the parsed document.
print(soup.contents[0].name)

# Name of the first child of that node.
print(soup.contents[0].contents[0].name)

# Name of every direct child of the first top-level node. The child list is
# iterated directly rather than via range(len(...)) plus indexing, so the
# node is looked up once instead of on every pass through the loop.
for child in soup.contents[0].contents:
    print(child.name)
|
# Interactive-session style examples. Each bare expression is followed by a
# comment holding the value recorded for it in the original text; run as a
# script, these expressions are evaluated but nothing is printed.

# Reach the <title> element by chaining tag names as attributes.
titleTag = soup.html.head.title
titleTag
# <title>Page title</title>

titleTag.string
# u'Page title'

# Calling the soup object with a tag name yields a result that supports
# len() and indexing.
len(soup('p'))
# 2

# Keyword arguments filter on attribute values.
soup.findAll('p', align="center")
# [<p id="firstpara" align="center">This is paragraph <b>one</b>. </p>]

soup.find('p', align="center")
# <p id="firstpara" align="center">This is paragraph <b>one</b>. </p>

# Attribute values of a matched tag are read with subscript syntax.
soup('p', align="center")[0]['id']
# u'firstpara'

# A compiled regular expression can be given in place of a literal value.
soup.find('p', align=re.compile('^b.*'))['id']
# u'secondpara'

soup.find('p').b.string
# u'one'

soup('p')[1].b.string
# u'two'
|
==============================================================================
本文转自被遗忘的博客园博客,原文链接:http://www.cnblogs.com/rollenholt/archive/2011/12/01/2271298.html,如需转载请自行联系原作者