环境:python2.7
安装lxml模块
pip install lxml
例子:
from lxml import etree
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)  #html是一个Element对象,直接打印只会显示它的内存地址
result = etree.tostring(html)  #读出来源码,并且自动补全,如输出中的<html>、<body>标签以及缺失的</li>
print (result)
输出:
<html>
<body>
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</body>
</html>
#读取文件里的内容
from lxml import etree
html = etree.parse('hello.html')
result = etree.tostring(html, pretty_print=True)
print (result)
获取li标签里的东西
html = etree.parse('hello.html')
print type(html)
result = html.xpath('//li')
print result
print len(result)
print type(result)
print type(result[0])
参考文章:http://cuiqingcai.com/2621.html
说明:此篇博客仅仅是为了自己学习lxml模块,故没好好写,下面是我微信二维码
本文转自 天道酬勤VIP 51CTO博客,原文链接:http://blog.51cto.com/tdcqvip/1976612