基于lxml.etree实现xpath查找HTML元素
#实践环境
WIN 10
Python 3.6.5
lxml-4.6.2-cp36-cp36m-win_amd64.whl
#实践代码
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Demonstrate locating HTML elements with XPath through lxml.etree."""

from lxml import etree

# Sample document: a small expense table with a header row, three data rows
# and a totals row.
html_str = '''<html>
  <body>
    <table width="400" border="1">
      <tr>
        <th align="left">消费项目....</th>
        <th align="right">一月</th>
        <th align="right">二月</th>
      </tr>
      <tr>
        <td align="left">衣服</td>
        <td align="right">$241.10</td>
        <td align="right">$50.20</td>
      </tr>
      <tr>
        <td align="left">化妆品</td>
        <td align="right">$30.00</td>
        <td align="right">$44.45</td>
      </tr>
      <tr>
        <td align="left">食物</td>
        <td align="right">$730.40</td>
        <td align="right">$650.00</td>
      </tr>
      <tr>
        <th align="left">总计</th>
        <th align="right">$1001.50</th>
        <th align="right">$744.65</th>
      </tr>
    </table>
  </body>
</html>
'''

# Parse the HTML string; etree.HTML returns the root <html> element.
root_node = etree.HTML(html_str)
print('根节节点名称为:%s' % root_node.tag)  # the tag printed is: html

# Look up the root node. xpath() returns a list, so this prints something like
# [<Element html at 0x17245dc8508>] (the address differs on every run).
print(root_node.xpath('/html'))

# All <td> elements under the second <tr> of the table.
# XPath positions are 1-based, so tr[2] is the first data row.
td_elements = root_node.xpath("//table/tr[2]/td")
for element in td_elements:
    print(element.tag, element.text)
# Loop output:
# td 衣服
# td $241.10
# td $50.20

# Print the whole document. tostring() returns bytes, so decode it for display.
print(etree.tostring(root_node, encoding='utf-8').decode('utf-8'))

# xpath() always returns a list; take the single matching <tr> element.
second_tr = root_node.xpath('//table/tr[2]')[0]

# Print only that node. The whitespace in the output mirrors the source markup.
print(etree.tostring(second_tr, encoding='utf-8').decode('utf-8'))
# <tr>
#   <td align="left">衣服</td>
#   <td align="right">$241.10</td>
#   <td align="right">$50.20</td>
# </tr>

# NOTE: etree.tostring() returns a bytes object. Without an explicit encoding
# it serializes as ASCII, so non-ASCII text is escaped as numeric character
# references instead of being emitted as-is.
print(etree.tostring(second_tr))
# b'<tr>\n ... <td align="left">&#34915;&#26381;</td>\n ... </tr>\n ...'

# Decoding those ASCII bytes would keep the character references, so ask for
# a str directly with encoding='unicode' to get readable text.
print(etree.tostring(second_tr, encoding='unicode'))
# <tr>
#   <td align="left">衣服</td>
#   <td align="right">$241.10</td>
#   <td align="right">$50.20</td>
# </tr>
参考链接
https://lxml.de/tutorial.html#the-element-class
https://lxml.de/tutorial.html#the-xml-function