文档:
https://lxml.de/lxmlhtml.html#cleaning-up-html
代码示例
# -*- coding: utf-8 -*-
# Demo: strip unneeded tags and attributes from a news HTML fragment with
# lxml's Cleaner.
#
# NOTE(review): starting with lxml 5.2 the cleaner code was moved into the
# separate ``lxml_html_clean`` project; on those versions this import only
# works when that package is installed as well -- confirm against the lxml
# version in use.
from lxml.html.clean import Cleaner

html = """
<p cms-style="font-L">
    <strong>铁打的腾讯</strong>
    <a href="//n.sinaimg.cn/tech/crawl/115/w550h365/20200326/963a-irkazzv3237667.jpg" class="keyword f_st" target="_blank">
        <img src="//n.sinaimg.cn/tech/crawl/115/w550h365/20200326/963a-irkazzv3237667.jpg" alt="">
    </a>
</p>
"""

# When storing news articles most attributes are not worth keeping (they only
# waste disk space), so whitelist nothing but ``src`` -- the one attribute
# the image tags need.
safe_attrs = frozenset(['src'])

# The <a> wrapper is not wanted either: remove the tag itself but keep the
# content nested inside it.
remove_tags = frozenset(['a'])

cleaner = Cleaner(safe_attrs=safe_attrs, remove_tags=remove_tags)
cleaned_html = cleaner.clean_html(html)
print(cleaned_html)

# Expected output (exact whitespace may differ from what is shown here):
'''
<p>
    <strong>铁打的腾讯</strong>
    <img src="//n.sinaimg.cn/tech/crawl/115/w550h365/20200326/963a-irkazzv3237667.jpg">
</p>
'''