browser.py
def open_in_browser(html):
    """
    Open the HTML document in a web browser, saving it
    to a temporary file to open it.  Note that this
    does not delete the file after use.  This is mainly
    meant for debugging.
    """
    import os
    import webbrowser
    import tempfile

    # create a temporary file for the HTML
    handle, fn = tempfile.mkstemp(suffix=".html")
    # open it for writing
    f = os.fdopen(handle, "wb")
    # write the HTML text
    try:
        f.write(b"<meta charset='UTF-8' />")
        f.write(html.encode("utf-8"))
    finally:
        # we leak the file itself here, but we should at least close it
        f.close()
    # build a file:// URL for the temporary file
    url = "file://" + fn.replace(os.path.sep, "/")
    # ask the default browser to open it
    webbrowser.open(url)
    return url
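A minimal usage sketch, assuming the function is imported from this module; the sample markup and the printed path are invented for illustration:

# a usage sketch; the sample markup is invented for illustration
if __name__ == "__main__":
    url = open_in_browser("<h1>Debug page</h1><p>Hello, readability!</p>")
    # the temporary file is deliberately not deleted afterwards
    print("opened", url)  # e.g. file:///tmp/tmpab12cd34.html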
cleaner.py
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re
from lxml.html.clean import Cleaner

# attributes considered harmful
bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
# text wrapped in single quotes
single_quoted = "'[^']+'"
# text wrapped in double quotes
double_quoted = '"[^"]+"'
# anything that is neither whitespace nor a tag-structure character
non_space = "[^ \"'>]+"
# matches a tag that carries one of the bad attributes
htmlstrip = re.compile(
    "<"  # open
    "([^>]+) "  # prefix
    "(?:%s) *" % ("|".join(bad_attrs),)  # undesirable attributes
    + "= *(?:%s|%s|%s)"  # value
    % (non_space, single_quoted, double_quoted)
    + "([^>]*)"  # postfix
    ">",  # end
    re.I,
)


def clean_attributes(html):
    # while a bad attribute is still present
    while htmlstrip.search(html):
        # strip it, keeping the prefix and postfix groups
        html = htmlstrip.sub("<\\1\\2>", html)
    # repeat until no bad attribute is left
    return html


def normalize_spaces(s):
    # return the empty string if `s` is empty
    if not s:
        return ""
    # collapse runs of whitespace (`\s+`) into single spaces (`\x20`)
    return " ".join(s.split())


# build a tag sanitizer from lxml's `Cleaner`
html_cleaner = Cleaner(
    # remove `<script>` tags
    scripts=True,
    # remove `onXXX` attributes and javascript: URLs
    javascript=True,
    # remove comment nodes
    comments=True,
    # remove `<style>` tags
    style=True,
    # remove `<link>` tags
    links=True,
    # keep `<meta>` tags
    meta=False,
    # do not add `nofollow` attributes
    add_nofollow=False,
    # keep the `<html> <head> <title>` page structure
    page_structure=False,
    # remove processing-instruction nodes
    processing_instructions=True,
    # keep `<embed>` tags
    embedded=False,
    # keep `<iframe>` tags
    frames=False,
    # keep `<form>` tags and form controls
    forms=False,
    # keep 'blink' and 'marquee' tags
    annoying_tags=False,
    # no custom list of tags to remove
    remove_tags=None,
    # keep unknown tags
    remove_unknown_tags=False,
    # keep unknown attributes
    safe_attrs_only=False,
)
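To make the stripping loop concrete, here is a usage sketch; the sample markup is invented, and the exact serialization returned by `clean_html` may vary slightly across lxml versions:

# a usage sketch; the sample markup is invented for illustration
dirty = '<table width="100" style="color: red" cellpadding="3"><tr><td>hi</td></tr></table>'
print(clean_attributes(dirty))
# one pass removes `style`, the next removes `width`:
# -> <table cellpadding="3"><tr><td>hi</td></tr></table>

print(normalize_spaces("  hello \t world\n"))  # -> 'hello world'

print(html_cleaner.clean_html("<p>text<script>alert(1)</script></p>"))
# -> something like '<p>text</p>'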
debug.py
import re

# FIXME: use with caution, can leak memory
uids = {}
uids_document = None


# build a short description of a node
def describe_node(node):
    global uids
    if node is None:
        return ""
    # if the node has no tag name (e.g. a comment),
    # return a placeholder instead
    if not hasattr(node, "tag"):
        return "[%s]" % type(node)
    name = node.tag
    # append the node's id or class names, in CSS-selector form
    if node.get("id", ""):
        name += "#" + node.get("id")
    if node.get("class", "").strip():
        name += "." + ".".join(node.get("class").split())
    # if the node is a DIV that has an id or a class,
    # drop the "div" part from the description
    if name[:4] in ["div#", "div."]:
        name = name[3:]
    # for these four common tag names
    if name in ["tr", "td", "div", "p"]:
        # assign the node an auto-incrementing UID and cache it
        uid = uids.get(node)
        if uid is None:
            uid = uids[node] = len(uids) + 1
        # append the UID to the description
        name += "{%02d}" % uid
    return name


# describe a node together with a given number of ancestors
def describe(node, depth=1):
    global uids, uids_document
    # if `uids_document` is not this node's root,
    # reset both it and the `uids` cache
    doc = node.getroottree().getroot()
    if doc != uids_document:
        uids = {}
        uids_document = doc

    # return repr(NodeRepr(node))
    parent = ""
    # if there is depth left and the node has a parent
    if depth and node.getparent() is not None:
        # recursively describe the parent
        parent = describe(node.getparent(), depth=depth - 1) + ">"
    # join the parent description and this node's
    return parent + describe_node(node)


RE_COLLAPSE_WHITESPACES = re.compile(r"\s+", re.U)


# get a short preview of a node's text content
def text_content(elem, length=40):
    # collapse whitespace and remove all \r
    content = RE_COLLAPSE_WHITESPACES.sub(" ", elem.text_content().replace("\r", ""))
    # if the content is shorter than the limit, return it as-is
    if len(content) < length:
        return content
    # otherwise truncate it and append an ellipsis
    return content[:length] + "..."
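A usage sketch showing the selector-like descriptions; the fragment below is made up, and the exact UID numbering depends on what was described before:

# a usage sketch; the fragment is invented for illustration
from lxml.html import fragment_fromstring

frag = fragment_fromstring('<div id="main"><p class="intro">Hello,\r\n  world</p></div>')
p = frag.find("p")
print(describe(p))      # -> '#main>p.intro'  (div#main collapses to #main)
print(text_content(p))  # -> 'Hello, world'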
encoding.py
import re

try:
    import cchardet as chardet
except ImportError:
    import chardet
import sys

# match the three places an encoding may be declared:
# `<meta charset>`, `<meta content>` and `<?xml ?>`
RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

CHARSETS = {
    "big5": "big5hkscs",
    "gb2312": "gb18030",
    "ascii": "utf-8",
    "maccyrillic": "cp1251",
    "win1251": "cp1251",
    "win-1251": "cp1251",
    "windows-1251": "cp1251",
}


# replace an encoding with its superset, via table lookup
def fix_charset(encoding):
    """Overrides encoding when charset declaration
    or charset determination is a subset of a larger
    charset.  Created because of issues with Chinese websites"""
    encoding = encoding.lower()
    return CHARSETS.get(encoding, encoding)


def get_encoding(page):
    # Regex for XML and HTML Meta charset declaration
    # collect every declared encoding
    declared_encodings = (
        RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page)
    )

    # Try any declared encodings
    for declared_encoding in declared_encodings:
        try:
            # Python3 only
            # on Python 3, turn the byte string into a str
            if sys.version_info[0] == 3:
                # declared_encoding will actually be bytes but .decode() only
                # accepts `str` type. Decode blindly with ascii because no one should
                # ever use non-ascii characters in the name of an encoding.
                declared_encoding = declared_encoding.decode("ascii", "replace")

            encoding = fix_charset(declared_encoding)

            # Now let's decode the page
            page.decode(encoding)
            # It worked!
            return encoding
        except UnicodeDecodeError:
            pass

    # Fallback to chardet if declared encodings fail
    # Remove all HTML tags, and leave only text for chardet
    text = re.sub(br"(\s*</?[^>]*>)+\s*", b" ", page).strip()
    # with fewer than 10 bytes of text there is nothing to guess from,
    # so fall back to the default UTF-8
    enc = "utf-8"
    if len(text) < 10:
        return enc  # can't guess
    # guess the encoding
    res = chardet.detect(text)
    # if the guess failed, fall back to UTF-8
    enc = res["encoding"] or "utf-8"
    # print '->', enc, "%.2f" % res['confidence']
    # normalize the guessed name
    enc = fix_charset(enc)
    return enc
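A usage sketch of the whole pipeline; the sample pages are invented, and note that get_encoding() expects raw bytes, not a decoded string:

# a usage sketch; the sample pages are invented for illustration
page = '<meta charset="gb2312"><p>你好,世界</p>'.encode("gb18030")
print(get_encoding(page))  # -> 'gb18030' (the declared gb2312 is widened)

# with no declaration and almost no text, the utf-8 fallback kicks in
print(get_encoding(b"<p></p>"))  # -> 'utf-8'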