from lxml.html import tostring
import lxml.html
import re

from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
from .compat import str_

utf8_parser = lxml.html.HTMLParser(encoding="utf-8")


def build_doc(page):
    """Parse HTML text into an lxml document tree.

    :param page: the page, either as text (``str_``) or as a byte string.
    :returns: a ``(doc, encoding)`` tuple. ``encoding`` is ``None`` when
        ``page`` was already text; otherwise it is the value returned by
        ``get_encoding(page)``, falling back to ``"utf-8"``.
    """
    if isinstance(page, str_):
        # Already text: keep it as-is, no encoding detection needed.
        encoding = None
        decoded_page = page
    else:
        # Byte string: detect the encoding (default UTF-8) and decode.
        encoding = get_encoding(page) or "utf-8"
        decoded_page = page.decode(encoding, "replace")

    # XXX: we have to do .decode and .encode even for utf-8 pages to remove
    # bad characters
    doc = lxml.html.document_fromstring(
        decoded_page.encode("utf-8", "replace"), parser=utf8_parser
    )
    return doc, encoding


def js_re(src, pattern, flags, repl):
    """Perform a JavaScript-style regex replacement on ``src``.

    ``$`` in ``repl`` is rewritten to a backslash so that JS group
    references such as ``$1`` become the ``\\1`` form that ``re`` expects.

    :param src: the string to search.
    :param pattern: regular expression source.
    :param flags: ``re`` flags used to compile ``pattern``.
    :param repl: replacement template using ``$N`` group references.
    :returns: ``src`` with every match of ``pattern`` replaced.
    """
    # Pattern.sub takes (repl, string); the arguments used to be swapped,
    # which ran the substitution on the template instead of on ``src``.
    return re.compile(pattern, flags).sub(repl.replace("$", "\\"), src)


# Unicode characters / literal HTML entities and their ASCII equivalents.
# The entity spellings ("&mdash;", "&ndash;", "&quot;") are kept escaped on
# purpose: they match entity text that survived un-decoded in a title.
_ENTITY_REPLACEMENTS = (
    (u"\u2014", "-"),
    (u"\u2013", "-"),
    (u"&mdash;", "-"),
    (u"&ndash;", "-"),
    (u"\u00A0", " "),
    (u"\u00AB", '"'),
    (u"\u00BB", '"'),
    (u"&quot;", '"'),
)


def normalize_entities(cur_title):
    """Replace some Unicode characters and entities with ASCII equivalents.

    :param cur_title: the text to normalize.
    :returns: the text with dashes, non-breaking spaces and guillemets /
        quote entities replaced as listed in ``_ENTITY_REPLACEMENTS``.
    """
    for needle, replacement in _ENTITY_REPLACEMENTS:
        cur_title = cur_title.replace(needle, replacement)
    return cur_title


def norm_title(title):
    """Normalize a title: whitespace first, then entities."""
    return normalize_entities(normalize_spaces(title))


def get_title(doc):
    """Return the normalized text of the document's ``<title>``.

    Returns the placeholder ``"[no-title]"`` when the element is missing
    or has no text.
    """
    title = doc.find(".//title")
    if title is None or not title.text:
        return "[no-title]"
    return norm_title(title.text)


def get_author(doc):
    """Return the ``content`` of ``<meta name='author'>``.

    Returns the placeholder ``"[no-author]"`` when the element is missing
    or its ``content`` attribute is absent or empty.
    """
    author = doc.find(".//meta[@name='author']")
    if author is None or not author.get("content"):
        return "[no-author]"
    return author.get("content")


def add_match(collection, text, orig):
    """Add ``text`` to ``collection`` if it is a plausible title candidate.

    After normalization the text must have at least two words and at
    least 15 characters, and (ignoring double quotes) must occur inside
    ``orig``.

    :param collection: a set-like object with an ``add`` method.
    :param text: candidate text taken from the document.
    :param orig: the normalized ``<title>`` text.
    """
    text = norm_title(text)
    if len(text.split()) < 2 or len(text) < 15:
        return
    if text.replace('"', "") in orig.replace('"', ""):
        collection.add(text)


# CSS selectors for elements in the body that are likely to hold the title.
TITLE_CSS_HEURISTICS = [
    "#title",
    "#head",
    "#heading",
    ".pageTitle",
    ".news_title",
    ".title",
    ".head",
    ".heading",
    ".contentheading",
    ".small_header_red",
]


def _add_element_matches(candidates, elem, orig):
    """Offer both the direct text and the full text content of ``elem``."""
    if elem.text:
        add_match(candidates, elem.text, orig)
    content = elem.text_content()
    if content:
        add_match(candidates, content, orig)


def shorten_title(doc):
    """Return a shortened title with site names and similar noise removed.

    Candidates are collected from ``<h1>``/``<h2>``/``<h3>`` elements and
    from the ``TITLE_CSS_HEURISTICS`` selectors; the longest candidate that
    occurs in the ``<title>`` wins. Without candidates the ``<title>`` is
    split on common delimiters instead.

    :returns: the shortened title; the full normalized ``<title>`` if the
        shortened form is not between 16 and 149 characters long; or
        ``""`` when the document has no usable ``<title>``.
    """
    title = doc.find(".//title")
    if title is None or not title.text:
        return ""

    title = orig = norm_title(title.text)

    candidates = set()

    for path in [".//h1", ".//h2", ".//h3"]:
        for elem in doc.iterfind(path):
            _add_element_matches(candidates, elem, orig)

    for selector in TITLE_CSS_HEURISTICS:
        for elem in doc.cssselect(selector):
            _add_element_matches(candidates, elem, orig)

    if candidates:
        # Take the longest candidate as the title.
        title = sorted(candidates, key=len)[-1]
    else:
        # Separate article title and site name, as in
        # `<title>text title | site name</title>`.
        for delimiter in [" | ", " - ", " :: ", " / "]:
            if delimiter in title:
                parts = orig.split(delimiter)
                # Decide whether the site name comes first or last: a
                # part with four or more words is taken as the title.
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            # No delimiter produced a title; fall back to splitting on
            # a colon.
            if ": " in title:
                parts = orig.split(": ")
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(": ", 1)[1]

    if not 15 < len(title) < 150:
        return orig

    return title


# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
def get_body(doc):
    """Return the cleaned HTML of the document body as text.

    ``<script>``, ``<link>`` and ``<style>`` elements are removed from
    ``doc`` in place, the body is serialized, and the result is passed
    through ``clean_attributes``.

    :param doc: an lxml HTML document; it is modified by this call.
    :returns: the cleaned HTML string.
    """
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()

    # Serialize <body>, or the whole document when the body has no child
    # elements. This spells out what `doc.body or doc` did through lxml's
    # deprecated element truth-testing (an element with no children is
    # falsy), so a body holding only text still falls back to `doc`.
    body = doc.body
    root = body if body is not None and len(body) else doc

    # FIXME: isn't better to use tounicode?
    raw_html = tostring(root)
    if isinstance(raw_html, bytes):
        raw_html = raw_html.decode()

    # Strip unwanted attributes from the serialized HTML.
    # NOTE: removing attributes on the tree before serializing would
    # arguably be cleaner than post-processing the string.
    # FIXME do we really need to try re-loading the cleaned markup?
    return clean_attributes(raw_html)