环境:Win10,Python 3.7
工具:PyCharm,Anaconda
主要包:BeautifulSoup,re
代码:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
from urllib import request
from bs4 import BeautifulSoup
# Page that is downloaded and inspected by this script.
REPORT_URL = "http://data.eastmoney.com/report/20181101/APPISWTR4upPASearchReport.html"

# Text ("earnings forecast") that marks the anchor paragraph.
FORECAST_KEYWORD = "盈利预测"

# The paragraph of interest is taken this many <p> tags after the anchor.
# NOTE(review): the offset of 2 is specific to this page's layout — confirm
# before reusing the script on other reports.
FORECAST_PARAGRAPH_OFFSET = 2

# Four CJK characters followed by a "20xx?-20xx?" span, where "?" is one CJK
# character (presumably a year-range phrase).
YEAR_RANGE_PATTERN = re.compile(
    r'[\u4e00-\u9fa5]{4}20[0-9]{2}[\u4e00-\u9fa5]-20[0-9]{2}[\u4e00-\u9fa5]'
)
# Everything from "EPS为" up to the last "元" on the same line (greedy).
EPS_PATTERN = re.compile(r'EPS为.*元')
# Four CJK characters, a curly-quoted span, then two CJK characters
# (presumably the rating sentence — verify against real report text).
QUOTED_PATTERN = re.compile(r'([\u4e00-\u9fa5]{4}“).*”[\u4e00-\u9fa5]{2}')


def print_meta_tags(soup):
    """Print a running number and selected attributes of every <meta> tag.

    For each tag the 1-based counter is printed first, followed by the
    ``name``, ``http-equiv`` and ``content`` attributes when present.
    """
    labelled_attributes = (
        ("name", "name:"),
        ("http-equiv", "httpEquiv:"),
        ("content", "content:"),
    )
    for number, tag in enumerate(soup.find_all("meta"), start=1):
        print(number)
        attrs = tag.attrs
        for key, label in labelled_attributes:
            if key in attrs:
                print(label, attrs[key])


def find_forecast_paragraph(paragraphs, keyword=FORECAST_KEYWORD,
                            offset=FORECAST_PARAGRAPH_OFFSET):
    """Return the paragraph located ``offset`` places after the anchor.

    The anchor is the first element of ``paragraphs`` whose first child
    (``.contents[0]``), converted with ``str()``, contains ``keyword``.

    Args:
        paragraphs: Indexable sequence of objects exposing ``.contents``.
        keyword: Text that identifies the anchor paragraph.
        offset: Distance from the anchor to the wanted paragraph.

    Returns:
        ``str()`` of the wanted paragraph, or ``""`` when no anchor exists
        or the wanted position lies beyond the end of ``paragraphs``.
    """
    for position, paragraph in enumerate(paragraphs):
        children = paragraph.contents
        if children and keyword in str(children[0]):
            target = position + offset
            # Guard against the anchor sitting too close to the end of the
            # page; the unguarded lookup used to raise IndexError here.
            if target < len(paragraphs):
                return str(paragraphs[target])
            return ""
    return ""


def extract_forecast(text):
    """Search ``text`` with the three module-level patterns.

    Returns:
        A 3-tuple ``(year_range, eps, quoted)`` holding the matched
        substring for each pattern, or ``None`` where a pattern did not
        match.
    """
    patterns = (YEAR_RANGE_PATTERN, EPS_PATTERN, QUOTED_PATTERN)
    matches = (pattern.search(text) for pattern in patterns)
    return tuple(found.group() if found else None for found in matches)


def main():
    """Download the report page and print its title, meta tags and forecast."""
    # The context manager closes the HTTP response once parsing has read it.
    with request.urlopen(REPORT_URL) as response:
        soup = BeautifulSoup(response, "html.parser")

    print("title")
    print(soup.title)

    print("meta")
    print_meta_tags(soup)

    print("p")
    forecast_html = find_forecast_paragraph(soup.find_all("p"))
    print(forecast_html)

    # A pattern that does not match is reported instead of crashing with
    # AttributeError on ``None.group()``.
    for value in extract_forecast(forecast_html):
        print(value if value is not None else "<no match>")


if __name__ == "__main__":
    main()