from re import findall

import requests
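# 1. Regular expression for a date (year-month-day).
# Note: "[1-12]" / "[1-31]" are character classes, not numeric ranges, so
# un-padded months and days are written as "[1-9]" instead.
# (1|2)[0-9][0-9][0-9]-(0[1-9]|1[012]|[1-9])-(0[1-9]|1[0-9]|2[0-9]|3[01]|[1-9])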
# 2.《红楼梦》五言和七言诗
# 《红楼梦》第一回的网页
#get()里面是第一回的网址
# Download chapter 1. The page is decoded as GBK straight from the raw
# response bytes; going through ``.text`` and re-encoding it as ISO-8859-1
# only works when requests happens to guess exactly that charset and fails
# or corrupts the text otherwise.
response = requests.get(
    'https://www.xyyuedu.com/gdmz/sidamingzhu/hlmeng/21651.html',
    timeout=30,  # do not hang forever on an unresponsive server
)
response.raise_for_status()  # do not parse an HTTP error page as the novel
string = response.content.decode('gbk')  # string is the page content

# Inner HTML of every <p>...</p> element; (?s) lets "." span line breaks.
strings = findall(r'(?s)<p>(.*?)</p>', string)

# Keep only runs of Chinese characters together with the punctuation that
# directly follows them; markup, whitespace and Latin text are dropped.
# Each mark is accepted in its ASCII and its full-width form (U+FF1F question
# mark, U+FF0C comma, U+FF1A colon, U+FF1B semicolon), because Chinese prose
# normally uses the full-width code points.
# The paragraphs are joined with a newline so that a fragment can never span
# two paragraphs.
str1 = findall(
    r'[\u4e00-\u9fa5]+[?\uff1f。,\uff0c:\uff1a\-;\uff1b]*',
    '\n'.join(strings),
)

# One continuous string of text + punctuation. It cannot contain whitespace
# (the pattern above never matches any), so no further stripping is needed.
str2 = ''.join(str1)
print(str2)
# 五言诗
# Pattern for a five-character quatrain: four consecutive lines of exactly
# five Chinese characters, each followed by one punctuation mark.
#   line 1      -> comma or question mark
#   lines 2, 3  -> comma, question mark or full stop
#   line 4      -> question mark or full stop
# Comma and question mark are accepted both as ASCII and as the full-width
# code points U+FF0C / U+FF1F, so poems punctuated with full-width marks
# are found as well.
five = (
    r'[\u4e00-\u9fa5]{5}[,\uff0c?\uff1f]'
    r'[\u4e00-\u9fa5]{5}[,\uff0c?\uff1f。]'
    r'[\u4e00-\u9fa5]{5}[,\uff0c?\uff1f。]'
    r'[\u4e00-\u9fa5]{5}[?\uff1f。]'
)
# Every non-overlapping match in the cleaned chapter text.
fiveCharacter = findall(five, str2)
print(fiveCharacter)
# 七言诗
# Pattern for a seven-character quatrain: four consecutive lines of exactly
# seven Chinese characters, each followed by one punctuation mark.
#   line 1      -> comma or question mark
#   lines 2, 3  -> comma, question mark or full stop
#   line 4      -> question mark or full stop
# Comma and question mark are accepted both as ASCII and as the full-width
# code points U+FF0C / U+FF1F, so poems punctuated with full-width marks
# are found as well.
seven = (
    r'[\u4e00-\u9fa5]{7}[,\uff0c?\uff1f]'
    r'[\u4e00-\u9fa5]{7}[,\uff0c?\uff1f。]'
    r'[\u4e00-\u9fa5]{7}[,\uff0c?\uff1f。]'
    r'[\u4e00-\u9fa5]{7}[?\uff1f。]'
)
# Every non-overlapping match in the cleaned chapter text.
sevenCharacter = findall(seven, str2)
print(sevenCharacter)