import random
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
def request_html(url: str) -> urllib.request.Request:
    """Build a request for *url* that carries a desktop-browser User-Agent.

    No network traffic happens here; the returned object is meant to be
    passed to ``urllib.request.urlopen``.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``urllib.request.Request`` for *url* with the User-Agent header set.
    """
    # Send a browser-like User-Agent instead of urllib's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    }
    return urllib.request.Request(url, headers=headers)
def parse_html(html, f) -> None:
    """Download every chapter linked from a book index page and write it to *f*.

    The index markup is searched for chapter links with the CSS selector
    ``.book-mulu > ul > li > a``. For each link the chapter text is fetched
    with ``get_text`` and written to *f* as the title, a newline, then the
    text. Progress messages are printed before and after each chapter.

    Args:
        html: Markup of the book's table-of-contents page.
        f: Writable text file object that receives the chapters.
    """
    base_url = 'http://www.shicimingju.com'
    soup = BeautifulSoup(html, 'lxml')
    for link in soup.select('.book-mulu > ul > li > a'):
        # urljoin also copes with hrefs that are already absolute or that
        # lack a leading slash, which plain concatenation would mangle.
        href = urllib.parse.urljoin(base_url, link['href'])
        title = link.text
        print('正在下载:-**--%s--**-......' % title)
        text = get_text(href)
        f.write(title + '\n' + text)
        print('结束下载:-**--%s--**-' % title)
        # Random pause of up to one second between chapter requests.
        time.sleep(random.uniform(0, 1))
def get_text(href):
    """Fetch a chapter page and return the text of its content container.

    The page is requested via ``request_html``, decoded as UTF-8 and parsed
    with the ``lxml`` parser; the text of the first ``<div>`` whose class is
    ``chapter_content`` is returned.

    Args:
        href: Absolute URL of the chapter page.

    Returns:
        The text content of the ``chapter_content`` div.

    Note:
        If the page has no such div, ``soup.find`` yields ``None`` and the
        attribute access below fails with ``AttributeError``.
    """
    request = request_html(href)
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf8')
    soup = BeautifulSoup(content, 'lxml')
    chapter = soup.find('div', class_='chapter_content')
    return chapter.text
def run():
    """Download the whole book and save it to ``两晋演义.txt``.

    Fetches the book's index page, then hands the markup and the open
    output file to ``parse_html``, which downloads and writes each chapter.
    The output file is opened in write mode (truncating any existing file)
    with UTF-8 encoding.
    """
    url = 'http://www.shicimingju.com/book/liangjinyanyi.html'
    # `with` guarantees the file is flushed and closed even when a download
    # or parse step raises part-way through.
    with open('两晋演义.txt', 'w', encoding='utf8') as f:
        request = request_html(url)
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf8')
        parse_html(html, f)
# Start the download only when executed as a script, not when imported.
if __name__ == '__main__':
    run()