
Crawl every novel chapter from the target site

The test was a success!!

Source code attached:
# -*- coding: utf-8 -*-
from urllib.parse import urljoin, urlparse
from lxml import etree
from bs4 import BeautifulSoup
import requests
import sys
import io
import os
# Work around garbled console output:
# fix print()'s encoding problem by re-wrapping stdout/stderr
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gbk', errors='replace')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='gbk', errors='replace')
def extract_html_links_bs(url):
    """Fetch a page and return every link on it that looks like an HTML page."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Mobile Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
    except Exception as e:
        print(f"Request failed: {e}")
        return []

    soup = BeautifulSoup(response.text, 'lxml')
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        clean_href = href.split('#')[0]  # strip the anchor fragment
        absolute_url = urljoin(url, clean_href)
        parsed = urlparse(absolute_url)
        path = parsed.path
        # Keep only URLs that look like HTML pages
        if (path.lower().endswith(('.html', '.htm')) or
                path.endswith('/') or
                '.' not in path.split('/')[-1]):
            # Skip common static assets (optional)
            if any(absolute_url.lower().endswith(ext) for ext in ['.jpg', '.png', '.pdf', '.zip']):
                continue
            links.append(absolute_url)
    return list(set(links))  # deduplicate
def webneirong(link):
    """Fetch one chapter page and return (chapter title, list of paragraph texts)."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Mobile Safari/537.36"
    }
    resp = requests.get(link, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding
    p = resp.text.replace('\ufeff', '')  # drop the BOM if present

    e = etree.HTML(p)
    # Pull the description and the title via XPath (the description is not used further)
    info = e.xpath('//div[@class="discription"]/text()')
    title = e.xpath('//div[@id="bookname"]/text()')

    # Use the first title node as the file name; fall back if the XPath matched nothing
    file_name = title[0].strip() if title else "untitled"

    # Parse the same HTML with BeautifulSoup and grab every <p> tag
    soup = BeautifulSoup(p, 'lxml')
    p_tags = soup.find_all('p')

    # Collect the text of each <p> tag (strip=True trims leading/trailing whitespace)
    l = []
    for p_tag in p_tags:
        text = p_tag.get_text(strip=True)
        l.append(str(text))
    return file_name, l
def baocunwenjian(file_name, my_list):
    """Append the paragraph list to <file_name>.txt on the desktop, one line per paragraph."""
    desktop = os.path.join(os.path.expanduser("~"), "Desktop")
    filename = file_name + ".txt"
    filepath = os.path.join(desktop, filename)
    # Make sure the directory exists (the desktop normally does, but this keeps it generic)
    os.makedirs(desktop, exist_ok=True)
    with open(filepath, 'a', encoding='utf-8') as file:
        for item in my_list:
            file.write(item + '\n')
    print(f"✅ File saved to: {filepath}")
if __name__ == "__main__":
    # The start URL was omitted in the post; replace this placeholder with the
    # novel site's catalog page before running
    start_url = "https://example.com/"
    links = extract_html_links_bs(start_url)
    for link in links:
        # Site-specific filter: chapter URLs contain '/book/' and are exactly 50 characters long
        if '/book/' in link and len(link) == 50:
            ceshi = webneirong(link)
            baocunwenjian(ceshi[0], ceshi[1])
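
To sanity-check the pipeline on a single chapter page before crawling the whole site, a minimal sketch like the one below works. It assumes the functions above are already defined in the same file; the chapter URL is a placeholder I made up, not a real link from the site, and the sleep call is just a suggested way to keep a full crawl polite.

import time

chapter_url = "https://example.com/book/xxxx.html"  # hypothetical URL, replace with a real chapter page
file_name, paragraphs = webneirong(chapter_url)     # fetch the title and paragraph list
print(file_name, len(paragraphs))                   # quick look at what was extracted
baocunwenjian(file_name, paragraphs)                # write it to the desktop .txt file
time.sleep(1)  # in a full crawl, pause like this between requests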