import requests
from lxml import etree
import os
import time
import random
from PIL import Image
import re
from PyPDF2 import PdfWriter, PdfReader
# 代码效果参考:https://www.tvdy.cn/sitemap/post.xml
# 代码效果参考:https://www.nbhbjx.cn/sitemap/post.xml
# 代码效果参考:https://www.vipwb.com/sitemap/post.xml
# 代码效果参考:https://www.257342.com/sitemap/post.xml
# 代码效果参考:https://www.uagu.cn/sitemap/post.xml
# 代码效果参考:https://www.szworkshop.com.cn/sitemap/post.xml
# 代码效果参考:http://www.603393.com/sitemap/post.xml
# 代码效果参考:https://www.weibow.com/sitemap/post.xml
# 代码效果参考:https://www.xx-ph.com/sitemap/post.xml
# 代码效果参考:https://www.h3cw.com/sitemap/post.xml
# 代码效果参考:http://www.mwgw.cn/sitemap/post.xml
# 代码效果参考:http://www.intpipe.com/sitemap/post.xml
def get_zj_url(url):
    """Fetch a comic's table-of-contents page and collect its chapter links.

    Args:
        url: URL of the comic's index ("comic") page.

    Returns:
        Tuple ``(zj_url, zj_title, dir2)`` where ``zj_url`` is a list of
        relative chapter hrefs, ``zj_title`` the matching chapter titles,
        and ``dir2`` the per-comic output directory created under the
        module-level ``dir1``. Returns None when the request fails or the
        server answers with a non-200 status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53"
    }
    try:
        # timeout keeps a dead connection from hanging the program forever
        response = requests.get(url=url, headers=headers, timeout=30)
        if response.status_code == 200:
            html = response.text
            all_doc_page = etree.HTML(html)
            # The page <title> doubles as the comic name / folder name.
            title = all_doc_page.xpath('//title/text()')[0]
            zj_url = all_doc_page.xpath('//*[@id="chapters_other_list"]/div/a/@href')
            zj_title = all_doc_page.xpath('//*[@id="chapters_other_list"]/div/a/div/span/text()')
            dir2 = os.path.join(dir1, title)
            os.makedirs(dir2, exist_ok=True)
            return zj_url, zj_title, dir2
    except Exception as e:
        print("【请求失败,请检查URL和网络环境!】")
        print(e)
def get_chapter(url, title, dir2):
    """Download one chapter's images and bundle them into a PDF.

    Args:
        url: Absolute URL of the chapter page.
        title: Chapter title; used for the image/PDF file names.
        dir2: Output directory for this comic.

    Retries the page request up to 5 times (5 s apart) before giving up.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53"
    }
    max_retries = 5
    count = 0  # current retry count
    while True:
        try:
            response = requests.get(url=url, headers=headers, timeout=30)
            # Raise on non-200 so bad statuses hit the retry path below
            # instead of spinning in an infinite loop.
            response.raise_for_status()
            html = response.text
            all_doc_page = etree.HTML(html)
            tp_url = all_doc_page.xpath('//ul[@class="comic-contain"]/div/amp-img/@data-src')
            # Strip characters that are illegal/awkward in file names.
            title = title.replace("\\", "").replace("?", "")
            download_images(tp_url, dir2, title)
            jpg2pdf(dir2, title)
            break
        except Exception as e:
            if count < max_retries:  # was `count < 1`, contradicting the documented 5 retries
                print("【请求失败,5秒钟后将重新尝试!】")
                print(e)
                count += 1
                time.sleep(5)  # wait 5 seconds before the next attempt
            else:
                print("【已达到最大重试次数,请检查URL和网络环境!】")
                print(e)
                break
def get_chapter_list(url_list, url_title, dir2):
    """Download every chapter in url_list, pausing 5 s between requests.

    Args:
        url_list: Relative chapter hrefs (each starting with '/').
        url_title: Chapter titles, parallel to url_list.
        dir2: Output directory for this comic.
    """
    site_root = 'https://cn.baozimh.com/'
    for idx, href in enumerate(url_list):
        # Drop the leading '/' from the href before joining with the root.
        chapter_url = site_root + href[1:]
        get_chapter(chapter_url, url_title[idx], dir2)
        time.sleep(5)
def main():
    """Entry point: prompt for a URL and dispatch to the matching download mode."""
    url = input("请输入链接:")
    creat_dir(dir1)
    global dir2
    # Only accept proper http(s) URLs.
    if url.startswith("http://") or url.startswith("https://"):
        if "comic" in url:
            # Index page: let the user pick how much to download.
            download_type = input("这是一个目录页,请选择下载类型(1-只下载最新,2-下载全部, 3-从某章节开始):")
            if download_type == "1":
                print("执行下载最新内容操作!")
                zj_url, zj_title, dir2 = get_zj_url(url)
                # Index 0 is treated as the newest chapter.
                get_chapter('https://cn.baozimh.com/' + zj_url[0], zj_title[0], dir2)
            elif download_type == "2":
                print("执行下载完整内容操作!")
                zj_url, zj_title, dir2 = get_zj_url(url)
                get_chapter_list(zj_url, zj_title, dir2)
            elif download_type == "3":
                # mouzj returns None when no chapter title matched the input;
                # guard so we don't crash unpacking it.
                result = mouzj(url)
                if result is not None:
                    zj_url, zj_title, dir2 = result
                    get_chapter_list(zj_url, zj_title, dir2)
            else:
                print("输入不正确,请重新运行程序!")
        elif "chapter" in url:
            print("这是一篇内容页!")
            # get_chapter requires a title and an output dir (the old call
            # passed only the URL and raised TypeError): derive the chapter
            # name from the URL slug and save into the base output directory.
            chapter_name = url.rstrip('/').rsplit('/', 1)[-1]
            get_chapter(url, chapter_name, dir1)
        else:
            print("这不是一个漫画网站!")
    else:
        print("输入不是合法的 URL 链接!")
def mouzj(url):
    """Prompt for a chapter marker and return chapters from that point on.

    Args:
        url: URL of the comic's index page.

    Returns:
        ``(zj_url, zj_title, dir2)`` sliced to begin at the first chapter
        whose title contains the entered text, or None when nothing matches.
    """
    zj_int = input("请输入章节:")
    zj_url, zj_title, dir2 = get_zj_url(url)
    element = f"{zj_int}"
    for i, s in enumerate(zj_title):
        if element in s:
            # Keep everything from the first matching chapter onward.
            # (The old code had an unreachable `break` after this return.)
            return zj_url[i:], zj_title[i:], dir2
    print(f"转置数组中不存在元素 {element}")
    return
# 下载图片并保存到指定文件夹中
def download_images(url_list, folder_name, chapter_name):
    """Download each image URL into folder_name as '<chapter_name>-<i>.jpg'.

    Args:
        url_list: Image URLs for one chapter, in page order.
        folder_name: Destination directory (created if missing).
        chapter_name: File-name prefix for the saved images.

    Each image is retried up to 3 times; a failure is reported but does not
    abort the remaining downloads.
    """
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    for i, url in enumerate(url_list):
        save_path = os.path.join(folder_name, chapter_name + '-' + str(i) + '.jpg')
        count = 0  # retry counter for this image
        while count < 3:  # at most 3 attempts
            try:
                # timeout keeps one dead connection from stalling the run
                res = requests.get(url, timeout=30)
                if res.status_code == 200:
                    with open(save_path, 'wb') as f:
                        f.write(res.content)
                    print('下载完成:', save_path)
                    break  # success: move on to the next image
                else:
                    count += 1  # bad status: retry
            except Exception as e:
                print(e)
                count += 1  # network error: retry
        # Random 0-2 s pause before the next request to avoid hammering the server.
        time.sleep(random.uniform(0, 2))
        if count == 3:
            print('下载失败:', url)
def creat_dir(dir_name):
os.makedirs(dir_name, exist_ok=True)
def jpg2pdf(images_folder, doc_id):
    """Merge all .jpg/.png files in images_folder into '<doc_id>.pdf'.

    Images are combined in natural (numeric-aware) filename order. On
    success the source images are deleted and a bookmark is added to the
    resulting PDF.

    Args:
        images_folder: Directory containing the downloaded page images.
        doc_id: Chapter title; used for the PDF name and its bookmark.
    """
    pdf_path = os.path.join(images_folder, f"{doc_id}.pdf")
    # Collect all image files and sort them so page 10 follows page 9.
    image_paths = [os.path.join(images_folder, f)
                   for f in os.listdir(images_folder)
                   if f.endswith((".jpg", ".png"))]
    image_paths = natural_sort(image_paths)
    if not image_paths:
        # No images downloaded: avoid IndexError on image_list[0] below.
        print(f"{doc_id} 【转换失败,请检查!】")
        return
    image_list = []
    for image_path in image_paths:
        img = Image.open(image_path)
        # PDF output cannot store palette/alpha modes directly.
        if img.mode != "RGB":
            img = img.convert("RGB")
        image_list.append(img)
    image_list[0].save(pdf_path, "PDF", resolution=100.0, save_all=True,
                       append_images=image_list[1:])
    # Release the file handles before deleting the source images.
    for img in image_list:
        img.close()
    if os.path.exists(pdf_path):
        print(f"{doc_id} 【转换为pdf成功!】")
        for image_path in image_paths:
            os.remove(image_path)
        print(f"{doc_id} 【文件夹图片删除!---添加书签】")
        add_bookmark_to_pdf(pdf_path, doc_id)
    else:
        print(f"{doc_id} 【转换失败,请检查!】")
def add_bookmark_to_pdf(pdf_file, bookmark_title):
    """Rewrite pdf_file in place with one outline entry pointing at page 1.

    Args:
        pdf_file: Path to the PDF to annotate (overwritten in place).
        bookmark_title: Text of the outline (bookmark) entry.
    """
    pdf_writer = PdfWriter()
    # Read and copy pages inside a context manager so the source handle is
    # closed before we overwrite the same path below (the old code leaked it).
    with open(pdf_file, 'rb') as in_pdf_file:
        pdf_reader = PdfReader(in_pdf_file)
        # Bookmark targets page index 0 (the first page).
        pdf_writer.add_outline_item(bookmark_title, 0)
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)
    # Write the annotated document back over the original file.
    with open(pdf_file, 'wb') as out_pdf_file:
        pdf_writer.write(out_pdf_file)
    print(f"{bookmark_title} 【添加书签成功!】")
def natural_sort(l):
    """Sort strings so embedded numbers compare numerically ("x2" < "x10").

    Args:
        l: Iterable of strings.

    Returns:
        A new list sorted case-insensitively with numeric runs compared
        as integers rather than character by character.
    """
    def _key(s):
        # Split on digit runs; convert those runs to ints for comparison.
        return [int(tok) if tok.isdigit() else tok.lower()
                for tok in re.split('([0-9]+)', s)]
    return sorted(l, key=_key)
# Base output directory for all downloads.
dir1 = 'output'

# Was `if name == "main":`, which raises NameError — the standard
# double-underscore guard is required for the script to run at all.
if __name__ == "__main__":
    main()