I had some free time recently, so I took this on as a little hands-on exercise.
Notes:

- The site may change over time, so this code is not guaranteed to keep working; it only illustrates the scraping approach.
- This is purely for personal study; please do not use it for commercial purposes.
Language: Python
Version: 3.4.3
Dependencies: BeautifulSoup and requests (both can be installed with pip, as shown below)
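For reference, both dependencies install in one command; note that the BeautifulSoup 4 package is published on PyPI as beautifulsoup4:

```
pip install beautifulsoup4 requests
```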
The code is fairly simple, so here it is:
HttpClient.py
```python
# -*- coding: utf-8 -*-
import requests


def make_request(url):
    print('make_request: ', url)
    r = requests.get(url, timeout=(30, 90))
    # if r.status_code == 200:
    print('content-type: ', r.headers['content-type'])
    print('encoding: ', r.encoding)
    print('apparent_encoding: ', r.apparent_encoding)
    return r
```
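As an aside, timeout=(30, 90) is requests' (connect timeout, read timeout) pair: up to 30 seconds to establish the connection and up to 90 seconds to wait for data while reading the response. Here is a tiny, hypothetical smoke test of the helper (example.com is just a placeholder URL, and the import assumes HttpClient.py is importable from the working directory; Kanunu8.py below pulls it in through a util package instead):

```python
# Hypothetical quick check of make_request; the URL is only a placeholder.
import HttpClient

resp = HttpClient.make_request('http://www.example.com/')
print(resp.status_code)    # 200 on success
print(len(resp.content))   # number of bytes downloaded
```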
Kanunu8.py
```python
# -*- coding: utf-8 -*-
import os
import sys
import re
import encodings

# Make the util package (a sibling folder of this script's directory) importable
sys.path.append("..")

# Work around garbled gb2312 pages by decoding them with the gb18030 superset
encodings.aliases.aliases['gb2312'] = 'gb18030'

from bs4 import BeautifulSoup
from _pyio import open
from util import *

book_url = ''
book_name = ''

# Author-profile links, which should be skipped
writer_link_pattern = re.compile(r'.*/writer/\d+\.html')
# I am on Windows, where these characters are illegal in file names and must be stripped
window_illegal_file_name_pattern = re.compile(r'[\\|/|:|\*|\?|"|<|>|\|]')


def find_tbody(tag):
    # Locate the <tbody> holding the chapter list (not used in the main flow below,
    # which filters tbody tags by their number of <tr> rows instead)
    if tag.name == 'tbody':
        if tag.find('tbody') is None and tag.find('strong').string == '正文':  # "main text"
            return True
        elif '发布时间' in tag.get_text():  # "publish time"
            return True
    return False


def strong_with_no_href(tag):
    return tag.name == 'strong' and tag.a is None and tag.font is not None


def find_title(tag):
    if tag.h1 is not None:
        return tag.h1.font.string
    elif tag.h2 is not None:
        return tag.h2.font.string
    else:
        return tag.find(strong_with_no_href).font.string


def make_soup(html):
    # , from_encoding='gb18030'
    soup = BeautifulSoup(html, "html.parser")
    print('original_encoding: ', soup.original_encoding,
          ', declared_html_encoding: ', soup.declared_html_encoding,
          ', from_encoding: ', soup.from_encoding)
    return soup


def get_legal_window_file_name(name):
    if name is None:
        return 'unknown'
    return window_illegal_file_name_pattern.sub('', name)


if __name__ == '__main__':
    book_url = input('请输入电子书URL:')  # prompt for the e-book URL
    # "Press any key to continue"
    # if input('请按任意键开始抓取...'):
    #     pass

    # Fetch the HTML content
    request = HttpClient.make_request(book_url)
    html = request.content
    soup = make_soup(html)

    # Grab the book title
    book_name = soup.find('title').string
    path = './' + get_legal_window_file_name(book_name) + '.txt'

    links = []
    # Collect the links of all chapters
    for tmp in soup.find_all('tbody'):
        if len(tmp.find_all('tr')) > 1:
            all_link = tmp.find_all('a')
            if all_link is not None:
                links.extend(all_link)

    if book_url.endswith('.html'):
        parent_url = book_url[0:book_url.rindex('/') + 1]
    else:
        parent_url = book_url

    with open(path, 'w', encoding="utf-8") as f:
        for link in links:
            # Author link; ignore it
            if writer_link_pattern.match(link['href']) is not None:
                continue
            print('\n', link.string)
            url = parent_url + link['href']
            print(url)
            response = HttpClient.make_request(url)
            chapter_soup = make_soup(response.content)
            chapter_name = find_title(chapter_soup)
            # Chapter title
            f.write('\n\n')
            f.write(chapter_name)
            f.write('\n\n')
            # Chapter body
            f.write(chapter_soup.find('p').get_text().replace('<br/>', ''))
            # for p in chapter_soup.find('p').contents:
            #     if p == '<br>':
            #         f.write('\n')
            #     elif p is NavigableString:
            #         f.write(p)
            #     elif p is Tag:
            #         f.write(p.string)
            f.flush()

    print('电子书已成功保存: ', path)  # e-book saved successfully
```
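One thing the listing itself does not show is where the two files live relative to each other. Judging from sys.path.append("..") combined with from util import *, the layout is presumably something like the tree below; everything other than the two .py file names is my guess, and util/__init__.py would have to expose the HttpClient module (for example via from . import HttpClient) for the star import to pick it up:

```
project/
├── util/
│   ├── __init__.py     # exposes HttpClient so that `from util import *` works
│   └── HttpClient.py
└── kanunu/
    └── Kanunu8.py      # run from this directory so that '..' resolves to project/
```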
Problems encountered:

- Different books (and even different chapters) use different title text, heading levels (h1, h2, ...) and tag structures, which is why find_title has to try several candidates;
- Encoding: the scraped pages came back as garbled text at first; the fix, aliasing gb2312 to gb18030, sits at the top of Kanunu8.py, and a short sketch of the idea follows after this list.

Presumably all of this is meant to make scraping harder; all one can do is analyze and work around each problem as it shows up.
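To make the encoding workaround concrete, here is a minimal standalone sketch of the idea (the sample character is made up for demonstration). The pages apparently declare gb2312, but can contain characters that only exist in the larger gb18030 superset, so pointing the name gb2312 at the gb18030 codec lets the decoder cope. The alias has to be installed before 'gb2312' is used for the first time, because Python caches codec lookups; that is why the assignment sits right after the imports in Kanunu8.py:

```python
# Minimal sketch of the gb2312 -> gb18030 workaround; the sample text is hypothetical.
import encodings

# Must happen before any decode that mentions 'gb2312'
encodings.aliases.aliases['gb2312'] = 'gb18030'

raw = '㐀'.encode('gb18030')   # a character outside the plain gb2312 repertoire
print(raw.decode('gb2312'))    # decodes cleanly, because the gb18030 codec handles it
```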
This article was reposted from breezy_yuan's 51CTO blog. Original link: http://blog.51cto.com/lbrant/1688440; please contact the original author before reprinting.