Crawling Sina news pages
items.py

import scrapy


class SinaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class SinanewsItem(scrapy.Item):
    # title and url of the top-level category
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()
    # title and url of the sub-category
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()
    # storage path for the sub-category directory
    subFilename = scrapy.Field()
    # child links found under the sub-category
    sonUrls = scrapy.Field()
    # article headline and body
    head = scrapy.Field()
    content = scrapy.Field()
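A scrapy Item is read and written like a dict, which is how the spider below fills these fields. A minimal sketch, assuming the project package is named Sina (as the commented-out pipeline entry in settings.py suggests):

# minimal usage sketch; 'Sina' is an assumed package name
from Sina.items import SinanewsItem

item = SinanewsItem()
item['parentTitle'] = u'新闻'                      # fields are set by key, like a dict
item['parentUrls'] = 'http://news.sina.com.cn/'
print(item['parentTitle'], item.get('subUrls'))    # unset fields read back as None via get()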
Spider script sina01.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import SinanewsItem
from scrapy_redis.spiders import RedisSpider


class Sina01Spider(RedisSpider):
    name = 'sina01'
    # the crawl is kicked off by pushing a start url to this redis key
    redis_key = "sinaspider:start_urls"

    # allowed_domains = ['sina.com']
    # start_urls = ['http://sina.com/']

    # dynamically define the domain scope of the crawl
    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(Sina01Spider, self).__init__(*args, **kwargs)

    def parse(self, response):
        items = []
        # urls and titles of all top-level categories
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()
        # urls and titles of all sub-categories
        subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()

        # walk over all top-level categories
        for i in range(0, len(parentTitle)):
            # walk over all sub-categories
            for j in range(0, len(subUrls)):
                item = SinanewsItem()
                # store the top-level category title and url
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]
                # check whether the sub-category url starts with the url of its
                # top-level category; True means it belongs to that category
                if_belong = subUrls[j].startswith(item['parentUrls'])
                if if_belong:
                    # store the sub-category url and title
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    items.append(item)

        # send a Request for every sub-category url; the Response, together with
        # the meta data, is handed to the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'],
                                 meta={'meta_1': item},
                                 callback=self.second_parse)

    # recursively request the returned sub-category urls
    def second_parse(self, response):
        # pull the meta data out of this Response
        meta_1 = response.meta['meta_1']
        # collect every child link on the sub-category page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for i in range(0, len(sonUrls)):
            # check whether the link starts with the top-level category url and
            # ends with .shtml; True means it belongs to this category
            if_belong = sonUrls[i].endswith('.shtml') and \
                sonUrls[i].startswith(meta_1['parentUrls'])
            # if it belongs to this category, copy the fields into one item so
            # they travel together
            if if_belong:
                item = SinanewsItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['sonUrls'] = sonUrls[i]
                items.append(item)

        # send a Request for every article url under the sub-category; the
        # Response plus the meta data is handed to the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'],
                                 meta={'meta_2': item},
                                 callback=self.detail_parse)

    # parse the article page: extract the headline and the body text
    def detail_parse(self, response):
        item = response.meta['meta_2']
        content = ""
        head = response.xpath('//h1[@id="main_title"]/text()').extract()
        content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
        # join the text of all <p> tags into one string
        for content_one in content_list:
            content += content_one
        item['head'] = head[0] if len(head) > 0 else "NULL"
        item['content'] = content
        yield item
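Because __init__ pops an optional domain keyword argument, the allowed domains can be supplied when a crawl worker is started, using Scrapy's -a spider-argument flag. A sketch, assuming the spider lives inside a Scrapy project; the worker then idles until a start url appears under the redis_key:

scrapy crawl sina01 -a domain=news.sina.com.cn,sina.com.cn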
Add to settings.py
# use the scrapy-redis dedup component instead of scrapy's default dedup
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scrapy-redis scheduler component instead of the default scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# allow pausing; the request records kept in redis are not lost
SCHEDULER_PERSIST = True
# default scrapy-redis request queue (ordered by priority)
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# queue: requests are handled first in, first out
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# stack: requests are handled first in, last out
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

# just store the items in redis; no custom pipelines file is needed
ITEM_PIPELINES = {
    # 'Sina.pipelines.SinaPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# LOG_LEVEL = 'DEBUG'

# Introduce an artificial delay to make use of parallelism, to speed up the
# crawl.
DOWNLOAD_DELAY = 1

# host IP of the redis database
REDIS_HOST = "localhost"
# port of the redis database
REDIS_PORT = 6379
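With RedisPipeline enabled, each crawled item is serialized to JSON and pushed onto a redis list, which by default is keyed '<spider name>:items', i.e. 'sina01:items' for this spider. A minimal sketch for reading the results back with redis-py, assuming those defaults and the host/port above:

import json
import redis

r = redis.StrictRedis(host='localhost', port=6379)
# first 10 serialized items pushed by scrapy_redis.pipelines.RedisPipeline
for raw in r.lrange('sina01:items', 0, 9):
    data = json.loads(raw)
    print(data['head'], data['sonUrls'])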
Open a redis client and push a start url to test
lpush sinaspider:start_urls http://news.sina.com.cn/guide/
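After pushing the start url (and with a worker running), the request queue and the collected items can be inspected from the same redis client; the key names and types below follow the scrapy-redis defaults used above, where the priority queue is a sorted set and the items key is a list:

zcard sina01:requests
llen sina01:items
lrange sina01:items 0 1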
This article is reposted from the 51CTO blog 小白的希望; original link: http://blog.51cto.com/haoyonghui/1978095. Please contact the original author before reprinting.