Parsing and encoding URL parameters with the urllib module:
from urllib import parse

postdata = {'a': 1, 'b': 2}
data = parse.urlencode(postdata)
print(data)
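The reverse direction works too: parse.parse_qs turns a query string back into a dict of value lists. A quick round trip for illustration:

from urllib import parse

data = parse.urlencode({'a': 1, 'b': 2})
print(data)                  # a=1&b=2
print(parse.parse_qs(data))  # {'a': ['1'], 'b': ['2']}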
On Windows, a crawler script must include the following configuration, otherwise encoding errors occur:
import sys, io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
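On Python 3.7 and later the same effect is available without rebuilding the wrapper, via the stream's reconfigure method:

import sys

# TextIOWrapper.reconfigure() exists since Python 3.7
sys.stdout.reconfigure(encoding='gb18030')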
1. Crawling content from Jandan (jandan.net)
items.py
# data field definitions
import scrapy

class JiandanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    img_url = scrapy.Field()
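A scrapy.Item behaves like a dict, which is what the JSON pipeline below relies on. A quick illustration with made-up values:

item = JiandanItem(title='demo title', content='demo content', img_url='http://example.com/1.jpg')
print(item['title'])  # demo title
print(dict(item))     # plain dict, ready for json.dumps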
The spider script, jiandan.py:
import sys, io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

import scrapy
from ..items import JiandanItem

class JianDanSpider(scrapy.Spider):
    name = "jiandan"
    allowed_domains = ["jandan.net"]
    start_urls = [
        "http://jandan.net/",
    ]

    def parse(self, response):
        # HtmlXPathSelector is long deprecated; response.xpath covers the same ground
        items = response.xpath('//div[@class="post f list-post"]')
        for item in items:
            # prefer the lazy-load attribute, fall back to the plain src
            img_url = item.xpath('.//div[@class="thumbs_b"]/a/img/@data-original').extract_first()
            if not img_url:
                img_url = item.xpath('.//div[@class="thumbs_b"]/a/img/@src').extract_first()
            # image URLs are protocol-relative ("//..."), so strip and re-prefix
            img_url = "http://" + img_url.strip("/")
            title = item.xpath('.//div[@class="indexs"]/h2/a/text()').extract_first()
            # the summary is the fourth text node inside div.indexs
            content = item.xpath('.//div[@class="indexs"]/text()').extract()[3].strip()
            obj = JiandanItem(title=title, img_url=img_url, content=content)
            yield obj
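The XPath expressions can be tried out interactively before running the spider (the page structure may well have changed since this post was written):

scrapy shell "http://jandan.net/"
>>> response.xpath('//div[@class="post f list-post"]')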
pipelines.py
# data storage pipelines
import json
import os
import requests

# class JiandanPipeline(object):
#     def process_item(self, item, spider):
#         return item

class JsonPipeline(object):
    # persists the fields defined in items.py, one JSON object per line
    def __init__(self):
        self.file = open('jiandan.txt', 'w')

    def process_item(self, item, spider):
        v = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(v)
        self.file.write('\n')
        self.file.flush()
        return item
class FilePipeline(object):
    # downloads each item's image URL to disk
    def __init__(self):
        if not os.path.exists('imgs'):
            os.makedirs('imgs')

    def process_item(self, item, spider):
        response = requests.get(item['img_url'], stream=True)
        # name each file after the URL's basename so images don't overwrite one another
        file_name = os.path.join('imgs', os.path.basename(item['img_url']))
        with open(file_name, mode='wb') as f:
            f.write(response.content)
        return item
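Opening files in __init__ works, but Scrapy pipelines also get open_spider/close_spider hooks, so the output file can be closed cleanly when the crawl ends. A minimal sketch of JsonPipeline rewritten that way:

import json

class JsonPipeline(object):
    def open_spider(self, spider):
        self.file = open('jiandan.txt', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item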
settings.py
# register the item pipelines; lower numbers run earlier in the chain
ITEM_PIPELINES = {
    'jiandan.pipelines.JsonPipeline': 100,
    'jiandan.pipelines.FilePipeline': 300,
}
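For image downloads specifically, Scrapy also ships a built-in ImagesPipeline that handles fetching, deduplication and storage. It requires Pillow and expects the item to carry an image_urls list field, so JiandanItem would need adjusting; a configuration sketch:

# settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'imgs'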
Export the scraped items as JSON from the command line: scrapy crawl jiandan -o items.json
This post is reproduced from the 51CTO blog 小白的希望; original link: http://blog.51cto.com/haoyonghui/1976840. Please contact the original author if you wish to republish it.