http://www.xgezhang.com/python_crawler_jingdong.html
先上我的代码,参考了上面链接的文章:
# -*- coding: utf-8 -*-
# !/usr/bin/python
import contextlib
import cookielib
import os
import re
import sys
import urllib
import urllib2

from bs4 import BeautifulSoup
# Python 2 only: make UTF-8 the interpreter-wide default encoding so that
# implicit str/unicode conversions of Chinese text succeed.
default_encoding = "utf-8"
if sys.getdefaultencoding() != default_encoding:
    # site.py removes sys.setdefaultencoding at startup; reloading the
    # module brings the function back.
    reload(sys)
    sys.setdefaultencoding(default_encoding)
def getHtml(url, data=None):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to request.
        data: Optional mapping of form fields. When it is non-empty it is
            URL-encoded and passed as the request body (urllib2 then sends
            a POST); when it is ``None`` or empty no body is attached.

    Returns:
        The response body exactly as read from the connection.

    Raises:
        urllib2.URLError: Propagated unchanged from ``urllib2.urlopen``.
    """
    # ``None`` replaces the former ``{}`` default so that no dict object is
    # shared between calls.
    if data:
        req = urllib2.Request(url, urllib.urlencode(data))
    else:
        req = urllib2.Request(url)
    # Python 2 responses are not context managers; closing() guarantees the
    # connection is released even if read() raises.
    with contextlib.closing(urllib2.urlopen(req)) as response:
        return response.read()
# Route every urllib2 request through one opener that keeps cookies between
# calls and sends a desktop-browser User-Agent header.
cookie = cookielib.CookieJar()
cookieProc = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(cookieProc)
opener.addheaders = [(
    'User-Agent',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) '
    'Chrome/23.0.1271.64 Safari/537.11',
)]
# Installing the opener makes plain urllib2.urlopen() calls use it as well.
urllib2.install_opener(opener)
auth_url = 'https://passport.jd.com/uc/loginService'
home_url = 'http://usergrade.jd.com/user/consume'

# Fetch the login page; its form carries fields that are sent back together
# with the credentials.
url = "https://passport.jd.com/uc/login"
login = getHtml(url)
loginSoup = BeautifulSoup(login, 'html.parser')

# Collect the inputs of the first form once instead of re-walking the
# document for every field.
loginInputs = loginSoup.find_all("form")[0].find_all("input")

# The first input holds the uuid login parameter.
uuid = loginInputs[0]['value']
print(uuid)

# The seventh input is posted back under its own name with its own value.
# NOTE(review): these lookups are positional and depend on the page layout.
clrName = loginInputs[6]['name']
clrValue = loginInputs[6]['value']

# The inputs at index 4 (eid) and 5 (fp) are not required for the login
# request, so they are not collected.
# Download the captcha image referenced by the login page.
authcodeDiv = loginSoup.find_all("div", id="o-authcode")[0]
checkPicUrl = authcodeDiv.find_all("img")[0]['src2']
req = getHtml(checkPicUrl)
# Image data is binary: "wb" keeps the bytes intact on every platform, and
# the with-block closes the file even if the write fails.
with open("checkPic.jpg", "wb") as checkPic:
    checkPic.write(req)

# Display the image through the macOS `open` command, then ask the user to
# type the code shown in it.
os.system('open /Applications/Preview.app/ checkPic.jpg')
checkCode = raw_input("请输入弹出图片中的验证码:")
# Login endpoint. HTTPS is used (matching auth_url and the login page) so the
# account name and password are not sent in clear text.
url = "https://passport.jd.com/uc/loginService"

# Form fields for the login request. Replace the placeholder account name
# and password with real ones. The machineNet / machineCpu / machineDisk
# fields are intentionally not sent.
postData = {
    'loginname': '你自己的账号',
    'nloginpwd': '你自己的密码',
    'loginpwd': '你自己的密码',
    # Field scraped from the login form, echoed back under its own name.
    str(clrName): str(clrValue),
    'uuid': uuid,
    'authcode': checkCode,
}
passport = getHtml(url, postData)
print(passport)
# Request the account page through the cookie-aware opener so the cookies
# collected during login are sent along automatically; no second CookieJar
# is needed.
result = opener.open('http://i.jd.com/user/info')
try:
    soup = BeautifulSoup(result, 'html.parser')
finally:
    # The parser has read the response by now; release the connection.
    result.close()

# Nickname field of the account page.
nickName = soup.find_all("input", id="nickName")[0]["value"]
print("nickName: %s" % nickName)
其实在第一次爬的时候确实成功返回了 {"success":"http://www.jd.com"}。
但是当我回到寝室再次测试的时候,却返回了“请刷新页面后重新提交”,暂时还没有解决。
本文转自 努力的C 51CTO博客,原文链接:http://blog.51cto.com/fulin0532/1748590