最近打算抓取公司内部PPT模板库中的PPT,发现被各种安全机制屏蔽。由于好久没写程序了,这里先写几个例子回顾一下基本的爬虫知识。
目标网址是巴比特的登录页面:
http://8btc.com/member.php?mod=logging&action=login
在登录的时候开启fiddler抓包:
图中箭头标记处显示了真正提交表单的地址,将该地址记下:
http://8btc.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=
因为自己想用cookie的方式登录,代码如下:
import urllib.request
import urllib.parse
import urllib.error
import http.cookiejar
import sys
class ECNcookie:
    """Log in to 8btc.com with a form POST and reuse the saved cookies.

    ``cookie_saved`` submits the login form and writes the cookies it
    receives to ``self.cookie_dir``; ``access_other_page`` loads that file
    again and fetches a profile page with those cookies attached.
    """

    def __init__(self):
        """Set up the login URL, form fields, request headers and cookie path."""
        # Address the login form is actually submitted to.
        self.url = 'http://8btc.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash='
        # Form fields sent in the POST body.
        # NOTE(review): 'formhash' is hard-coded; presumably it was captured
        # from one specific login form and may need refreshing -- confirm.
        self.values = {
            'formhash': "284e610a",
            'referer': "http://8btc.com/index.php",
            'username': "xxx",
            'loginfield': "auto",
            'password': "xxx",
            'questionid': "0",
        }
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        self.user_headers = {
            'User-Agent': self.user_agent
        }
        # Despite the name, this is the path of the cookie *file*.
        self.cookie_dir = 'C:/Users/ecaoyng/Desktop/PPT/cookie.txt'

    def cookie_saved(self):
        """POST the login form, print the response and cookies, save cookies.

        The response body is decoded as GBK and printed, followed by the
        name and value of every cookie in the jar. The jar is then written
        to ``self.cookie_dir``. A ``urllib.error.URLError`` is caught and its
        reason printed; other exceptions propagate to the caller.
        """
        post_data = urllib.parse.urlencode(self.values)
        post_data = post_data.encode('utf-8')
        cookie = http.cookiejar.MozillaCookieJar(self.cookie_dir)
        handler = urllib.request.HTTPCookieProcessor(cookie)
        opener = urllib.request.build_opener(handler)
        request = urllib.request.Request(self.url, post_data, self.user_headers)
        try:
            # Context manager guarantees the HTTP response is closed.
            with opener.open(request) as response:
                page = response.read().decode(encoding="GBK")
            print(page)
            print('='*80)
            for i in cookie:
                print('Name: %s' % i.name)
                print('Value: %s' % i.value)
            print('='*80)
            # Keep session (discardable) and expired cookies in the file too.
            cookie.save(ignore_discard=True, ignore_expires=True)
        except urllib.error.URLError as e:
            print('Error msg: %s' % e.reason)

    def access_other_page(self):
        """Fetch a profile page using the cookies stored in ``self.cookie_dir``.

        The page body is decoded as GBK and printed. Any exception (missing
        cookie file, network failure, decode error, ...) is reported on
        stdout instead of being raised.
        """
        try:
            my_url = 'http://8btc.com/home.php?mod=space&uid=166072&do=profile&from=space'
            cookie = http.cookiejar.MozillaCookieJar()
            cookie.load(self.cookie_dir, ignore_discard=True, ignore_expires=True)
            get_request = urllib.request.Request(my_url, headers=self.user_headers)
            access_opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
            # Context manager guarantees the HTTP response is closed.
            with access_opener.open(get_request) as get_response:
                print('='*80)
                print(get_response.read().decode(encoding="GBK"))
        except Exception as e:
            # Only URLError carries ``reason`` (an attribute, not a method);
            # fall back to the exception itself so the handler never raises.
            print('Error msg when entry other pages: %s' % getattr(e, 'reason', e))
if __name__ == '__main__':
    # Show the interpreter's default string encoding, then a separator line.
    print(sys.getdefaultencoding(), '=' * 80, sep='\n')
    cookie_obj = ECNcookie()
    # Log in and persist the cookies first, then reuse them for a second page.
    for step in (cookie_obj.cookie_saved, cookie_obj.access_other_page):
        step()
在抓取的页面中能够查到自己的登录名和状态。
得到的cookie信息如下
Name: eCM1_5408_auth
Value: 6b7dnCekiynTLECh7T%2FcOLMQmTE1JFYd1bmVIrXHb2766l6TjDm3kIFiP%2BS8%2FhmYuV8kmN%2BdOOSZ%2FVrhyJ7TvRLdov8
Name: eCM1_5408_cack_wechat_bind
Value: 1
Name: eCM1_5408_checkfollow
Value: 1
Name: eCM1_5408_lastact
Value: 1512098740%09member.php%09logging
Name: eCM1_5408_lastcheckfeed
Value: 166072%7C1512098740
Name: eCM1_5408_lastvisit
Value: 1512095140
Name: eCM1_5408_lip
Value: 59.46.167.194%2C1512098705
Name: eCM1_5408_saltkey
Value: b5JB4z50
Name: eCM1_5408_sid
Value: LDP536
Name: eCM1_5408_ulastactivity
Value: 83461RS%2BUIpdpkz6fGBx3McFx9MHect60vmuLqMRqpRqCU2aPLMR
Name: PHPSESSID
Value: nm8972h3mlsc9uehh8s2or6cu3
Name: yd_cookie
Value: 5ab8c97f-448d-42306da49cc39d9bf6cbfca9d8a16ccd2db5