在写爬虫的过程中遇到如下错误:
WinError 10061 - No Connection Could be made
解决方法:
1. 打开 IE 的 Internet Options(Internet 选项)
2. 选择 Connections -> LAN settings
3. 勾上 "Automatically detect settings"
封装好的db操作
# -*- coding:utf-8 -*-
#__author__ = 'ecaoyng'
import pymysql
import time
class DBOperation:
    """Small helper around a pymysql connection used to store crawled URLs.

    The connection settings are hard-coded placeholders ('x'); only the
    table name is supplied by the caller.
    """

    def __init__(self, tb_name):
        """Store the connection settings and the target table name.

        :param tb_name: name of the table that ``tb_insert_url`` writes to.
        """
        self.db_host = 'x'
        self.db_port = 3306
        self.db_user = 'x'
        self.db_pwd = 'x'
        self.db_name = 'x'
        self.tb_name = tb_name

    def get_time(self):
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        # strftime() uses the current local time when no time tuple is given.
        return time.strftime('%Y-%m-%d %H:%M:%S')

    def db_conn(self):
        """Set up a connection with the database.

        :return: the pymysql connection, or None if connecting failed (the
                 error is printed, not raised).
        """
        exec_time = self.get_time()
        try:
            return pymysql.connect(host=self.db_host, port=self.db_port,
                                   user=self.db_user, passwd=self.db_pwd,
                                   db=self.db_name)
        except Exception as e:
            print((u'[%s]: Errors during db connection:%s' % (exec_time, e)))
            return None

    def db_cursor(self, conn):
        """Create a cursor on ``conn``.

        :param conn: connection returned by ``db_conn`` (may be None).
        :return: the cursor, or None if it could not be created; a None
                 connection ends up here too because the resulting
                 AttributeError is caught and printed.
        """
        try:
            return conn.cursor()
        except Exception as e:
            print(e)
            return None

    def db_close(self, cur, conn):
        """Close the cursor and the connection.

        ``db_conn`` and ``db_cursor`` return None on failure, so either
        argument may be None; such arguments are skipped instead of raising
        AttributeError.

        :param cur: cursor to close, or None.
        :param conn: connection to close, or None.
        """
        exec_time = self.get_time()
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
        print(u'[%s]: db closed' % exec_time)

    def tb_insert_url(self, cur, conn, urls):
        """Create the URL table if needed and insert ``urls`` into it.

        Errors are printed, not raised, so the caller cannot tell from the
        return value whether the insert succeeded.

        :param cur: open cursor.
        :param conn: open connection (used to commit).
        :param urls: sequence of URL strings, one row per URL.
        """
        exec_time = self.get_time()
        # NOTE: the table name is interpolated into the SQL text (identifiers
        # cannot be bound as parameters), so tb_name must come from trusted
        # code, never from user input.
        tb_exist_sql = (
            'CREATE TABLE IF NOT EXISTS ' + self.tb_name + ' (\n'
            'URL VARCHAR(200) NOT NULL\n'
            ')'
        )
        try:
            cur.execute(tb_exist_sql)
            print(u'[%s]: try to create table %s if not exists.' % (exec_time, self.tb_name))
            conn.commit()
            # The URL values themselves are passed as bound parameters.
            sql_insert_url = 'INSERT INTO ' + self.tb_name + ' VALUES (%s)'
            cur.executemany(sql_insert_url, urls)
            conn.commit()
        except Exception as e:
            print(u'[%s]: Errors during insert into %s:%s' % (exec_time, self.tb_name, e))
if __name__ == '__main__':
    # Smoke test: open a connection and a cursor, then close them again.
    db = DBOperation('ECNSlides')
    db_conn = db.db_conn()
    # db_conn() returns None when connecting failed (it already printed the
    # error); in that case there is nothing to open or close.
    if db_conn is not None:
        db_cur = db.db_cursor(db_conn)
        if db_cur is not None:
            db.db_close(db_cur, db_conn)
        else:
            # No cursor was created, so only the connection needs closing.
            db_conn.close()
下面是爬虫程序
# -*- coding:utf-8 -*-
#__author__ = 'ecaoyng'
from ESlides.src.DBOperation import *
import urllib.request
import re
import time
class ESlidesCrawler:
    """Crawl the slide search page and download / store the slide links."""

    def __init__(self):
        """Set the target page, the request headers and the download folder."""
        self.target_link = 'https://mediabank.ericsson.net/search/slides/group%20function%20%28gf%29'
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        self.user_headers = {
            'User-Agent': self.user_agent,
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # NOTE(review): this key contains spaces, so it is not the real
            # 'Accept-Encoding' header. Do not simply correct the spelling:
            # urllib does not decompress gzip/br responses itself, so
            # get_page()'s decode('utf-8') would then need explicit
            # decompression.
            'Accept - Encoding' : 'gzip, deflate, br',
            'Accept-Language' : 'zh-CN,zh;q=0.8',
            'Cookie' : 'PHPSESSID=57i0onm69eei46g6g23ek05tj2',
            'Host' : 'mediabank.ericsson.net',
            'Referer' : 'https://mediabank.ericsson.net/'
        }
        self.save_dir = 'C:/Users/ecaoyng/Desktop/PPT/'

    def get_time(self):
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        # strftime() uses the current local time when no time tuple is given.
        return time.strftime('%Y-%m-%d %H:%M:%S')

    def get_page(self):
        """Fetch the target page.

        :return: the page body decoded as UTF-8, or None when the request
                 failed with a URLError (the error is printed).
        """
        now_time = self.get_time()
        try:
            request = urllib.request.Request(self.target_link, headers=self.user_headers)
            # The context manager closes the HTTP response even if read() fails.
            with urllib.request.urlopen(request) as response:
                return response.read().decode('utf-8')
        except urllib.request.URLError as e:
            print(u'%s Errors during connect to target link:%s' % (now_time, e))
            return None

    def get_links(self):
        """Extract the download links of all slides on the target page.

        :return: list of download URLs (possibly empty), or None when the
                 page could not be fetched or parsing failed.
        """
        now_time = self.get_time()
        page_code = self.get_page()
        if page_code is None:
            print('page code returns none')
            return None
        try:
            pattern = re.compile(
                '<li id=.*?>.*?<a href="/media/(.*?)" class="thumb" draggable="true">', re.S)
            # Each captured media path is turned into its download URL.
            return [
                '%s%s%s' % ('https://mediabank.ericsson.net/details/', item, '/download/original')
                for item in pattern.findall(page_code)
            ]
        except Exception as e:
            print(u'[%s]: Errors during parser target link:%s' % (now_time, e))
            return None

    def save_links(self):
        """Save the slide download links into the database.

        Nothing is opened (and therefore nothing is closed) when there are
        no links or when the connection / cursor could not be created.
        """
        now_time = self.get_time()
        links = self.get_links()
        print(links)
        if links is None:
            print(u'[%s]: URL is None when insert to db' % now_time)
            return
        db = DBOperation('ECNSlides')
        db_conn = db.db_conn()
        if db_conn is None:
            # db_conn() already printed the connection error.
            return
        db_cur = db.db_cursor(db_conn)
        if db_cur is None:
            db_conn.close()
            return
        try:
            print(u'[%s]: start to urls insert to db' % now_time)
            db.tb_insert_url(db_cur, db_conn, links)
            # NOTE(review): tb_insert_url() is called without checking any
            # result, so this line runs whenever it returns normally — confirm
            # whether it can fail without raising.
            print(u'[%s]: write urls insert to db successfully' % now_time)
        finally:
            db.db_close(db_cur, db_conn)

    def slides_download_params(self):
        """Download every slide found by ``get_links`` over HTTP.

        Each file is written to ``save_dir`` as '<numeric id>.pptx'. The
        first failing download stops the whole run; the error is printed,
        not raised.
        """
        # Bound before the try so the except handler can always use it.
        now_time = self.get_time()
        links = self.get_links()
        if links is None:
            print(u'[%s]: URL is None, nothing to download' % now_time)
            return
        # Compiled once, outside the loop; raw string so \d is a regex escape.
        file_pattern = re.compile(r'.*?/(\d+)/download/original$', re.S)
        try:
            for url in links:
                now_time = self.get_time()
                file_name = file_pattern.findall(url)
                file_path = self.save_dir + ''.join(file_name) + '.pptx'
                print('Downloading to %s ...' % file_path)
                # NOTE(review): this request does not send self.user_headers
                # (no Cookie) — TODO confirm whether the download URL needs it.
                # Fetch first, then create the file, so a failed request does
                # not leave an empty file behind; both handles are closed by
                # the context managers.
                with urllib.request.urlopen(url) as slide:
                    with open(file_path, 'wb') as save_file:
                        save_file.write(slide.read())
        except Exception as e:
            print(u'[%s]: Errors during download slides: %s.' % (now_time, e))

    def slides_download_db(self):
        """Download slides using links from the remote db (not implemented)."""
        pass
if __name__ == '__main__':
    # Script entry point: download the slides directly over HTTP.
    slides_crawler = ESlidesCrawler()
    # Alternative step (disabled): store the links in the database instead.
    # slides_crawler.save_links()
    slides_crawler.slides_download_params()
问题出现了:在浏览器地址栏中直接输入下载地址(类似下面这个)可以下载到文件,
https://mediabank.ericsson.net/details/Organization%20simple/83138/download/original
但是在python代码中请求同一个地址,返回的却不是pptx文件,而是html文件.
要确认返回的具体是什么类型的文件,可以用如下方法查看响应信息:
# reobj=urllib.request.urlopen(url)
# print(type(reobj))
# print(reobj.info())
# print(reobj.getcode())
可以看到正常如果下载的是zip文件,则返回的信息如下:
Content-Type: application/x-zip-compressed
Last-Modified: Mon, 23 May 2016 07:50:56 GMT
Accept-Ranges: bytes
ETag: "0f075d6c7b4d11:0"
Server: Microsoft-IIS/7.5
X-Powered-By: ASP.NET
Date: Wed, 29 Nov 2017 07:07:27 GMT
Connection: close
Content-Length: 55712699
但是这里本来应该下载到ppt文件,实际返回的响应头却是:
Cache-Control: no-cache
Pragma: no-cache
Content-Length: 11743
Content-Type: text/html
Expires: Wed, 29 Nov 2017 07:04:04 GMT
Server: Microsoft-IIS/8.0
Set-Cookie: SMTargetSession=HTTPS%3A%2F%2Ffss%2Eericsson%2Ecom%2Fsiteminderagent%2Fredirectjsp%2Fredirect%2Dinternal%2Ejsp%3FSPID%3DMediabankIntern%26RelayState%3Dhttps%253A%252F%252Fmediabank%2Eericsson%2Enet%252Fdetails%252FOrganization%252520simple%252F83138%252Fdownload%252Foriginal%26SMPORTALURL%3Dhttps%253A%252F%252Ffss%2Eericsson%2Ecom%252Faffwebservices%252Fpublic%252Fsaml2sso%26SAMLTRANSACTIONID%3D176beb36%2Dfeb953b6%2D9a53d42e%2D58810506%2D087b72ac%2Da4e3; path=/
Set-Cookie: ASPSESSIONIDACATSTTS=FOLBNEGCIBMFCPILNEMHOHFN; path=/
X-Powered-By: ASP.NET
X-WAM-LOC: LP2-2
Date: Wed, 29 Nov 2017 07:05:04 GMT
Connection: close
Set-Cookie: BIGipServerWAM_PRD_Login=rd423o00000000000000000000ffff9958f466o50001; path=/
其中 Content-Type: text/html 说明返回的是html文件。将其打开之后发现是公司的安全认证页面.
于是开始思索是否可以用cookie的方式来抓取.
(未完待续)