python3 爬取公司内部的ppt资料

简介: 在写爬虫的过程中遇到如下错误: WinError 10061 - No Connection Could be made。解决方法见下文。

在写爬虫的过程中遇到如下错误:

WinError 10061 - No Connection Could be made

解决方法:

 1. 打开 IE 的 Internet Options(Internet 选项)
 2. 选择 Connections -> LAN settings
 3. 勾选 "Automatically detect settings"

封装好的db操作

# -*- coding:utf-8 -*-
#__author__ = 'ecaoyng'

import pymysql
import time

class DBOperation:
    """Small helper around a pymysql connection for storing crawled URLs.

    The connection settings are fixed in ``__init__`` (all set to the literal
    ``'x'`` here, presumably placeholders to be filled in); only the table
    name is chosen by the caller.
    """

    def __init__(self, tb_name):
        """Store the connection settings and the target table name.

        :param tb_name: name of the table used by ``tb_insert_url``. It is
            concatenated into the SQL text, so it must come from trusted code.
        """
        self.db_host = 'x'
        self.db_port = 3306
        self.db_user = 'x'
        self.db_pwd = 'x'
        self.db_name = 'x'
        self.tb_name = tb_name

    def get_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        now_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return now_time

    def db_conn(self):
        """Open a connection to the configured database.

        :return: the pymysql connection, or ``None`` when connecting failed
            (the error is printed, not raised).
        """
        exec_time = self.get_time()
        try:
            conn = pymysql.connect(host=self.db_host, port=self.db_port,
                                   user=self.db_user, passwd=self.db_pwd, db=self.db_name)
            return conn
        except Exception as e:
            print((u'[%s]: Errors during db connection:%s' % (exec_time, e)))
            return None

    def db_cursor(self, conn):
        """Create a cursor on ``conn``.

        :param conn: connection returned by ``db_conn`` (may be ``None``).
        :return: the cursor, or ``None`` when it could not be created
            (the error is printed, not raised).
        """
        try:
            cur = conn.cursor()
            return cur
        except Exception as e:
            print(e)
            return None

    def db_close(self, cur, conn):
        """Close the cursor and the connection.

        Either argument may be ``None`` (``db_conn`` and ``db_cursor`` return
        ``None`` on failure); missing handles are skipped instead of raising
        ``AttributeError``.

        :param cur: cursor to close, or ``None``.
        :param conn: connection to close, or ``None``.
        """
        exec_time = self.get_time()
        if cur is None and conn is None:
            # Nothing was ever opened, so there is nothing to report either.
            return
        try:
            if cur is not None:
                cur.close()
        finally:
            # Close the connection even if closing the cursor failed.
            if conn is not None:
                conn.close()
        print(u'[%s]: db closed' % exec_time)

    def tb_insert_url(self, cur, conn, urls):
        """Create the URL table if needed and insert ``urls`` into it.

        Errors are printed rather than raised. When a statement fails, the
        pending transaction is rolled back so the connection is not left with
        half-applied work.

        :param cur: open cursor on ``conn``.
        :param conn: open connection; used for commit and rollback.
        :param urls: sequence of URL values, one row per element.
        """
        exec_time = self.get_time()
        tb_exist_sql = """CREATE TABLE IF NOT EXISTS """ + self.tb_name + """ (
             URL  VARCHAR(200) NOT NULL
             )"""
        try:
            cur.execute(tb_exist_sql)
            print(u'[%s]: try to create table %s if not exists.' % (exec_time, self.tb_name))
            conn.commit()

            # Only the table name is concatenated; the URL values themselves
            # are passed as parameters and escaped by the driver.
            sql_insert_url = 'INSERT INTO ' + self.tb_name + ' VALUES (%s)'
            cur.executemany(sql_insert_url, urls)
            conn.commit()
        except Exception as e:
            print(u'[%s]: Errors during insert into %s:%s' % (exec_time, self.tb_name, e))
            if conn is not None:
                try:
                    conn.rollback()
                except Exception as rollback_error:
                    # The connection itself may be broken; report and move on.
                    print(u'[%s]: Errors during rollback on %s:%s'
                          % (exec_time, self.tb_name, rollback_error))


if __name__ == '__main__':
    # Smoke test: connect, open a cursor, then release both again.
    db = DBOperation('ECNSlides')
    db_conn = db.db_conn()
    # db_conn() returns None (after printing the error) when connecting fails,
    # so only go on when there is a connection to work with.
    if db_conn is not None:
        db_cur = db.db_cursor(db_conn)
        if db_cur is not None:
            db.db_close(db_cur, db_conn)
        else:
            # No cursor to hand to db_close(); close the connection directly.
            db_conn.close()

下面是爬虫程序

# -*- coding:utf-8 -*-
#__author__ = 'ecaoyng'

from ESlides.src.DBOperation import *
import urllib.request
import re
import time


class ESlidesCrawler:
    """Collect slide download links from the media bank search page.

    The links can be stored in a database (``save_links``) or downloaded
    straight to ``save_dir`` (``slides_download_params``).
    """

    def __init__(self):
        """Set the search page URL, the request headers and the download folder."""
        self.target_link = 'https://mediabank.ericsson.net/search/slides/group%20function%20%28gf%29'
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        # No Accept-Encoding entry on purpose: the previous key
        # 'Accept - Encoding' was not a valid header name, and urllib does not
        # decompress gzip/deflate/br bodies, so get_page() must not ask for a
        # compressed response that it then decodes as UTF-8 text.
        self.user_headers = {
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cookie': 'PHPSESSID=57i0onm69eei46g6g23ek05tj2',
            'Host': 'mediabank.ericsson.net',
            'Referer': 'https://mediabank.ericsson.net/',
        }
        self.save_dir = 'C:/Users/ecaoyng/Desktop/PPT/'

    def get_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        now_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return now_time

    def get_page(self):
        """Fetch ``target_link`` with ``user_headers`` and return its text.

        :return: the response body decoded as UTF-8, or ``None`` when the
            request failed with a ``URLError`` (the error is printed).
        """
        now_time = self.get_time()
        try:
            request = urllib.request.Request(self.target_link, headers=self.user_headers)
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(request) as response:
                return response.read().decode('utf-8')
        except urllib.request.URLError as e:
            print(u'%s Errors during connect to target link:%s' % (now_time, e))
            return None

    def get_links(self):
        """Extract the slide download links from the search page.

        :return: list of download URLs (possibly empty), or ``None`` when the
            page could not be fetched or parsed.
        """
        now_time = self.get_time()
        page_code = self.get_page()
        if page_code is None:
            print('page code returns none')
            return None
        try:
            pattern = re.compile(
                '<li id=.*?>.*?<a href="/media/(.*?)" class="thumb" draggable="true">', re.S)
            # Each captured media path is turned into its "download original" URL.
            return [
                '%s%s%s' % ('https://mediabank.ericsson.net/details/', item, '/download/original')
                for item in pattern.findall(page_code)
            ]
        except Exception as e:
            print(u'[%s]: Errors during parser target link:%s' % (now_time, e))
            return None

    def save_links(self):
        """Store the extracted links in the ``ECNSlides`` table.

        Nothing is opened or closed when there are no links or when the
        database connection or cursor cannot be created.
        """
        now_time = self.get_time()
        links = self.get_links()
        print(links)
        if links is None:
            print(u'[%s]: URL is None when insert to db' % now_time)
            return

        db = DBOperation('ECNSlides')
        db_conn = db.db_conn()
        if db_conn is None:
            # db_conn() has already printed the connection error.
            return
        db_cur = db.db_cursor(db_conn)
        if db_cur is None:
            # No cursor to hand to db_close(); close the connection directly.
            db_conn.close()
            return
        try:
            print(u'[%s]: start to urls insert to db' % now_time)
            db.tb_insert_url(db_cur, db_conn, links)
            # NOTE(review): tb_insert_url prints its own errors and returns
            # nothing, so this line cannot tell success from failure.
            print(u'[%s]: write urls insert to db successfully' % now_time)
        finally:
            db.db_close(db_cur, db_conn)

    def slides_download_params(self):
        """Download every extracted link into ``save_dir`` as ``<id>.pptx``.

        ``<id>`` is the numeric path segment right before
        ``/download/original``. The first failing download stops the run; the
        error is printed, not raised.
        """
        now_time = self.get_time()
        links = self.get_links()
        if links is None:
            print(u'[%s]: no links to download' % now_time)
            return

        # Compiled once; a raw string keeps \d from being an invalid escape.
        file_pattern = re.compile(r'.*?/(\d+)/download/original$', re.S)
        try:
            for url in links:
                now_time = self.get_time()
                match = file_pattern.search(url)
                if match is None:
                    # Without a numeric id there is no usable file name.
                    print(u'[%s]: skip %s, no slide id in url' % (now_time, url))
                    continue
                file_path = self.save_dir + match.group(1) + '.pptx'

                print('Downloading to %s ...' % file_path)

                # NOTE(review): this request is sent without self.user_headers
                # (no cookie); confirm whether the server needs the session.
                # The URL is opened before the file so a failed request does
                # not leave an empty .pptx behind; both handles are always closed.
                with urllib.request.urlopen(url) as slide:
                    with open(file_path, 'wb') as save_file:
                        save_file.write(slide.read())
        except Exception as e:
            print(u'[%s]: Errors during download slides: %s.' % (now_time, e))

    def slides_download_db(self):
        """Download slides using links stored in the remote db (not implemented)."""
        pass













if __name__ == '__main__':
    # Entry point: download the slides straight from the scraped links.
    # Storing the links in the database first (save_links) is disabled here.
    ESlidesCrawler().slides_download_params()

问题出现了:在浏览器中直接输入下载地址可以正常下载,地址类似于

https://mediabank.ericsson.net/details/Organization%20simple/83138/download/original

但是在python代码中请求这个地址,返回的不是pptx文件,而是html文件。
可以用下面的方法查看服务器实际返回的内容类型:

# reobj=urllib.request.urlopen(url)
# print(type(reobj))
# print(reobj.info())
# print(reobj.getcode())

作为对比,正常下载一个zip文件时,返回的响应头如下:

Content-Type: application/x-zip-compressed
Last-Modified: Mon, 23 May 2016 07:50:56 GMT
Accept-Ranges: bytes
ETag: "0f075d6c7b4d11:0"
Server: Microsoft-IIS/7.5
X-Powered-By: ASP.NET
Date: Wed, 29 Nov 2017 07:07:27 GMT
Connection: close
Content-Length: 55712699

但是请求本应是ppt文件的地址时,返回的响应头却是:

Cache-Control: no-cache
Pragma: no-cache
Content-Length: 11743
Content-Type: text/html
Expires: Wed, 29 Nov 2017 07:04:04 GMT
Server: Microsoft-IIS/8.0
Set-Cookie: SMTargetSession=HTTPS%3A%2F%2Ffss%2Eericsson%2Ecom%2Fsiteminderagent%2Fredirectjsp%2Fredirect%2Dinternal%2Ejsp%3FSPID%3DMediabankIntern%26RelayState%3Dhttps%253A%252F%252Fmediabank%2Eericsson%2Enet%252Fdetails%252FOrganization%252520simple%252F83138%252Fdownload%252Foriginal%26SMPORTALURL%3Dhttps%253A%252F%252Ffss%2Eericsson%2Ecom%252Faffwebservices%252Fpublic%252Fsaml2sso%26SAMLTRANSACTIONID%3D176beb36%2Dfeb953b6%2D9a53d42e%2D58810506%2D087b72ac%2Da4e3; path=/
Set-Cookie: ASPSESSIONIDACATSTTS=FOLBNEGCIBMFCPILNEMHOHFN; path=/
X-Powered-By: ASP.NET
X-WAM-LOC: LP2-2
Date: Wed, 29 Nov 2017 07:05:04 GMT
Connection: close
Set-Cookie: BIGipServerWAM_PRD_Login=rd423o00000000000000000000ffff9958f466o50001; path=/

Content-Type: text/html 说明是html文件。将其打开之后发现是公司的安全认证页面.

于是开始思索是否可以用cookie的方式来抓取.
(未完待续)

目录
相关文章
|
8月前
|
Python
Python 采集109个中国风风格PPT
Python 采集109个中国风风格PPT
71 3
|
8月前
|
大数据 Python
Python 采集87个手绘风格PPT模板
Python 采集87个手绘风格PPT模板
86 1
|
8月前
|
Python
Python 采集77个教学课件PPT模板
Python 采集77个教学课件PPT模板
81 0
|
3月前
|
自然语言处理 数据处理 Python
python操作和解析ppt文件 | python小知识
本文将带你从零开始,了解PPT解析的工具、工作原理以及常用的基本操作,并提供具体的代码示例和必要的说明【10月更文挑战第4天】
537 60
|
8月前
|
Python
Python 将PowerPoint (PPT/PPTX) 转为HTML
使用Python将PowerPoint转换为HTML以适应网络分享。需安装`Spire.Presentation for Python`库,通过`pip install Spire.Presentation`。示例包括:1) 全部转换,使用`Presentation.SaveToFile()`方法;2) 转换特定幻灯片,通过`Presentation.Slides[]`获取幻灯片再保存。代码示例展示了具体操作步骤。
127 6
|
6月前
|
存储 程序员 Python
小白也能用的代码!1行Python,把PPT转成1张长图
大家好,我是程序员晚枫。今天介绍`python-office`库的新功能:仅用1行Python代码将PPT转为单张长图。
89 11
 小白也能用的代码!1行Python,把PPT转成1张长图
|
5月前
|
Python
Python——将PPT和Word转为PDF文件
Python——将PPT和Word转为PDF文件
90 1
|
5月前
|
机器学习/深度学习 数据采集 数据可视化
【python】python母婴数据分析模型预测可视化(数据集+论文+PPT+源码)【独一无二】
【python】python母婴数据分析模型预测可视化(数据集+论文+PPT+源码)【独一无二】
|
5月前
|
数据可视化 搜索推荐 定位技术
25页PPT | 如何利用python进行地图可视化?
25页PPT | 如何利用python进行地图可视化?
|
5月前
|
数据采集 数据挖掘 大数据
47页PPT | 如何利用Python进行自动化办公?
47页PPT | 如何利用Python进行自动化办公?