爬虫学习一-阿里云开发者社区

 
        一、requests模块GET请求
       
        1. 
        无参实例 
       
        import requests

        # Download the Autohome news index page; no query parameters are sent.
        ret = requests.get(url='http://www.autohome.com.cn/news/')

        # Switch the response to the GBK (Chinese) encoding before reading .text.
        ret.encoding = 'gbk'

        print(ret.url)   # print the URL
        print(ret.text)  # print the response text
       
        2.
        有参实例 
       
        import requests

        # Query-string parameters handed to requests via ``params=``.
        params_dic = dict(
            hostid='10107',
            elementid='23',
            screen='1',
            name='10.28.142.240',
        )

        ret = requests.get('http://zabbix.test.com/screens.php', params=params_dic)

        print(ret.url)   # final URL, including the encoded query string
        print(ret.text)
       
        二、requests模块POST请求
       
        1.post
        基本用法 
       
        import requests

        # Form fields sent in the body of the POST request.
        data = {
            'hostid': '10107',
            'elementid': '23',
            'screen': '1',
            'name': '10.28.142.240',
        }

        # NOTE(review): the URL argument was lost in the original snippet (the
        # string literal was left unterminated, so the call could not even be
        # parsed). The endpoint of the GET example is assumed here -- confirm
        # the address that was actually intended.
        ret = requests.post('http://zabbix.test.com/screens.php', data=data)

        print(ret.text)
       
        2.post
        传数据 
       
        import requests
        from bs4 import BeautifulSoup

        url = 'http://zabbix.kuaikuaidai.com:8888/index.php'

        # Fetch the login page first: the values of the "sid" and "form_refresh"
        # elements are read from it and echoed back in the login form below.
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text, 'html.parser')

        tag = soup.find(attrs={'name': 'sid'})
        sid = tag.attrs['value']

        tag1 = soup.find(attrs={'name': 'form_refresh'})
        form_refresh = tag1.attrs['value']

        # NOTE(review): credentials are hard-coded in this snippet; load them
        # from a safer place in real use.
        data = {
            'sid': sid,
            'form_refresh': form_refresh,
            'name': 'admin',
            'password': 'kkdai123',
            'autologin': '1',
            'enter': 'Sign in',
        }

        # ``data=<dict>`` is sent form-encoded, so the Content-Type must say so.
        # The original 'text/html' value mislabelled the body, which can keep
        # the server from parsing the submitted form fields.
        headers = {'content-type': 'application/x-www-form-urlencoded'}

        zabbix_ret = requests.post(url, data=data, headers=headers)

        print(zabbix_ret.status_code)
        print(zabbix_ret.text)
       
        3.
        其他请求 
       
        requests.get(url, params=None, **kwargs)
        requests.post(url, data=None, json=None, **kwargs)
        requests.put(url, data=None, **kwargs)
        requests.head(url, **kwargs)
        requests.delete(url, **kwargs)
        requests.patch(url, data=None, **kwargs)
        requests.options(url, **kwargs)

        注: 以上方法都是通过requests.request(method, url, **kwargs)构建
       
        三、BeautifulSoup模块解析html文档
       
        # Fetch the Autohome news page and decode it as GBK.
        auto_home = requests.get('http://www.autohome.com.cn/news/')
        auto_home.encoding = 'gbk'
        auto_home_news = auto_home.text

        # Parse the HTML; this returns the document-structure object.
        soup = BeautifulSoup(auto_home_news, 'html.parser')

        # Find the first <h3> tag and print its tag name.
        tag = soup.find('h3')
        print(tag.name)
       
        1.name
        标签 
       
        tag = soup.find('h3')
        name = tag.name   # read the tag name
        tag.name = 'h4'   # set the tag name
       
        2.attr
        标签属性 
       
        # Show the tag's attributes.
        print(tag.attrs)

        # Set an attribute; item assignment on a Tag writes into ``tag.attrs``.
        tag['id'] = '123'
       
        3.children
        所有子标签 
       
        tag = soup.find(name='body')

        # ``.children`` is a lazy iterator over the direct children of the tag.
        v = tag.children

        # Printing ``v`` itself would only show the iterator object, not the
        # child nodes, so walk it and print each child.
        for child in v:
            print(child)
       
        4.descendants
        所有子孙标签 
       
        tag = soup.find(name='body')

        # ``.descendants`` is a lazy generator over every nested node
        # (children, grandchildren, ...).
        v = tag.descendants

        # Printing ``v`` itself would only show the generator object, so walk
        # it and print each descendant.
        for node in v:
            print(node)
       
        5.clear
        清空所有子孙标签 
       
        tag = soup.find('body')

        # Empty the tag: everything nested inside it is removed.
        tag.clear()

        print(tag)
       
        6.decompose
        递归删除所有标签 
       
        tag = soup.find(name='body')

        # Remove the tag from the tree and destroy it together with its contents.
        tag.decompose()

        # A decomposed tag must not be used any more (its behaviour is
        # undefined), so print the remaining document instead of ``tag``.
        print(soup)
       
        7.extract
        递归删除所有标签并返回删除的标签 
       
        tag = soup.find('body')

        # Detach the tag from the tree; the removed tag is handed back.
        v = tag.extract()

        print(v)
       
        8.decode
        ,转换为字符串（含当前标签）；decode_contents（不含当前标签） 
       
        tag = soup.find('body')

        v = tag.decode()            # as a string, including the tag itself
        v1 = tag.decode_contents()  # as a string, without the tag itself

        print(v)
       
        9.encode
        ,转换为字节（含当前标签）；encode_contents（不含当前标签） 
       
        tag = soup.find('body')

        v = tag.encode()            # as bytes, including the tag itself
        v1 = tag.encode_contents()  # as bytes, without the tag itself

        print(v)
       
        10.find
        获取匹配的第一个标签 
       
        # First matching tag only.
        tag = soup.find('body')
       
        11.find_all
        获取匹配的所有标签 
       
        # Every matching tag.
        tag = soup.find_all('h3')
       
        12.has_attr
        判断标签是否有某个属性 
       
        tag = soup.find('h3')

        # Check whether the tag carries an "id" attribute.
        v = tag.has_attr('id')

        print(v)
       
        13.get_text
        获取标签文本内容 
       
        tag = soup.find('h3')

        # Text content of the tag.
        v = tag.get_text()

        print(v)
       
        14.is_empty_element
        ,是否是空标签(是否可以是空)或者自闭合标签 
       
        # tag = soup.find('br')
       
        # v = tag.is_empty_element
       
        # print(v)
       
        15.
        当前的关联标签 
       
        # soup.next
       
        # soup.next_element
       
        # soup.next_elements
       
        # soup.next_sibling
       
        # soup.next_siblings
       
        #
       
        # tag.previous
       
        # tag.previous_element
       
        # tag.previous_elements
       
        # tag.previous_sibling
       
        # tag.previous_siblings
       
        #
       
        # tag.parent
       
        # tag.parents
       
        16.
        查找某标签的关联标签 
       
        # tag.find_next(...)
       
        # tag.find_all_next(...)
       
        # tag.find_next_sibling(...)
       
        # tag.find_next_siblings(...)
       
        # tag.find_previous(...)
       
        # tag.find_all_previous(...)
       
        # tag.find_previous_sibling(...)
       
        # tag.find_previous_siblings(...)
       
        # tag.find_parent(...)
       
        # tag.find_parents(...)
       
        # 参数同find_all
       
        17.string
        获取修改标签内容 
       
        tag = soup.find('h3')

        tag.string = 'new content'  # set the tag's content
        v = tag.string              # read it back

        print(v)
       
        18.append
        在当前标签内部追加一个标签 
       
        # tag = soup.find('body')
       
        # tag.append(soup.find('a'))
       
        # print(soup)
       
        #
       
        # from bs4.element import Tag
       
        # obj = Tag(name='i',attrs={'id': 'it'})
       
        # obj.string = '我是一个新来的'
       
        # tag = soup.find('body')
       
        # tag.append(obj)
       
        # print(soup)
       
        19.insert
        在当前标签内部指定位置插入一个标签 
       
        # from bs4.element import Tag
       
        # obj = Tag(name='i', attrs={'id': 'it'})
       
        # obj.string = '我是一个新来的'
       
        # tag = soup.find('body')
       
        # tag.insert(2, obj)
       
        # print(soup)
       
        登录抽屉新热榜点赞
       
        方法一
       
        import requests

        # Step 1: open the home page and keep the cookies it hands out.
        i1 = requests.get('http://dig.chouti.com/')
        i1_cookies = i1.cookies.get_dict()
        # print(i1_cookies)

        # Step 2: log in, sending the cookies from step 1 along with the form.
        login_form = {
            'phone': '86xxxxxxxxx',
            'password': 'xxxxxxx',
            'oneMonth': "",
        }
        i2 = requests.post(
            "http://dig.chouti.com/login",
            data=login_form,
            cookies=i1_cookies,
        )

        # Step 3: vote, presenting only the 'gpsd' cookie from the first response.
        gpsd = i1_cookies['gpsd']
        i3 = requests.post(
            "http://dig.chouti.com/link/vote?linksId=14723416",
            cookies={'gpsd': gpsd},
        )

        print(i3.text)
       
        方法二
       
        import requests

        # Header set sent with every request in this example.
        headers = dict([
            ("Host", "dig.chouti.com"),
            ("Referer", "http://dig.chouti.com/"),
            ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0'),
            ('X-Requested-With', 'XMLHttpRequest'),
        ])

        # A Session keeps cookies between requests, so none are passed by hand.
        session = requests.Session()

        # Home page first.
        i1 = session.get("http://dig.chouti.com/", headers=headers)

        # Log in.
        login_form = {
            'phone': "8615726697022",
            'password': 'xxxxxxx',
            'oneMonth': "",
        }
        i2 = session.post("http://dig.chouti.com/login", data=login_form, headers=headers)

        # Vote.
        i3 = session.post(
            "http://dig.chouti.com/link/vote?linksId=14723416",
            headers=headers,
        )

        print(i3.text)
       
        登录github账号显示项目列表
       
        import requests
        from bs4 import BeautifulSoup
        from bs4.element import Tag

        # Load the login page: the form's "authenticity_token" input value and
        # the cookies of this response are both needed for the login request.
        i1 = requests.get('https://github.com/login')
        soup1 = BeautifulSoup(i1.text, features='lxml')
        tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
        authenticity_token = tag.get('value')
        c1 = i1.cookies.get_dict()
        i1.close()

        form_data = {
            "authenticity_token": authenticity_token,
            'utf8': "",
            'commit': "Sign in",
            'login': '758109577@qq.com',
            'password': 'xxxxxxxx',
        }

        # Submit the login form, then merge the cookies of the login response
        # into the ones collected from the login page.
        i2 = requests.post('https://github.com/session', data=form_data, cookies=c1)
        c2 = i2.cookies.get_dict()
        c1.update(c2)

        # Request the repository settings page with the merged cookies.
        i3 = requests.get('https://github.com/settings/repositories', cookies=c1)
        soup3 = BeautifulSoup(i3.text, features='lxml')
        list_group = soup3.find(name='div', class_='listgroup')

        # Without the list container there is nothing to show; stop with a
        # readable message instead of an AttributeError on None.
        if list_group is None:
            raise SystemExit('repository list not found (login may have failed)')

        for child in list_group.children:
            # Skip non-tag children such as whitespace text nodes.
            if not isinstance(child, Tag):
                continue
            project_tag = child.find(name='a', class_='mr-1')
            size_tag = child.find(name='small')
            # Skip entries that lack the expected link or size element.
            if project_tag is None or size_tag is None:
                continue
            # Link text is the project, href is its path; the original passed
            # these two in the opposite order to the labels.
            temp = "项目: %s(%s); 项目路径: %s" % (
                project_tag.string,
                size_tag.string,
                project_tag.get('href'),
            )
            print(temp)
       
        方法二
       
        import requests
        from bs4 import BeautifulSoup
        from bs4.element import Tag

        # The Session carries cookies from one request to the next, so the
        # manual cookie dictionaries of the original (c1/c2) were never used
        # and have been dropped.
        session = requests.Session()

        # Load the login page to read the form's "authenticity_token" value.
        i1 = session.get('https://github.com/login')
        soup1 = BeautifulSoup(i1.text, features='lxml')
        tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
        authenticity_token = tag.get('value')
        i1.close()

        form_data = {
            "authenticity_token": authenticity_token,
            'utf8': "",
            'commit': "Sign in",
            'login': '758109577@qq.com',
            'password': 'xxxxxxxxxx',
        }

        # Submit the login form through the same session.
        i2 = session.post('https://github.com/session', data=form_data)

        # Request the repository settings page.
        i3 = session.get('https://github.com/settings/repositories')
        soup3 = BeautifulSoup(i3.text, features='lxml')
        list_group = soup3.find(name='div', class_='listgroup')

        # Without the list container there is nothing to show; stop with a
        # readable message instead of an AttributeError on None.
        if list_group is None:
            raise SystemExit('repository list not found (login may have failed)')

        for child in list_group.children:
            # Skip non-tag children such as whitespace text nodes.
            if not isinstance(child, Tag):
                continue
            project_tag = child.find(name='a', class_='mr-1')
            size_tag = child.find(name='small')
            # Skip entries that lack the expected link or size element.
            if project_tag is None or size_tag is None:
                continue
            # Link text is the project, href is its path; the original passed
            # these two in the opposite order to the labels.
            temp = "项目: %s(%s); 项目路径: %s" % (
                project_tag.string,
                size_tag.string,
                project_tag.get('href'),
            )
            print(temp)
       
        知乎登录
       
        #!/usr/bin/env python
        # -*- coding:utf-8 -*-
        import time

        import requests
        from bs4 import BeautifulSoup

        # The same User-Agent header is sent with every request below.
        ua_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        }

        session = requests.Session()

        # 1) Sign-in page: read the value of the "_xsrf" input.
        i1 = session.get('https://www.zhihu.com/#signin', headers=ua_headers)
        soup1 = BeautifulSoup(i1.text, 'lxml')
        xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})
        xsrf = xsrf_tag.get('value')

        # 2) Captcha image: download it, save it to disk, and ask the user to
        #    type what it shows.
        current_time = time.time()
        i2 = session.get(
            'https://www.zhihu.com/captcha.gif',
            params={'r': current_time, 'type': 'login'},
            headers=ua_headers,
        )
        with open('zhihu.gif', 'wb') as f:
            f.write(i2.content)
        captcha = input('请打开zhihu.gif文件，查看并输入验证码：')

        # 3) Submit the login form.
        form_data = {
            "_xsrf": xsrf,
            'password': 'xxxxxx',
            "captcha": captcha,
            'email': '7xxxxx@qq.com',
        }
        i3 = session.post(
            'https://www.zhihu.com/login/email',
            data=form_data,
            headers=ua_headers,
        )

        # 4) Profile settings page: pull the nickname out of the
        #    "rename-section" element and print it.
        i4 = session.get('https://www.zhihu.com/settings/profile', headers=ua_headers)
        soup4 = BeautifulSoup(i4.text, 'lxml')
        tag = soup4.find(id='rename-section')
        name_span = tag.find('span', class_='name')
        nick_name = name_span.string
        print(nick_name)

 
        本文转自小白的希望 51CTO博客，原文链接：http://blog.51cto.com/haoyonghui/1972754，如需转载请自行联系原作者

爬虫学习一

热门文章

最新文章

相关课程

相关电子书

探索云世界

热门

云计算

大数据

云原生

人工智能

数据库

开发与运维

活动广场

任务中心

训练营

直播

乘风者计划

下载

镜像站

技术资料

爬虫学习一

热门文章

最新文章

相关课程

相关电子书