开发者社区> 问答> 正文

街拍图片网站抓取代码记录

# -*- coding:utf-8 -*-


import requests
from urllib  import urlencode
import urllib
from requests.exceptions import RequestException
import re
import os




def _save_image(img_url, path, timeout):
    """Download ``img_url`` and write the raw response body to ``path``.

    Raises:
        RequestException: the request failed or returned an HTTP error status.
        IOError: the target file could not be written.
    """
    # A protocol-relative URL ("//host/a.jpg") has no scheme, which requests
    # rejects; the search endpoint is plain http, so reuse that scheme.
    if img_url.startswith('//'):
        img_url = 'http:' + img_url
    response = requests.get(img_url, timeout=timeout)
    response.raise_for_status()
    with open(path, 'wb') as image_file:
        image_file.write(response.content)


def get_info(offset, keyword, timeout=10, save_dir='jiepai'):
    """Fetch one page of search results and download every image they link to.

    The search endpoint is queried for ``keyword`` starting at ``offset``.
    For each result that has an ``article_url`` the article HTML is fetched,
    every ``img src="..."`` value is extracted with a regular expression, and
    each image is saved as ``<save_dir>/<num>.jpg``.

    The module-level counter ``num`` names the files and is incremented after
    every successful download; it must be defined before the first call.
    ``save_dir`` must already exist.

    A failure on one article or one image prints ``error`` and moves on to
    the next one instead of abandoning the rest of the page.

    Args:
        offset: index of the first search result to request.
        keyword: search term sent to the endpoint.
        timeout: seconds to wait on each HTTP request before giving up.
        save_dir: directory the images are written into.
    """
    global num

    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
    }
    try:
        # requests url-encodes ``params`` and appends them as the query string.
        response = requests.get('http://www.toutiao.com/search_content/',
                                params=params, timeout=timeout)
    except RequestException:
        print('error')
        return
    if response.status_code != 200:
        return

    try:
        payload = response.json()
    except ValueError:
        # The body was not valid JSON.
        print('error')
        return
    if not isinstance(payload, dict):
        return

    img_pattern = r'img src="(.*?)"'
    # 'data' may be absent or null; treat both as "no results".
    for item in payload.get('data') or []:
        article_url = item.get('article_url') if isinstance(item, dict) else None
        if not article_url:
            continue
        try:
            html = requests.get(article_url, timeout=timeout).text
        except RequestException:
            print('error')
            continue

        for img_url in re.findall(img_pattern, html):
            path = os.path.join(save_dir, '%d.jpg' % num)
            try:
                _save_image(img_url, path, timeout)
            except (RequestException, IOError):
                print('error')
                continue
            print("已下载第%d 美照" % num)
            # Only advance the counter once the file has actually been saved.
            num += 1
# Running download counter. get_info() reads and increments it through
# ``global num`` and uses it as the numeric file name of each saved image.
num = 1

# Create the output folder on first run; leave an existing path untouched.
output_dir_present = os.path.exists('jiepai')
if not output_dir_present:
    os.mkdir('jiepai')

# Request the search results page by page: offsets 0, 20, 40, ..., 980.
offset = 0
while offset < 1000:
    get_info(offset, '街拍')
    offset += 20

展开
收起
xuyuan.xy 2017-08-06 17:22:57 2835 0
0 条回答
写回答
取消 提交回答
问答分类:
问答地址:
问答排行榜
最热
最新

相关电子书

更多
微信广告引擎与播放节奏算法实践 立即下载
数据如何让运动更好玩 立即下载
数据如何让运动更好玩有趣 立即下载