Grabbing a site's images with Python multithreading is very fast. Let's go straight to the code:
import urllib
import threading
import time
import socket

socket.setdefaulttimeout(30)

# Build the list of directory URLs. The original format string had one
# %02d but two arguments, which raises a TypeError at runtime; the
# pattern below is a guess at the intent. xx.com is a placeholder,
# so adapt the whole pattern to your target site.
urls = []
j = 0
for i in xrange(1, 81):
    if (i - 1) % 4 == 0:
        j += 1
    if (j - 1) % 5 == 0:
        j = 1
    site = 'http://xx.com/%02d/xz%02d/images/' % (j, i)
    urls.append(site)
    print urls[i - 1]

def mkdir(path):
    import os
    # strip surrounding whitespace and any trailing backslash
    path = path.strip()
    path = path.rstrip("\\")
    # create the directory only if it does not exist yet
    if not os.path.exists(path):
        print path + u' created'
        os.makedirs(path)
        return True
    else:
        print path + u' already exists'
        return False

def cbk(a, b, c):
    '''Progress callback; pass it as the reporthook argument of
    urllib.urlretrieve if you want a percentage readout.
    @a: blocks transferred so far
    @b: size of each block
    @c: total size of the remote file
    '''
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print '%.2f%%' % per

local = 'd:\\mysite\\pic\\'
d = 0
mutex = threading.Lock()

class MyThread(threading.Thread):
    def __init__(self, url, name):
        threading.Thread.__init__(self)
        self.url = url
        self.name = name

    def run(self):
        # serialize the status output so lines do not interleave
        mutex.acquire()
        print 'down from %s' % self.url
        time.sleep(1)
        mutex.release()
        try:
            urllib.urlretrieve(self.url, self.name)
        except Exception, e:
            print e
            time.sleep(1)
            # retry once after a short pause
            urllib.urlretrieve(self.url, self.name)

threads = []
# The original iterated over urls[84:], presumably to resume a partial
# run, but with only 80 URLs that slice is empty, so iterate over all.
for u in urls:
    d += 1
    local = 'd:\\mysite\\pic\\%d\\' % d
    mkdir(local)
    print 'download begin...'
    for i in xrange(40):
        url = u + '%03d.jpg' % i
        lcal = local + '%03d.jpg' % i
        th = MyThread(url, lcal)
        threads.append(th)
        th.start()

# wait for every thread so the final message is accurate
for t in threads:
    t.join()
print 'over! download finished'
Here urls holds the image URLs; change them to suit your own target. xx.com is just a placeholder in the example.
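One caveat: the script above spawns a new thread for every single image, which can easily flood the target server or run you out of sockets. As a rough alternative (a sketch of the same idea, not part of the original script; the task list and the pool size of 8 are made-up placeholders), the downloads can be fed through a fixed pool of worker threads using the standard-library Queue module:

import threading
import urllib
from Queue import Queue, Empty

task_q = Queue()

def worker():
    # each worker pulls (url, filename) pairs until the queue is empty
    while True:
        try:
            url, name = task_q.get_nowait()
        except Empty:
            break
        try:
            urllib.urlretrieve(url, name)
        except Exception, e:
            print e

# placeholder tasks: build these the same way url/lcal are built above
tasks = [('http://xx.com/01/xz01/images/%03d.jpg' % i,
          'd:\\mysite\\pic\\1\\%03d.jpg' % i) for i in xrange(40)]
for t in tasks:
    task_q.put(t)

# a fixed pool of 8 workers instead of one thread per image
workers = [threading.Thread(target=worker) for n in xrange(8)]
for w in workers:
    w.start()
for w in workers:
    w.join()
print 'pool drained, downloads finished'

Because every task is enqueued before the workers start, each worker can simply exit when the queue runs dry; no sentinel values are needed.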
Next, a single-threaded download example, taking 暴走漫画 (Baozou Manhua) images as the target:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import os, urllib2, time, random

# create the output folder (just learned this yesterday)
path = os.getcwd()  # directory this script runs from
new_path = os.path.join(path, u'暴走漫画')
if not os.path.isdir(new_path):
    os.mkdir(new_path)

def page_loop(page=1):
    url = 'http://baozoumanhua.com/all/hot/page/%s?sv=1389537379' % page
    content = urllib2.urlopen(url)
    soup = BeautifulSoup(content)
    my_girl = soup.find_all('div', class_='img-wrap')
    for girl in my_girl:
        jokes = girl.find('img')
        link = jokes.get('src')
        flink = link
        print flink
        content2 = urllib2.urlopen(flink).read()
        # name the file after the tail of its URL; a timestamp plus a
        # random letter (picked that up on OSC) also works:
        # with open(u'暴走漫画' + '/' + time.strftime('%H-%M-%S') +
        #           random.choice('qwertyuiopasdfghjklzxcvbnm') + flink[-5:], 'wb') as code:
        with open(u'暴走漫画' + '/' + flink[-11:], 'wb') as code:
            code.write(content2)
    page = int(page) + 1
    print u'starting on the next page'
    print 'the %s page' % page
    page_loop(page)

page_loop()
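A final note on page_loop: it calls itself once per page, so a long crawl will eventually hit Python's recursion limit (about 1000 frames by default). If that matters, the same crawl can be written as a plain loop; the sketch below keeps the parsing logic identical, with an assumed max_pages cutoff that is not in the original:

from bs4 import BeautifulSoup
import urllib2

def page_loop_flat(start=1, max_pages=50):  # max_pages is an assumed cutoff
    for page in xrange(start, start + max_pages):
        url = 'http://baozoumanhua.com/all/hot/page/%s?sv=1389537379' % page
        try:
            soup = BeautifulSoup(urllib2.urlopen(url))
        except Exception, e:
            # stop at the first page that fails to load
            print e
            break
        for wrap in soup.find_all('div', class_='img-wrap'):
            img = wrap.find('img')
            if img is not None:
                print img.get('src')  # save the file here, as in page_loop
        print 'the %s page done' % page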