python下载网站文件-阿里云开发者社区

开发者社区> 开发与运维> 正文

python下载网站文件

简介: 场景说明:1、定时从网站下载程序文件;2、定时清理文件,以免占用过多磁盘空间。程序功能:1、使用 urllib2、urllib 模块从网站抓取数据,并下载到指定路径;2、为避免重复下载,下载前先与本地已有的文件名做对比;3、使用两个线程,一个负责下载,另一个负责清理;4、每24小时执行一次。

场景说明:
1、定时从网站下载程序文件;
2、定时清理文件,以免占用过多磁盘空间;

程序功能:
1、使用 urllib2、urllib 模块从网站抓取数据,并下载到指定路径;
2、为避免重复下载,下载前先与本地已有的文件名做对比;
3、使用两个线程,一个负责下载,另一个负责清理;
4、每24小时执行一次。

import urllib2,urllib
import re
import os,sys
import time
import datetime
import threading

# Proxy account and address used when building the urllib2 opener.
# NOTE(review): the values look like placeholders -- replace before running.
proxy_info = {
    'user': 'user',
    'password': 'xxxxxx',
    'server': 'http://xxx:8080',
}
# Base URL of the index page; file names are appended to it for download.
url1 = "http://xxx.com/"
# Local directory that receives the downloads and is swept for old files.
path = r'x:\download'

# Condition shared by the download and cleanup threads to take turns.
con = threading.Condition()
def _list_remote_patches(url1):
    """Return the patch file names linked from the index page at ``url1``.

    Installs a process-wide urllib2 opener that goes through the proxy
    described by the module-level ``proxy_info``, so the later per-file
    ``urlopen`` calls use the same proxy.
    """
    passmgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    passmgr.add_password(None, proxy_info['server'],
                         proxy_info['user'], proxy_info['password'])
    auth = urllib2.ProxyBasicAuthHandler(passmgr)
    opener = urllib2.build_opener(
        urllib2.ProxyHandler({'http': proxy_info['server']}), auth)
    urllib2.install_opener(opener)

    index = urllib2.urlopen(url1)
    try:
        text = index.read()
    finally:
        # Close the index response even if reading it fails.
        index.close()
    print('connect to website successfully')
    # Raw string with an escaped dot: only "<digits>xdat.exe" links match.
    return re.findall(r'HREF="(\d+xdat\.exe)"', text, re.IGNORECASE)


def _fetch_patch(path, url1, filename):
    """Download ``url1 + filename`` into ``path`` and append a log line."""
    target = os.path.join(path, filename)
    # Write to a temporary name first: an interrupted transfer must not
    # leave a truncated file that the next pass would treat as downloaded.
    partial = target + '.part'
    remote = urllib2.urlopen(url1 + filename)
    try:
        with open(partial, 'wb') as out:
            while 1:
                data = remote.read(1024)
                if not data:
                    break
                out.write(data)
    finally:
        remote.close()
    os.rename(partial, target)

    message = '%s files have download!!!' % filename
    print(message)
    logdir = os.path.join(path, 'log')
    if not os.path.isdir(logdir):
        # A missing log directory should not fail a successful download.
        os.makedirs(logdir)
    with open(os.path.join(logdir, 'log.txt'), 'a') as log:
        log.write('%s %s\n' % (datetime.datetime.now(), message))


def downloadpatch(path,url1):
    """Thread body: download patch files that are missing from ``path``.

    Each pass reads the index page at ``url1``, collects the linked
    ``<digits>xdat.exe`` names, skips those already present in ``path``
    and downloads the rest, logging each one to ``<path>/log/log.txt``.
    After a pass the thread notifies the shared condition ``con``, waits
    to be notified back, then sleeps 24 hours before the next pass.

    Errors during a pass are printed and the loop continues; letting the
    exception escape would end this thread while it holds ``con`` and
    leave the peer thread blocked forever.

    NOTE(review): the sleep happens while ``con`` is held and the peer
    ``deletepatch`` thread follows the same notify/wait/sleep sequence, so
    the two 24h sleeps run back to back and each job appears to repeat
    about every 48h -- confirm the intended cadence.

    Args:
        path: local directory that receives the downloaded files.
        url1: base URL of the index page; file names are appended to it.
    """
    with con:
        while 1:
            print('               start thread of downloadpatch')
            print('present time is:  %s' % datetime.datetime.now())
            try:
                name = _list_remote_patches(url1)
                print('the following files are all patchs in the website: ')
                print(name)

                # Build the to-do list instead of removing from ``name``
                # while iterating it, which skipped entries; also drops
                # duplicate links so a file is fetched only once.
                existing = set(os.listdir(path))
                pending = []
                for fname in name:
                    if fname not in existing and fname not in pending:
                        pending.append(fname)

                if pending:
                    print('the following files are need to download:')
                    print(pending)
                    print('please wait......')
                    for fname in pending:
                        _fetch_patch(path, url1, fname)
                else:
                    print('no files have to download')
            except Exception as e:
                # Thread-loop boundary: report and keep the hand-over alive.
                print('download pass failed: %s' % e)
            print('--------------------------------------------')
            con.notify()

            con.wait()

            time.sleep(24*60*60)

def _sweep_old_files(yourpath, max_age_days):
    """Delete regular files in ``yourpath`` that are ``max_age_days`` old.

    Age is taken from ``os.stat(...).st_ctime``.
    NOTE(review): st_ctime is the creation time on Windows but the last
    metadata change on Unix -- the backslash paths suggest Windows; confirm.
    """
    now = datetime.datetime.now()
    for entry in os.listdir(yourpath):
        source = os.path.join(yourpath, entry)
        if not os.path.isfile(source):
            # Sub-directories (for example the log directory) are skipped.
            print('File %s is not execute program' % source)
            continue

        created = datetime.datetime.fromtimestamp(os.stat(source).st_ctime)
        if (now - created).days < max_age_days:
            print('File %s is now useful for us' % source)
            continue

        try:
            os.remove(source)
        except OSError as e:
            # One file that cannot be removed must not abort the sweep.
            print('File %s could not be deleted: %s' % (source, e))
            continue

        message = 'File %s have been deleted' % source
        print(message)
        logdir = os.path.join(yourpath, 'log')
        if not os.path.isdir(logdir):
            os.makedirs(logdir)
        with open(os.path.join(logdir, 'log.txt'), 'a') as log:
            # write() takes a single string; the original passed three
            # arguments and raised TypeError after the first deletion.
            log.write('%s %s\n' % (datetime.datetime.now(), message))


def deletepatch(yourpath, max_age_days=7):
    """Thread body: remove old files from ``yourpath`` once per cycle.

    Each pass deletes every regular file directly inside ``yourpath``
    whose age is at least ``max_age_days`` days and logs the deletion to
    ``<yourpath>/log/log.txt`` (the original used the module-level
    ``path`` for the log location; the visible caller passes that same
    value). After a pass the thread notifies the shared condition
    ``con``, waits to be notified back, then sleeps 24 hours.

    Errors during a pass are printed and the loop continues; letting the
    exception escape would end this thread while it holds ``con`` and
    leave the peer thread blocked forever.

    NOTE(review): the sleep happens while ``con`` is held and the peer
    ``downloadpatch`` thread follows the same notify/wait/sleep sequence,
    so the two 24h sleeps run back to back and each job appears to repeat
    about every 48h -- confirm the intended cadence.

    Args:
        yourpath: directory to sweep.
        max_age_days: minimum age in whole days before a file is removed.
    """
    with con:
        while 1:
            print('               starting thread of delete files')
            print('present time is  : %s' % datetime.datetime.now())
            try:
                _sweep_old_files(yourpath, max_age_days)
            except Exception as e:
                # Thread-loop boundary: report and keep the hand-over alive.
                print('delete pass failed: %s' % e)
            print('--------------------------------------------')
            con.notify()
            con.wait()

            time.sleep(24*60*60)
        
if __name__=='__main__':
    # Start the download worker first, then the cleanup worker; both run
    # forever. Only errors raised while creating or starting the threads
    # reach this handler.
    try:
        for job, job_args in ((downloadpatch, (path, url1)),
                              (deletepatch, (path,))):
            worker = threading.Thread(target=job, args=job_args)
            worker.start()
    except Exception as e:
        print(e)

版权声明:本文内容由阿里云实名注册用户自发贡献,版权归原作者所有,阿里云开发者社区不拥有其著作权,亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容,填写侵权投诉表单进行举报,一经查实,本社区将立刻删除涉嫌侵权内容。

分享:
开发与运维
使用钉钉扫一扫加入圈子
+ 订阅

集结各类场景实战经验,助你开发运维畅行无忧

其他文章