Because of my job, I have written quite a few programs that scrape information from websites. In the simplest case, Python's urllib2.urlopen() is all you need. Then there was a site that liked to ban visitors, so I had to collect a batch of proxies and rotate through them while scraping it. Some sites refuse requests that come from programs, so you have to add some request headers. Some sites require a login, and that is where cookies come in. Finally, to improve throughput, it is best to use multiple threads. (One caveat: urlopen() works through a single global opener object, so if you run several threads, each with its own proxy, you cannot use urlopen(); build a separate opener per thread and call opener.open() instead.)
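To make that caveat concrete, here is a minimal sketch, with a made-up proxy address and URL, of how each thread can build its own opener carrying its own proxy, headers, and cookie jar (make_opener is just an illustrative helper, not part of the script below):

# A minimal sketch of the per-thread opener idea described above.
# The proxy address and target URL are placeholders, not real endpoints.
import urllib2, cookielib

def make_opener(proxy):                        # proxy is 'host:port', e.g. '1.2.3.4:8080'
    cookies=cookielib.CookieJar()              # keeps the session cookies after a login
    opener=urllib2.build_opener(
        urllib2.ProxyHandler({'http': 'http://'+proxy}),
        urllib2.HTTPCookieProcessor(cookies))
    # browser-like headers help avoid 403 errors
    opener.addheaders=[('User-Agent','Mozilla/4.0 (compatible; MSIE 6.0)')]
    return opener

# each thread builds its own opener and calls opener.open(),
# instead of sharing the global opener behind urlopen()
opener=make_opener('1.2.3.4:8080')             # placeholder proxy
data=opener.open('http://www.example.com/').read()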
Below is a proxy-scraping script I wrote in Python. I am no longer inside the education network, but a proxy still comes in handy now and then :)

# -*- coding: cp936 -*-
import urllib2,re,thread,time
import socket
socket.setdefaulttimeout(10)

#----------------------- proxy-scraping functions -----------------------#

def getcnproxy(name):
    pagenum=0
    result=[]
    getallpages=0
    trycount=0
    while getallpages==0 and trycount<=6:
        pagenum=pagenum+1
        url='http://www.proxycn.com/html_proxy/http-'+str(pagenum)+'.html'
        try:
            html=urllib2.urlopen(url)
            for line in html:
                if '''onDblClick="clip''' in line:
                    # the proxy address sits inside clip('...')
                    proxy=line[line.find("clip('")+6:line.find("')")]
                    lock.acquire()
                    print name,proxy
                    lock.release()
                    result.append(proxy)
                if '下一页|尾页' in line:   # "next page|last page" marks the final page
                    getallpages=1
        except:
            trycount=trycount+1
            pagenum=pagenum-1
    proxylist[0]=result
    return result

def getproxycn(name):
    pagenum=0
    result=[]
    trycount=0
    while pagenum<=9 and trycount<=2:
        pagenum=pagenum+1
        url='http://www.cnproxy.com/proxy'+str(pagenum)+'.html'
        try:
            html=urllib2.urlopen(url)
            for line in html:
                if "HTTP" in line:
                    # splice the address (before the obfuscated port marker) and the ':port' part
                    proxy=line[line.find('<td>')+4:line.find('̴')]+line[line.find(':'):line.find('</td><td>')]
                    lock.acquire()
                    print name,proxy
                    lock.release()
                    result.append(proxy)
        except:
            trycount=trycount+1
            pagenum=pagenum-1
    proxylist[1]=result
    return result

#----------------------- end of proxy-scraping functions -----------------------#
#----------------------- proxy-validation functions -----------------------#

def proxycheckone(proxy):
    url='http://www.facebook.com'
    proxy_url='http://'+proxy
    proxy_support=urllib2.ProxyHandler({'http': proxy_url})
    opener=urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    r=urllib2.Request(url)
    r.add_header("Accept-Language","zh-cn")   # add headers to avoid 403 errors
    r.add_header("Content-Type","text/html; charset=gb2312")
    r.add_header("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)")
    trycount=1
    while trycount<=2:
        try:
            T0=time.time()
            f=opener.open(r)
            data=f.read()
            if 'Welcome to Facebook!' in data:
                T=time.time()-T0   # response time of this proxy
                break
            else:
                return []
        except:
            time.sleep(3)
            trycount=trycount+1
    if trycount>2:
        return []
    else:
        return proxy+'$'+str(trycount)+'#'+str(T)

def proxycheck(idnum):
    while 1:
        r.acquire()
        try:
            i=proxylist[0]
            del proxylist[0]
            r.release()
        except:
            # the list is empty: mark this thread as finished
            r.release()
            x[idnum]=1
            break
        b=proxycheckone(i)
        if len(b)>0:
            a.acquire()
            y.append(b)
            a.release()

#----------------------- end of proxy-validation functions -----------------------#
#------------- scrape the proxies and save them to proxies.txt, one per line -------------#

#x='''
lock=thread.allocate_lock()
proxylist=[[],[]]
thread.start_new(getcnproxy,('cnproxy',))
thread.start_new(getproxycn,('proxycn',))
while [] in proxylist:   # wait until both scraping threads have filled their slot
    time.sleep(30)
proxylist=proxylist[0]+proxylist[1]
w=open('proxies.txt','a')
w.write('\n'.join(proxylist))
w.close()
del proxylist
print 'got all proxies!\n\n'
#'''

#------------- scraping done; the proxies are in proxies.txt, one per line -------------#
#----------------------- validate the proxies -----------------------#

w=open('proxies.txt')
proxylist=list(set((re.sub(r'(\t+[^\n]*\n|\n)',',',w.read())).split(',')))
while '' in proxylist:
    del proxylist[proxylist.index('')]
w.close()
lock=thread.allocate_lock()
r=thread.allocate_lock()
a=thread.allocate_lock()
y=[]
x=[0]*120
for idnum in range(0,120):
    thread.start_new(proxycheck,(idnum,))
while 0 in x:   # wait until all 120 checker threads report done
    print len(proxylist),sum(x),"left",len(y)
    time.sleep(10)
w=open('proxies.txt','w')
w.write(re.sub('^\n','',re.sub(r'\n+','\n','\n'.join(y)+'\n')))
w.close()

#----------------------- validation done -----------------------#
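Each proxy that survives validation is written back in the form address$tries#seconds, so proxies.txt also records how fast every proxy answered. As a small usage sketch (assuming the proxies.txt format produced above; fastest_proxies is a hypothetical helper, not part of the script), the timings can be used to pick the quickest proxies first:

# A sketch of ranking validated proxies by their recorded response time.
# Assumes proxies.txt holds 'address$tries#seconds' lines as written above;
# fastest_proxies is a hypothetical helper, not part of the original script.
def fastest_proxies(filename='proxies.txt',n=10):
    entries=[]
    for line in open(filename):
        line=line.strip()
        if '#' not in line:
            continue
        addr=line.split('$')[0]                  # the host:port part
        seconds=float(line.split('#')[-1])       # the measured response time
        entries.append((seconds,addr))
    entries.sort()                               # fastest first
    return [addr for seconds,addr in entries[:n]]

print fastest_proxies()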