Python写的Web spider:
# web spider
# author vince 2015/7/29
#
# Minimal crawler: starting from `url`, download each page, collect the
# links found in its <a href="..."> attributes and queue them for fetching.
# Works on Python 2 (urllib2) and on Python 3 (urllib.request).
import re

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request provides the same Request/urlopen names.
    import urllib.request as urllib2

# get href content: captures the href value of an <a> tag when that value
# starts with "h" (presumably meant to select http/https links).
pattern = r'<a(?:\s+.+?)*?\s+href="([h]{1}[^"]*?)"'
_LINK_RE = re.compile(pattern)

# Sent with every request; see fecth() for why it is needed.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36')
MAX_LINES = 2000      # read at most this many lines of each page
TIMEOUT_SECONDS = 10  # give up on a host that does not answer in time

url = 'http://blog.csdn.net/'  # target site
t = set()        # collection of url still waiting to be fetched
visited = set()  # urls already fetched, so no page is crawled twice


def extract_links(line):
    """Return every href captured by ``pattern`` in one line of HTML.

    Args:
        line: one line of markup, as text or as undecoded bytes.

    Returns:
        A list of the captured href values, in order of appearance; empty
        when the line holds no matching link.
    """
    # Python 3 responses yield bytes; Python 2 responses yield str already.
    if isinstance(line, bytes) and not isinstance(line, str):
        line = line.decode('utf-8', 'replace')
    # findall reports all links on the line, not only the first one.
    return _LINK_RE.findall(line)


def fecth(url):
    """Download ``url`` and queue the links it contains in ``t``.

    The HTTP status code and each link found are printed. At most
    ``MAX_LINES`` lines of the response are scanned. Links that were
    already fetched (present in ``visited``) are not queued again.

    A failed request (HTTP error status, unreachable host, timeout, or a
    string that is not a valid URL) is reported on stdout and skipped
    instead of aborting the whole crawl.

    Args:
        url: address of the page to download.

    Returns:
        None.
    """
    try:
        http_request = urllib2.Request(url)
        # Without a browser-like User-Agent some sites answer 403.
        http_request.add_header('User-Agent', USER_AGENT)
        http_response = urllib2.urlopen(http_request, timeout=TIMEOUT_SECONDS)
    except (IOError, ValueError) as err:
        # IOError covers URLError/HTTPError and socket timeouts;
        # ValueError is raised for strings that are not URLs at all.
        print('failed to fetch %s: %s' % (url, err))
        return

    try:
        print(http_response.code)
        if http_response.code != 200:
            return
        for _ in range(MAX_LINES):
            line = http_response.readline()
            if not line:  # end of the response body
                break
            for href in extract_links(line):
                print(href)
                if href not in visited:
                    t.add(href)
    except IOError as err:
        # The connection can still drop or time out while reading.
        print('failed to read %s: %s' % (url, err))
    finally:
        http_response.close()


def main(start_url=url):
    """Crawl from ``start_url`` until no unvisited link is left.

    Args:
        start_url: first page to fetch; defaults to the module-level ``url``.
    """
    t.clear()
    visited.clear()
    t.add(start_url)
    while t:
        uu = t.pop()
        if uu in visited:
            continue
        visited.add(uu)
        print(uu)
        fecth(uu)


# main start
if __name__ == '__main__':
    main()
如果没有设置 User-Agent，有些网站会拒绝访问，并返回 403（Forbidden）。