# Crawler that scrapes campus-beauty photos from http://www.xiaohuar.com/
# -*- coding:utf-8 -*-
import os

import requests
# from PIL import Image
from lxml import etree


class Spider(object):
    """Crawl image files from xiaohuar.com, following "next page" links.

    Downloads every image on the /hua/ listing pages into the current
    working directory as 0.jpg, 1.jpg, ... in crawl order.
    """

    def __init__(self):
        # Sequential counter used to name downloaded files (<index>.jpg).
        self.index = 0
        # Site root; image srcs in the listing are site-relative paths.
        self.url = "http://www.xiaohuar.com"
        # NOTE(review): hard-coded intranet proxy — adjust or set to None
        # when running outside that network.
        self.proxies = {"http": "http://172.17.18.80:8080",
                        "https": "https://172.17.18.80:8080"}

    def download_image(self, image_url):
        """Download one image (site-relative *image_url*) to <index>.jpg."""
        real_url = self.url + image_url
        print("downloading the {0} image".format(self.index))
        with open("{0}.jpg".format(self.index), 'wb') as f:
            f.write(requests.get(real_url, proxies=self.proxies).content)
        # Bump the counter only after the file was written successfully.
        self.index += 1

    def start_crawl(self):
        """Crawl listing pages starting at /hua/ and download all images."""
        start_url = "http://www.xiaohuar.com/hua/"
        r = requests.get(start_url, proxies=self.proxies)
        if r.status_code != 200:
            print("response status code : {0}".format(r.status_code))
            return
        # Pages are GBK-encoded; decode explicitly before parsing.
        html = etree.HTML(r.content.decode("gbk"))
        for link in html.xpath('//div[@class="item_t"]//img/@src'):
            self.download_image(link)
        # Absolute href of the "next page" link; [] when on the last page.
        next_page_url = html.xpath(
            u'//div[@class="page_num"]//a[contains(text(),"下一页")]/@href')
        page_num = 2
        while next_page_url:
            print("download {0} page images".format(page_num))
            r_next = requests.get(next_page_url[0], proxies=self.proxies)
            if r_next.status_code != 200:
                # BUG FIX: the original kept the same next_page_url on a
                # non-200 response and re-requested the failing page forever.
                print("response status code : {0}".format(r_next.status_code))
                break
            html = etree.HTML(r_next.content.decode("gbk"))
            for link in html.xpath('//div[@class="item_t"]//img/@src'):
                self.download_image(link)
            # xpath() returns [] (never raises) when the link is absent,
            # which cleanly terminates the loop — no broad except needed.
            next_page_url = html.xpath(
                u'//div[@class="page_num"]//a[contains(text(),"下一页")]/@href')
            page_num += 1


if __name__ == "__main__":
    t = Spider()
    t.start_crawl()
    input("press any key to continue")