# Spider that scrapes gallery photos from http://www.xiaohuar.com/
# -*- coding:utf-8 -*-
import os
import requests
# from PIL import Image
from lxml import etree
class Spider(object):
    """Crawl image gallery pages on xiaohuar.com and save every thumbnail
    as a sequentially numbered JPEG in the current directory."""

    def __init__(self):
        # Sequential counter used to name the saved files ("0.jpg", "1.jpg", ...).
        self.index = 0
        # Site root; the <img src> attributes on listing pages are site-relative.
        self.url = "http://www.xiaohuar.com"
        # Outbound HTTP(S) proxy used for every request -- adjust for your network.
        self.proxies = {"http": "http://172.17.18.80:8080", "https": "https://172.17.18.80:8080"}

    def download_image(self, image_url):
        """Fetch one image (site-relative *image_url*) and write it to '<index>.jpg'.

        The bytes are fetched before the file is opened, so a failed request
        no longer leaves an empty numbered file on disk.
        """
        real_url = self.url + image_url
        print("downloading the {0} image".format(self.index))
        # Fetch first; only touch the filesystem once we actually have data.
        data = requests.get(real_url, proxies=self.proxies).content
        with open("{0}.jpg".format(self.index), 'wb') as f:
            f.write(data)
        self.index += 1

    def start_crawl(self):
        """Walk the /hua/ listing pages, downloading every gallery thumbnail.

        Follows the "next page" link (anchor text u'下一页') until it is absent
        or a page request fails.
        """
        start_url = "http://www.xiaohuar.com/hua/"
        r = requests.get(start_url, proxies=self.proxies)
        if r.status_code != 200:
            print("response status code : {0}".format(r.status_code))
            return
        # Pages are served GBK-encoded; decode before handing to lxml.
        html = etree.HTML(r.content.decode("gbk"))
        # Explicit loop instead of map(): map() is lazy on Python 3 and would
        # silently download nothing.
        for src in html.xpath('//div[@class="item_t"]//img/@src'):
            self.download_image(src)
        next_page_url = html.xpath(u'//div[@class="page_num"]//a[contains(text(),"下一页")]/@href')
        page_num = 2
        while next_page_url:
            print("download {0} page images".format(page_num))
            r_next = requests.get(next_page_url[0], proxies=self.proxies)
            if r_next.status_code != 200:
                # BUG FIX: the original kept the stale link here and looped forever.
                print("response status code : {0}".format(r_next.status_code))
                break
            html = etree.HTML(r_next.content.decode("gbk"))
            for src in html.xpath('//div[@class="item_t"]//img/@src'):
                self.download_image(src)
            # xpath() returns [] when there is no next-page anchor, ending the loop.
            next_page_url = html.xpath(u'//div[@class="page_num"]//a[contains(text(),"下一页")]/@href')
            page_num += 1
if __name__ == "__main__":
    # Run the crawl, then keep the console window open until the user hits Enter.
    spider = Spider()
    spider.start_crawl()
    try:
        # raw_input only exists on Python 2; fall back to input() on Python 3
        # (py3's input() is py2's raw_input -- it does not evaluate the text).
        raw_input("press any key to continue")
    except NameError:
        input("press any key to continue")