Introduction: The Challenge of Crawling Dynamic Pages and a Way Through
In today's web, more and more sites rely on JavaScript to render their content, and traditional static crawlers are helpless against such pages. Sogou Image Search is a typical example: its waterfall-style image loading, scroll-driven page design, and AJAX requests make the usual Requests + BeautifulSoup combination ineffective.
To address this, this article explains how to use Splash, a capable JavaScript rendering service, together with the Scrapy framework to crawl Sogou's dynamic image pages efficiently, covering everything from the underlying mechanics to working code.
Technical Architecture
How Splash Works
Splash is a lightweight browser with an HTTP API, designed specifically for rendering web pages. It is built on the WebKit engine and supports JavaScript execution, page rendering, and screenshots. Compared with Selenium, Splash has the following advantages (a minimal API call is sketched after the list):
- Headless by design: no GUI needed, well suited to server deployment
- Asynchronous processing: multiple pages can be rendered in parallel
- Low memory footprint: consumes fewer resources than a full browser
- Lua scripting: complex interaction logic can be expressed in scripts
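To make the HTTP API concrete, here is a minimal sketch that asks a local Splash instance (assumed to be listening on the default port 8050, as set up below) to render a page and return the post-JavaScript HTML; render.html, url, and wait are documented Splash parameters:

import requests

# Ask Splash to render the page and hand back the HTML after JavaScript has run.
resp = requests.get(
    'http://localhost:8050/render.html',
    params={'url': 'https://pic.sogou.com/', 'wait': 2},
    timeout=30,
)
print(resp.status_code, len(resp.text))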
Characteristics of Sogou Image Search Pages
Sogou Image Search pages have the following technical traits:
● Waterfall layout, with scrolling triggering further loads
● Image URLs generated dynamically by JavaScript
● Anti-crawling measures: per-IP rate limiting and request-header validation
● Asynchronous data loading: image data is fetched via AJAX requests
Environment Setup and Configuration - Installing Splash with Docker
Pull the Splash image
docker pull scrapinghub/splash
Run the Splash container (raising --max-timeout so the timeout=90 used later is accepted; Splash caps render timeouts at 60 seconds by default)
docker run -p 8050:8050 scrapinghub/splash --max-timeout 90
Verify the installation
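If the container started correctly, opening http://localhost:8050 in a browser shows Splash's test UI, and the render.html endpoint returns rendered HTML (example.com is just a placeholder target):

curl 'http://localhost:8050/render.html?url=https://example.com&timeout=10'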
- Python Environment Setup
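Install the Python dependencies used in this article: scrapy-splash bridges Scrapy and the Splash HTTP API, while Pillow and requests are used later by the image pipeline:

pip install scrapy scrapy-splash pillow requests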
- Creating and Configuring the Scrapy Project
Create the Scrapy project
scrapy startproject sogou_image_crawler
cd sogou_image_crawler
Edit the settings.py configuration
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
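Beyond the scrapy-splash wiring above, a few throttling settings in settings.py help stay under Sogou's rate limits; the numbers below are illustrative starting points, not tuned values:

DOWNLOAD_DELAY = 1                   # seconds between requests
CONCURRENT_REQUESTS = 8
CONCURRENT_REQUESTS_PER_DOMAIN = 4
RETRY_TIMES = 3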
Core Spider Implementation
- Basic spider class design
import scrapy
from scrapy_splash import SplashRequest
import json
import time
from urllib.parse import quote

class SogouImageSpider(scrapy.Spider):
    name = 'sogou_image'
    allowed_domains = ['pic.sogou.com']

    def __init__(self, keyword='风景', max_pages=10, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.keyword = keyword
        # Arguments passed via -a arrive as strings, so coerce explicitly
        self.max_pages = int(max_pages)
        self.base_url = f'https://pic.sogou.com/pics?query={quote(keyword)}'

    def start_requests(self):
        """Generate the initial paginated requests."""
        for page in range(1, self.max_pages + 1):
            # The first page starts at offset 0
            url = f'{self.base_url}&start={(page - 1) * 48}'
            yield SplashRequest(
                url,
                self.parse_image_page,
                args={
                    'wait': 2,
                    'timeout': 90,
                    'images': 0,  # skip image downloads while rendering
                    'resource_timeout': 10
                },
                meta={'page': page}
            )

    def parse_image_page(self, response):
        """Re-request the list page through the 'execute' endpoint, using Lua to extract image data."""
        page = response.meta['page']
        self.logger.info(f'Crawling page {page}')
        # Lua script: navigate, collect image URLs, scroll, collect again
        script = """
        function main(splash, args)
            -- The execute endpoint does not navigate by itself
            assert(splash:go(args.url))
            assert(splash:wait(2))

            local urls = {}
            local function collect()
                local elements = splash:select_all('.img-box a')
                for _, element in ipairs(elements) do
                    local style = element.node:getAttribute('style')
                    if style then
                        local url_match = style:match('url%((.-)%)')
                        if url_match then
                            table.insert(urls, {
                                url = url_match,
                                title = element.node:getAttribute('title') or ''
                            })
                        end
                    end
                end
            end

            collect()
            -- Scroll to trigger lazy loading, then collect the newly loaded images
            splash:runjs("window.scrollTo(0, document.body.scrollHeight);")
            splash:wait(1.5)
            collect()

            return {
                urls = urls,
                html = splash:html()
            }
        end
        """
        yield SplashRequest(
            response.url,
            self.extract_images,
            endpoint='execute',
            args={
                'lua_source': script,
                'timeout': 90
            },
            meta={'page': page}
        )

    def extract_images(self, response):
        """Turn the Lua result into items."""
        try:
            data = json.loads(response.text)
            images = data.get('urls', [])
            seen = set()  # the script collects before and after scrolling, so deduplicate
            for img in images:
                url = img.get('url')
                if not url or url in seen:
                    continue
                seen.add(url)
                item = {
                    'page': response.meta['page'],
                    'image_url': url,
                    'title': img['title'],
                    'keyword': self.keyword,
                    'crawl_time': time.strftime('%Y-%m-%d %H:%M:%S'),
                    'referer': response.url
                }
                # Visit the detail page to pick up the high-resolution version
                if 'thumb' in url:
                    hd_url = url.replace('thumb', 'hd')
                    yield SplashRequest(
                        hd_url,
                        self.parse_hd_image,
                        meta={'item': item},
                        args={'wait': 1}
                    )
                else:
                    yield item
        except json.JSONDecodeError as e:
            self.logger.error(f'JSON parsing error: {e}')

    def parse_hd_image(self, response):
        """Record the rendered high-resolution URL and yield the item."""
        item = response.meta['item']
        item['hd_image_url'] = response.url
        yield item
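With the spider in place, it can be run from the project root; Scrapy's -a flag feeds the constructor arguments defined above, and -o writes the scraped items to a file:

scrapy crawl sogou_image -a keyword=风景 -a max_pages=5 -o images.json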
Advanced Lua Script Optimization
-- advanced_image_extractor.lua
function main(splash, args)
    -- Set the user agent
    splash:set_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
    -- Visit the page
    assert(splash:go(args.url))
    assert(splash:wait(args.wait or 2))
    -- Result table
    local results = {images = {}, page_info = {}}
    -- Scroll several times to trigger lazy loading
    local scroll_attempts = args.scroll_attempts or 3
    for i = 1, scroll_attempts do
        -- Extract the images currently visible in the DOM
        local js_code = [[
            function extractImages() {
                var images = [];
                var items = document.querySelectorAll('.img-box, .pic-item');
                items.forEach(function(item) {
                    var img = item.querySelector('img');
                    var link = item.querySelector('a');
                    if (img && img.src) {
                        var imageInfo = {
                            src: img.src,
                            alt: img.alt || '',
                            width: img.naturalWidth,
                            height: img.naturalHeight,
                            data_src: img.getAttribute('data-src') || ''
                        };
                        if (link) {
                            imageInfo.link = link.href;
                            imageInfo.title = link.title || link.getAttribute('data-title') || '';
                        }
                        images.push(imageInfo);
                    }
                });
                return images;
            }
            extractImages();  // evaljs returns the value of the last statement
        ]]
        local current_images = splash:evaljs(js_code)
        -- Append to the results
        for _, img in ipairs(current_images) do
            table.insert(results.images, img)
        end
        -- Scroll before the next pass
        if i < scroll_attempts then
            splash:runjs("window.scrollBy(0, window.innerHeight * 1.5);")
            assert(splash:wait(1.5))
        end
    end
    -- Page-level information
    results.page_info = {
        url = splash:url(),
        title = splash:evaljs("document.title"),
        image_count = #results.images,
        scroll_height = splash:evaljs("document.body.scrollHeight")
    }
    return results
end
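One way to wire this script into the spider (a sketch; the file name and argument values are illustrative) is to load it once and send it through scrapy-splash's execute endpoint. SplashRequest adds the request URL to args automatically, which the script reads as args.url:

from scrapy_splash import SplashRequest

# Load the Lua script once; extra args entries are visible to it as args.*
with open('advanced_image_extractor.lua', encoding='utf-8') as f:
    LUA_EXTRACTOR = f.read()

def build_extractor_request(url, callback):
    return SplashRequest(
        url,
        callback,
        endpoint='execute',
        args={'lua_source': LUA_EXTRACTOR, 'wait': 2, 'scroll_attempts': 3, 'timeout': 90},
    )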
- Image Pipeline Processing
pipelines.py
import os
import hashlib
from io import BytesIO

import requests
from PIL import Image

class SogouImagePipeline:
    def __init__(self, storage_path='./images'):
        self.storage_path = storage_path
        if not os.path.exists(storage_path):
            os.makedirs(storage_path)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            storage_path=crawler.settings.get('IMAGES_STORE', './images')
        )

    def process_item(self, item, spider):
        """Download, validate, and save the image."""
        try:
            # Derive a unique file name from a hash of the URL
            img_url = item['image_url']
            file_hash = hashlib.md5(img_url.encode()).hexdigest()[:8]
            file_name = f"{item['keyword']}_{file_hash}.jpg"
            file_path = os.path.join(self.storage_path, file_name)
            # Download the image
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Referer': item.get('referer', 'https://pic.sogou.com/')
            }
            response = requests.get(img_url, headers=headers, timeout=10)
            response.raise_for_status()
            # Validate the image data
            try:
                img = Image.open(BytesIO(response.content))
                img.verify()  # integrity check; format and mode remain readable afterwards
                # Save the raw bytes
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                # Enrich the item with file metadata
                item['file_name'] = file_name
                item['file_path'] = file_path
                item['file_size'] = len(response.content)
                item['image_format'] = img.format
                item['image_mode'] = img.mode
                item['download_status'] = 'success'
                spider.logger.info(f"Saved image: {file_name}")
            except Exception as e:
                spider.logger.error(f"Invalid image data: {e}")
                item['download_status'] = 'failed'
        except requests.RequestException as e:
            spider.logger.error(f"Download failed: {e}")
            item['download_status'] = 'failed'
        return item
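For the pipeline to run, it must be registered in settings.py; the priority 300 is an arbitrary mid-range value, and IMAGES_STORE is the setting its from_crawler reads:

ITEM_PIPELINES = {
    'sogou_image_crawler.pipelines.SogouImagePipeline': 300,
}
IMAGES_STORE = './images'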
- Integrating Anti-Crawling Countermeasures
middlewares.py
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RotateUserAgentMiddleware(UserAgentMiddleware):
    """Rotating User-Agent middleware with browser-like default headers."""
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)
        request.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        request.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,en;q=0.8'
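The middleware likewise only takes effect once registered; extend the DOWNLOADER_MIDDLEWARES dict configured earlier (400 is a conventional slot, not a requirement). One caveat: with the execute endpoint, headers set here are not automatically applied to the rendered page, so the page-level user agent remains the one set in Lua via splash:set_user_agent:

DOWNLOADER_MIDDLEWARES = {
    'sogou_image_crawler.middlewares.RotateUserAgentMiddleware': 400,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}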
Summary
This article has detailed a Splash-based approach to crawling Sogou's dynamically rendered image pages. Splash's JavaScript rendering capability lets us handle the dynamic content loading typical of modern web applications. The key technical points were:
- Deploying and configuring the Splash service
- Techniques for writing Lua interaction scripts
- Integrating Splash with the Scrapy framework
- Countermeasures against anti-crawling mechanisms
- Processing and storing the downloaded images
The approach is not specific to Sogou Images; it generalizes to any site that renders content with JavaScript. In practice, adjust the crawl strategy to your needs, and always comply with applicable laws and the target site's terms of use.