在数据采集领域,流媒体平台的元数据(如封面、简介、评分、播放量、评论数等)一直是个高频需求。
稍微动手跑过脚本的同学都知道,这块的骨头非常难啃。这些全球化平台普遍部署了极其严格的反爬机制,包括 IP 频率限制、请求头检测、TLS指纹校验等。今天就和大家分享一下,我是如何利用 Python 配合爬虫代理,构建一套开箱即用的采集方案。
为了方便各位同行直接 Copy 使用,下面我会放出核心的配置代码和实战 Demo。
一、 筑基:动态代理池配置
面对动辄封IP的流媒体平台,免费代理根本没法看。我在实战中主要使用的是基于隧道技术的亿牛云爬虫代理。 使用爬虫代理前,需要在其控制台获取 API 接口(如 http://v.16yun.cn/bills )和密钥对,并在控制台绑定本机白名单 IP。 在代码实现上,核心逻辑是 自动轮换 和 异常重试 。遇到 403 封禁或 429 频率限制时,强制提取新的代理并更新 Session。核心配置类 YiniuProxyConfig 的源码参考如下:
import requests
class YiniuProxyConfig:
    """Yiniu Cloud crawler-proxy configuration.

    Pulls a fresh proxy endpoint from the Yiniu extraction API for each
    outgoing request and retries automatically on bans (403/429), proxy
    errors, and timeouts.
    """

    def __init__(self, api_url: str, username: str, password: str):
        self.api_url = api_url.rstrip('/')
        self.username = username
        self.password = password
        self.session = requests.Session()
        # Last raw "ip:port" handed out by the API (exposed for logging).
        self._current_proxy = None

    def get_proxy(self) -> dict:
        """Fetch one proxy from the Yiniu API; each call yields a new IP.

        Returns:
            A dict suitable for ``requests``' ``proxies=`` argument, with
            username/password authentication embedded in the proxy URL.

        Raises:
            RuntimeError: if the extraction request fails or returns empty.
        """
        try:
            # Yiniu tunnel-proxy extraction endpoint format.
            proxy_url = f"{self.api_url}?num=1&time=1&seq=1&quality=1&type=https"
            resp = requests.get(proxy_url, timeout=10)
            if resp.status_code == 200 and resp.text.strip():
                proxy = resp.text.strip()  # format: "ip:port"
                self._current_proxy = proxy
                # Embed credential auth into the proxy URL for both schemes.
                auth_url = f'http://{self.username}:{self.password}@{proxy}'
                return {'http': auth_url, 'https': auth_url}
            raise RuntimeError(f"代理提取失败: {resp.status_code} - {resp.text}")
        except Exception as e:
            raise RuntimeError(f"代理获取异常: {e}")

    def rotate_proxy(self) -> dict:
        """Force a proxy rotation (call when a request is blocked)."""
        return self.get_proxy()

    def make_request(self, method: str, url: str, **kwargs) -> requests.Response:
        """Send a request through a proxy, rotating automatically on failure.

        Each retry iteration fetches a brand-new proxy via ``get_proxy()``,
        so a banned proxy is implicitly rotated away. (Fix: the old code
        additionally called ``rotate_proxy()`` before ``continue``, wasting
        one extra extraction-API call per failed attempt.)

        Raises:
            RuntimeError: after ``max_retries`` failed attempts, carrying
            the last observed error (fix: rate-limit failures previously
            left ``last_error`` as None, producing an uninformative message).
        """
        max_retries = 3
        last_error = None
        for attempt in range(max_retries):
            try:
                kwargs['proxies'] = self.get_proxy()
                kwargs['timeout'] = kwargs.get('timeout', 30)
                response = self.session.request(method, url, **kwargs)
                # Detect ban / rate limit; retry with a fresh proxy.
                if response.status_code in (403, 429):
                    last_error = RuntimeError(f"HTTP {response.status_code}")
                    continue
                return response
            except requests.exceptions.ProxyError as e:
                last_error = e
                continue
            except requests.exceptions.Timeout as e:
                last_error = e
                continue
        raise RuntimeError(f"请求失败,已重试{max_retries}次: {last_error}")
二、 伪装:死磕浏览器指纹检测
现在的检测很聪明,不仅仅看 User-Agent,我们需要生成一套随机但完全合法的请求头。针对 Chromium 内核,必须带上对应的请求头,以避免请求特征被序列化识别。浏览器指纹生成器源码参考:
import random
class BrowserFingerprint:
    """Generates randomized but internally consistent Chromium request headers."""

    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        # ... additional mainstream UAs omitted
    ]
    LANGUAGES = ["zh-CN,zh;q=0.9,en;q=0.8", "en-US,en;q=0.9,zh-CN;q=0.8"]

    @classmethod
    def random_headers(cls) -> dict:
        """Return a randomized, legitimate-looking header set for one request."""
        # Pick the variable pieces first, then assemble the full header dict.
        ua = random.choice(cls.USER_AGENTS)
        language = random.choice(cls.LANGUAGES)
        client_hint = random.choice([
            '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            '"Chromium";v="120", "Not-A.Brand";v="8"',
        ])
        cache_mode = random.choice(["max-age=0", "no-cache", ""])
        return {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": language,
            "Accept-Encoding": "gzip, deflate, br",
            "Sec-Ch-Ua": client_hint,
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "Cache-Control": cache_mode,
        }
三、 实战:B站视频元数据采集 Demo
将代理配置和伪装头结合,我们可以轻松搞定目标平台。这里以 B 站为例,直接请求其 web-interface/view API 接口获取数据。完整可运行的采集示例:
import json
import time
from datetime import datetime
class BilibiliAdapter:
    """Scraper adapter for Bilibili video metadata."""

    # Public JSON API returning per-video metadata.
    BASE_URL = "https://api.bilibili.com/x/web-interface/view"

    def __init__(self, proxy_config: YiniuProxyConfig):
        self.proxy = proxy_config
        self.fingerprint = BrowserFingerprint()

    def fetch(self, avid: str) -> dict:
        """Run one collection pass for the given avid.

        Returns a flat metadata dict on success, or an empty dict when the
        HTTP status is not 200 or the API payload reports a non-zero code.
        """
        # Route through the proxy wrapper built earlier, with spoofed headers.
        response = self.proxy.make_request(
            "GET",
            f"{self.BASE_URL}?aid={avid}",
            headers=self.fingerprint.random_headers(),
        )
        if response.status_code != 200:
            return {}
        payload = response.json()
        if payload['code'] != 0:
            return {}
        info = payload['data']
        stat = info.get('stat', {})
        return {
            "title": info.get('title', ''),
            "platform": "bilibili",
            "view_count": stat.get('view', 0),
            "like_count": stat.get('like', 0),
            "scraped_at": datetime.now().isoformat(),
            "proxy_used": self.proxy._current_proxy,
        }
def main():
    """Demo entry point: collect metadata for a couple of sample videos."""
    # 1. Configure the Yiniu proxy (replace with your own credentials).
    config = YiniuProxyConfig(
        api_url="http://proxy.16yun.cn:9001",
        username="your_username",
        password="your_password",
    )
    # 2. Build the adapter on top of the proxy config.
    collector = BilibiliAdapter(config)
    # 3. Collect each sample avid; random sleep reduces rate-limit risk.
    for avid in ["170001", "506895002"]:
        try:
            time.sleep(random.uniform(1, 3))
            result = collector.fetch(avid)
            print(f"采集成功: {json.dumps(result, ensure_ascii=False)}")
        except Exception as e:
            print(f"采集异常: {avid} - {e}")


if __name__ == "__main__":
    main()
四、 高并发避坑指南:指数退避
在生产环境中进行高并发采集时,不可避免会遇到请求超时或被平台暂时频率限制(返回 429 等错误)。 为了保证代码的健壮性,强烈建议加入指数退避(Exponential Backoff)策略:每次重试的等待时间呈指数级增加,并加入随机抖动,避免瞬间爆发的大量重试将代理 IP 秒封。实现代码如下:
import time
from functools import wraps
def exponential_backoff(max_retries: int = 5, base_delay: float = 1.0):
    """Retry decorator with exponential backoff and jitter for proxy requests.

    Each retry waits ``base_delay * 2**attempt`` seconds with ±25% random
    jitter, which spreads out retry bursts so they do not get a proxy IP
    banned instantly. The exception from the final attempt is re-raised
    once all attempts are exhausted.

    Args:
        max_retries: Total number of attempts (including the first call).
        base_delay: Delay before the first retry, in seconds.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_exception = None
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    last_exception = e
                    # Fix: the old version slept (and printed "retrying")
                    # even after the final attempt, adding a pointless
                    # delay of up to base_delay * 2**(max_retries-1) sec.
                    if attempt == max_retries - 1:
                        break
                    delay = base_delay * (2 ** attempt)
                    # Apply ±25% random jitter.
                    jitter = delay * random.uniform(0.75, 1.25)
                    print(f"尝试 {attempt + 1}/{max_retries} 失败,{jitter:.1f}s后重试...")
                    time.sleep(jitter)
            raise last_exception
        return wrapper
    return decorator