我经常需要用playwright自动化浏览器,去收集一些网站数据,怎么能保证稳定,避免被检测到?
版权声明:本文内容由阿里云实名注册用户自发贡献,版权归原作者所有,阿里云开发者社区不拥有其著作权,亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容,填写侵权投诉表单进行举报,一经查实,本社区将立刻删除涉嫌侵权内容。
这些措施能显著提高采集稳定性,但要注意遵守网站的 robots.txt 和服务条款。
import asyncio
import random

from playwright.async_api import async_playwright
async def create_stealth_browser():
    """Launch a Chromium browser hardened against automation detection.

    Starts Playwright, launches a headed Chromium with automation-control
    flags disabled and a randomized window size, then creates a context
    whose fingerprint (UA, locale, timezone, headers) mimics a regular
    Chinese-locale desktop user and injects a stealth init script.

    Returns:
        (browser, context) tuple for the caller to open pages on.

    NOTE(review): the started ``playwright`` driver object is not returned,
    so callers can never call ``playwright.stop()`` — a driver-process leak
    once the browser is closed. Confirm whether callers care before changing
    the return shape.
    """
    playwright = await async_playwright().start()

    # Randomize the window dimensions once so the real window and the
    # reported viewport agree (the original used a fixed 1920x1080 viewport
    # inside a randomized, usually smaller, window — an inconsistency a
    # fingerprinting script can spot).
    width = random.randint(1200, 1920)
    height = random.randint(800, 1080)

    browser = await playwright.chromium.launch(
        headless=False,  # headed mode is harder to flag than headless
        args=[
            '--disable-blink-features=AutomationControlled',
            '--disable-features=VizDisplayCompositor',
            '--no-first-run',
            '--no-default-browser-check',
            '--disable-default-apps',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-zygote',
            f'--window-size={width},{height}',
        ],
    )

    # Context-level disguise: realistic UA, zh-CN locale/timezone, and the
    # Accept headers a real Chrome sends.
    context = await browser.new_context(
        viewport={'width': width, 'height': height},
        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        permissions=[],
        geolocation=None,
        locale='zh-CN',
        timezone_id='Asia/Shanghai',
        color_scheme='light',
        extra_http_headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
        },
    )

    # Mask the classic automation tells before any page script runs:
    # navigator.webdriver, an empty plugins list, and a missing window.chrome.
    await context.add_init_script("""
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined,
});
delete navigator.__proto__.webdriver;
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
window.chrome = {
runtime: {},
};
""")
    return browser, context
import random
import time
from playwright.async_api import Page
class HumanBehavior:
    """Static helpers that make Playwright interactions look human-driven:
    randomized pauses, jittered mouse paths, incremental scrolling, and
    per-character typing."""

    @staticmethod
    async def random_delay(page: Page, min_delay: int = 1, max_delay: int = 3) -> None:
        """Pause between ``min_delay`` and ``max_delay`` seconds (inclusive)."""
        await page.wait_for_timeout(random.randint(min_delay * 1000, max_delay * 1000))

    @staticmethod
    async def human_click(page: Page, selector: str) -> None:
        """Sweep the cursor toward the element in jittered steps, then click.

        Waits for ``selector``, aims at a random point in the central 40% of
        the element, and moves the mouse there in 3-8 segments with small
        random offsets and pauses before clicking.
        """
        element = await page.wait_for_selector(selector)
        box = await element.bounding_box()
        if box:
            # Random target inside the central region of the element.
            target_x = box['x'] + box['width'] * random.uniform(0.3, 0.7)
            target_y = box['y'] + box['height'] * random.uniform(0.3, 0.7)
            # Linear sweep from the viewport origin toward the target,
            # broken into jittered segments with short pauses.
            steps = random.randint(3, 8)
            for i in range(steps):
                progress = (i + 1) / steps
                await page.mouse.move(
                    target_x * progress + random.uniform(-5, 5),
                    target_y * progress + random.uniform(-5, 5),
                )
                await page.wait_for_timeout(random.randint(50, 150))
        await element.click()

    @staticmethod
    async def human_scroll(page: Page) -> None:
        """Scroll down the page in 3-8 random-sized chunks with pauses."""
        scroll_steps = random.randint(3, 8)
        for _ in range(scroll_steps):
            scroll_amount = random.randint(200, 800)
            await page.evaluate(f"window.scrollBy(0, {scroll_amount})")
            await page.wait_for_timeout(random.randint(500, 2000))

    @staticmethod
    async def human_typing(page: Page, selector: str, text: str) -> None:
        """Click the field, then type one character at a time with pauses.

        FIX: the original called ``page.type(selector, char, delay=...)`` per
        character, which re-resolved the selector for every keystroke and
        applied two delays per character (the ``delay`` kwarg plus the
        explicit timeout). The click already focuses the element, so typing
        via the keyboard into the focused element is sufficient.
        """
        await page.click(selector)
        for char in text:
            await page.keyboard.type(char)
            # Single randomized inter-keystroke pause, 50-200 ms.
            await page.wait_for_timeout(random.randint(50, 200))
class AntiDetectionManager:
    """Rotates proxies and browser contexts to lower detection risk."""

    # Init script injected into every rotated context to mask the classic
    # automation fingerprints (navigator.webdriver, plugins, window.chrome).
    _STEALTH_SCRIPT = """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined,
});
delete navigator.__proto__.webdriver;
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
window.chrome = {
runtime: {},
};
"""

    def __init__(self, proxy_list=None):
        """proxy_list: optional list of proxy settings, rotated round-robin."""
        self.proxy_list = proxy_list or []
        self.current_proxy_index = 0  # next index handed out by get_next_proxy

    def get_next_proxy(self):
        """Return the next proxy round-robin, or None when none configured."""
        if not self.proxy_list:
            return None
        proxy = self.proxy_list[self.current_proxy_index]
        self.current_proxy_index = (self.current_proxy_index + 1) % len(self.proxy_list)
        return proxy

    async def rotate_session(self, browser, context):
        """Close ``context`` and return a fresh one with a new UA and proxy."""
        await context.close()
        proxy = self.get_next_proxy()
        new_context = await browser.new_context(
            user_agent=self.get_random_user_agent(),
            proxy=proxy,
        )
        # Re-apply the stealth script — init scripts are per-context.
        await new_context.add_init_script(self.get_stealth_script())
        return new_context

    @staticmethod
    def get_stealth_script():
        """Return the fingerprint-masking init script.

        BUG FIX: the original called ``self.get_stealth_script()`` in
        ``rotate_session`` but never defined the method, so every rotation
        raised AttributeError.
        """
        return AntiDetectionManager._STEALTH_SCRIPT

    @staticmethod
    def get_random_user_agent():
        """Return a random desktop-Chrome User-Agent string."""
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        ]
        return random.choice(user_agents)
async def stealthy_data_collection(url: str, selectors: dict):
    """End-to-end low-detection scrape of a single page.

    Opens a stealth browser, visits ``url``, performs human-like delays and
    scrolling, then extracts the text content of every element matching each
    CSS selector in ``selectors``.

    Args:
        url: page to visit.
        selectors: mapping of result key -> CSS selector.

    Returns:
        dict mapping each key to a list of text contents (empty list when
        extraction for that key failed).
    """
    browser, context = await create_stealth_browser()
    page = await context.new_page()
    try:
        await page.goto(url, wait_until='networkidle')

        # Look human before touching the DOM.
        await HumanBehavior.random_delay(page, 2, 5)
        await HumanBehavior.human_scroll(page)

        data = {}
        for key, selector in selectors.items():
            try:
                texts = []
                for node in await page.query_selector_all(selector):
                    texts.append(await node.text_content())
                data[key] = texts
                # Small pause between fields to avoid a burst of queries.
                await HumanBehavior.random_delay(page, 1, 2)
            except Exception as e:
                # Best-effort per field: log and keep collecting the rest.
                print(f"采集 {key} 时出错: {e}")
                data[key] = []
        return data
    finally:
        # Always release browser resources, even on navigation failure.
        await context.close()
        await browser.close()
# 使用示例
async def main():
    """Example: collect job-listing fields from a demo URL and print them."""
    url = "https://example.com/jobs"
    selectors = {
        "job_titles": ".job-title",
        "companies": ".company-name",
        "locations": ".job-location",
        "salaries": ".salary-range"
    }
    data = await stealthy_data_collection(url, selectors)
    print(data)


if __name__ == "__main__":
    # BUG FIX: main() was defined but never invoked, so running the script
    # did nothing. asyncio.run drives the async entry point.
    asyncio.run(main())
一般来说,playwright和其他各种爬虫库一样,如果不做处理,采集数据时肯定会被网站检测并封禁,因为现在各大网站对于自动化采集程序越来越严格,不光限制ip、浏览器,还会识别行为指纹、设置动态加载门槛,所以想要采集网页不被检测,需要做到以下三点:
1、浏览器指纹伪装,修改真实user agent,隐藏自动化特征
2、部署ip池,不定期切换ip,防止请求过频繁被识别
3、模拟真人访问行为,设置随机延迟,模拟鼠标、键盘行为
这几点想要做到不容易,我是觉得可以直接用第三方工具,比如像亮数据的网页抓取浏览器。
这个是亮数据的远程浏览器,用playwright操作,模式都一样,但它内置了动态住宅ip池,能自动随机切换ip访问,而且它有专门的识别和解锁验证码功能,不会被限制。
playwright能直接连接它的api,不需要写额外的代码,直接提交url就能获取到网页数据,并解析为csv、json格式,确实很便捷。