How can I scrape web data with Python Playwright without being detected?

I often need to automate a browser with Playwright to collect data from websites. How can I keep the scraping stable and avoid being detected?

py世界 2025-10-26 12:20:24
2 answers
  • 北京阿里云ACE会长

    Key anti-detection points

    1. Fingerprint diversity: use a different User-Agent, viewport size, and timezone for every session (see the sketch after this list)
    2. Behavior randomization: add random delays and simulate human mouse movement and scrolling
    3. Automation trait removal: strip the webdriver property and Chrome automation flags
    4. IP rotation: use a high-quality proxy pool to avoid IP bans
    5. Session management: rotate browser contexts and cookies regularly

    These measures significantly improve scraping stability, but make sure to respect the site's robots.txt and terms of service.
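
    As a quick illustration of point 1, the sketch below assembles a randomized per-session fingerprint that can be passed to browser.new_context(). The value pools (USER_AGENTS, VIEWPORTS, LOCALES_TZ) are made-up examples, not a vetted list; in practice the combinations should match profiles that actually occur together in real browsers.

    import random

    # Illustrative value pools; mismatched combinations can themselves look suspicious
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    ]
    VIEWPORTS = [
        {'width': 1920, 'height': 1080},
        {'width': 1536, 'height': 864},
        {'width': 1366, 'height': 768},
    ]
    LOCALES_TZ = [('zh-CN', 'Asia/Shanghai'), ('en-US', 'America/New_York')]

    def random_fingerprint() -> dict:
        """Build keyword arguments for browser.new_context() with a randomized profile."""
        locale, tz = random.choice(LOCALES_TZ)
        return {
            'user_agent': random.choice(USER_AGENTS),
            'viewport': random.choice(VIEWPORTS),
            'locale': locale,
            'timezone_id': tz,
        }

    # Usage: context = await browser.new_context(**random_fingerprint())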

    1. Browser fingerprint spoofing

    import asyncio
    import random
    from playwright.async_api import async_playwright
    
    async def create_stealth_browser():
        playwright = await async_playwright().start()
    
        # Launch Chromium with a randomized window size and automation flags disabled
        browser = await playwright.chromium.launch(
            headless=False,  # headed mode is recommended; headless is easier to fingerprint
            args=[
                '--disable-blink-features=AutomationControlled',
                '--disable-features=VizDisplayCompositor',
                '--no-first-run',
                '--no-default-browser-check',
                '--disable-default-apps',
                '--disable-dev-shm-usage',
                '--disable-accelerated-2d-canvas',
                '--no-zygote',
                f'--window-size={random.randint(1200, 1920)},{random.randint(800, 1080)}'
            ]
        )
    
        # Create a context with a consistent "real user" profile
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            permissions=[],
            geolocation=None,
            locale='zh-CN',
            timezone_id='Asia/Shanghai',
            color_scheme='light',
            extra_http_headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
            }
        )
    
        # Remove common automation fingerprints before any page script runs
        await context.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined,
            });
            delete navigator.__proto__.webdriver;
    
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5],
            });
    
            window.chrome = {
                runtime: {},
            };
        """)
    
        return browser, context
    
    

    2. Human behavior simulation

    import random
    from playwright.async_api import Page
    
    class HumanBehavior:
        @staticmethod
        async def random_delay(page: Page, min_delay=1, max_delay=3):
            """Wait a random amount of time between min_delay and max_delay seconds."""
            await page.wait_for_timeout(random.randint(min_delay * 1000, max_delay * 1000))
    
        @staticmethod
        async def human_click(page: Page, selector: str):
            """Simulate a human-like click: move the mouse toward the element in steps, then click."""
            element = await page.wait_for_selector(selector)
    
            # Pick a random point inside the element rather than its exact center
            box = await element.bounding_box()
            if box:
                x = box['x'] + box['width'] * random.uniform(0.3, 0.7)
                y = box['y'] + box['height'] * random.uniform(0.3, 0.7)
    
                # Move the mouse toward the target in several jittered steps
                steps = random.randint(3, 8)
                for i in range(steps):
                    progress = (i + 1) / steps
                    current_x = x * progress + random.uniform(-5, 5)
                    current_y = y * progress + random.uniform(-5, 5)
                    await page.mouse.move(current_x, current_y)
                    await page.wait_for_timeout(random.randint(50, 150))
    
            await element.click()
    
        @staticmethod
        async def human_scroll(page: Page):
            """Simulate human-like scrolling in several bursts of varying size."""
            scroll_steps = random.randint(3, 8)
            for _ in range(scroll_steps):
                scroll_amount = random.randint(200, 800)
                await page.evaluate(f"window.scrollBy(0, {scroll_amount})")
                await page.wait_for_timeout(random.randint(500, 2000))
    
        @staticmethod
        async def human_typing(page: Page, selector: str, text: str):
            """Type text character by character with random pauses."""
            await page.click(selector)
            for char in text:
                await page.type(selector, char)
                await page.wait_for_timeout(random.randint(50, 150))
    
    

    3. Proxy and session management

    class AntiDetectionManager:
        def __init__(self, proxy_list=None):
            # proxy_list entries use Playwright's proxy format, e.g. {"server": "http://host:port"}
            self.proxy_list = proxy_list or []
            self.current_proxy_index = 0
    
        def get_next_proxy(self):
            """Rotate to the next proxy in round-robin order."""
            if not self.proxy_list:
                return None
    
            proxy = self.proxy_list[self.current_proxy_index]
            self.current_proxy_index = (self.current_proxy_index + 1) % len(self.proxy_list)
            return proxy
    
        async def rotate_session(self, browser, context):
            """Close the current context and open a fresh one with a new identity."""
            await context.close()
            proxy = self.get_next_proxy()
    
            new_context = await browser.new_context(
                user_agent=self.get_random_user_agent(),
                proxy=proxy
            )
    
            # Re-apply the stealth init script to the new context
            await new_context.add_init_script(self.get_stealth_script())
            return new_context
    
        @staticmethod
        def get_stealth_script():
            """Return the same anti-automation init script used in create_stealth_browser()."""
            return """
                Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
                delete navigator.__proto__.webdriver;
                Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
                window.chrome = { runtime: {} };
            """
    
        @staticmethod
        def get_random_user_agent():
            """Pick a random User-Agent string."""
            user_agents = [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
            ]
            return random.choice(user_agents)
    
    

    4. Complete scraping example

    async def stealthy_data_collection(url: str, selectors: dict):
        """End-to-end anti-detection scraping flow."""
    
        # Create the stealth browser and open a page
        browser, context = await create_stealth_browser()
        page = await context.new_page()
    
        try:
            # Navigate and wait for the network to go idle
            await page.goto(url, wait_until='networkidle')
    
            # Simulate human behavior before touching the DOM
            await HumanBehavior.random_delay(page, 2, 5)
            await HumanBehavior.human_scroll(page)
    
            # Collect the data
            data = {}
            for key, selector in selectors.items():
                try:
                    elements = await page.query_selector_all(selector)
                    data[key] = [await element.text_content() for element in elements]
                    await HumanBehavior.random_delay(page, 1, 2)
                except Exception as e:
                    print(f"Error while collecting {key}: {e}")
                    data[key] = []
    
            return data
    
        finally:
            await context.close()
            await browser.close()
    
    # Usage example
    async def main():
        url = "https://example.com/jobs"
        selectors = {
            "job_titles": ".job-title",
            "companies": ".company-name",
            "locations": ".job-location",
            "salaries": ".salary-range"
        }
    
        data = await stealthy_data_collection(url, selectors)
        print(data)
    
    if __name__ == "__main__":
        asyncio.run(main())
    
    
    2025-10-27 16:49:18 · 27 upvotes
  • Generally speaking, Playwright, like any other scraping library, will be detected and blocked if you use it without extra measures, because major sites keep tightening their defenses against automated collection: they not only restrict IPs and browsers, but also profile behavioral fingerprints and gate content behind dynamic loading. To scrape pages without being detected, you need to cover three things:
    1. Browser fingerprint spoofing: use a realistic User-Agent and hide automation traits.
    2. Deploy an IP pool and rotate IPs periodically so frequent requests are not flagged.
    3. Simulate real user behavior: random delays plus simulated mouse and keyboard actions.
    Getting all of this right is not easy, so I prefer to use a third-party tool such as Bright Data's (亮数据) Scraping Browser.
    It is a remote browser that you drive with Playwright in exactly the same way, but it has a built-in pool of rotating residential IPs and dedicated CAPTCHA detection and solving, so it does not get blocked.
    Playwright can connect to it directly through its API with no extra code: you submit a URL, get the page data back, and parse it into CSV or JSON, which is very convenient (see the connection sketch below).
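
    As a rough sketch of what such a connection looks like: vendor-hosted remote browsers are typically reached over a CDP websocket endpoint, which Playwright supports via connect_over_cdp(). The endpoint URL and credentials below (BROWSER_WS) are placeholders, not real values; the exact connection string comes from the provider's documentation.

    import asyncio
    from playwright.async_api import async_playwright

    # Placeholder CDP websocket endpoint supplied by the remote-browser provider
    BROWSER_WS = "wss://USER:PASS@remote-browser.example.com:9222"

    async def fetch_with_remote_browser(url: str) -> str:
        async with async_playwright() as p:
            # Attach to the hosted browser instead of launching a local one
            browser = await p.chromium.connect_over_cdp(BROWSER_WS)
            page = await browser.new_page()
            await page.goto(url, wait_until="domcontentloaded")
            html = await page.content()
            await browser.close()
            return html

    if __name__ == "__main__":
        print(asyncio.run(fetch_with_remote_browser("https://example.com"))[:500])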

    2025-10-26 15:54:12 · 24 upvotes

