爬虫入门之线程进程协程抓取方法(八)

简介: 本文以链家杭州二手房数据为例，依次演示多线程、多协程、多进程，以及线程+协程、进程+协程组合方式的抓取方法。

1 多线程抓取


import lxml
from lxml import etree
import requests
import threading
import time

rlock = threading.RLock()  # 递归锁
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    获取区域名和链接
    :param url: 种子
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # 存储地址和链接
    areaDict = {}
    for area in areaList:
        # 区名
        areaName = area.xpath('./text()')[0]
        # url
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        # 西湖 https://hz.lianjia.com/ershoufang/xihu/  将其变成字典
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    获取房子信息
    :param areaname: 地区名
    :param url: 区域的url
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # 概述
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # url
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # 房子信息
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # 位置信息
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # 总价
        # /html/body/div[4]/div[1]/ul/li[1]/div[1]/div[6]/div[1]/span
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # 平方价
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        # print(title, houseurl, houseInfo, positionInfo, totalPrice, unitPrice)

        with rlock:
            print(areaName)
            with open(areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
                f.flush()

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    time.clock()
    print(areaDict)
    # 多线程
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=gethouseInfo, args=(areaName, url))
        # 开启
        threadList.append(t)
        t.start()

    # 保证线程都结束
    for i in threadList:
        i.join()
    print(time.clock())
AI 代码解读

2 多协程抓取

import gevent
from gevent import monkey
gevent.monkey.patch_all()   #有些需要刚开始进行初始化
import lxml
from lxml import etree
import requests
import threading
import time

rlock = threading.RLock()  # 递归锁
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    获取区域名和链接
    :param url: 种子
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # 存储地址和链接
    areaDict = {}
    for area in areaList:
        # 区名
        areaName = area.xpath('./text()')[0]
        # url
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    获取房子信息
    :param areaname: 地区名
    :param url: 区域的url
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # 概述
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # url
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # 房子信息
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # 位置信息
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # 总价
        # /html/body/div[4]/div[1]/ul/li[1]/div[1]/div[6]/div[1]/span
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # 平方价
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        # print(title, houseurl, houseInfo, positionInfo, totalPrice, unitPrice)

        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
            f.flush()

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    time.clock()
    print(areaDict)
    # 多协程
    # gevent.monkey.patch_all()  # 非阻塞io  如果此处不行则需要在最上方导入
    geventList = []
    for k, v in areaDict.items():
        g = gevent.spawn(gethouseInfo, k, v)
        geventList.append(g)
    gevent.joinall(geventList)
    print(time.clock())
AI 代码解读

3 多进程抓取

import lxml
from lxml import etree
import requests

import multiprocessing
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    获取区域名和链接
    :param url: 种子
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # 存储地址和链接
    areaDict = {}
    for area in areaList:
        # 区名
        areaName = area.xpath('./text()')[0]
        # url
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    获取房子信息
    :param areaname: 地区名
    :param url: 区域的url
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # 概述
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # url
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # 房子信息
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # 位置信息
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # 总价
        # /html/body/div[4]/div[1]/ul/li[1]/div[1]/div[6]/div[1]/span
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # 平方价
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
            f.flush()

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    time.clock()
    print(areaDict)
    # 多进程
    processList = []
    for areaName, url in areaDict.items():
        t = multiprocessing.Process(target=gethouseInfo, args=(areaName, url)) #开启多进程
        # 开启
        processList.append(t)
        t.start()

    # 保证线程都结束
    for i in processList:
        i.join()
    print(time.clock())
AI 代码解读

4 多线程加协程

import gevent
from gevent import monkey
gevent.monkey.patch_all()
import json

import lxml
from lxml import etree
import requests
import threading
import time

rlock = threading.RLock()  # 递归锁
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

  # 非阻塞IO
def getArea(url):
    '''
    获取区域名和链接
    :param url: 种子
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # 存储地址和链接
    areaDict = {}
    for area in areaList:
        # 区名
        areaName = area.xpath('./text()')[0]
        # url
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    获取房子信息
    :param areaname: 地区名
    :param url: 区域的url
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # 概述
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # url
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # 房子信息
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # 位置信息
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # 总价
        # /html/body/div[4]/div[1]/ul/li[1]/div[1]/div[6]/div[1]/span
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # 平方价
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        # print(title, houseurl, houseInfo, positionInfo, totalPrice, unitPrice)

        with rlock:
            print(areaName)
            with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
                f.flush()

def getPageNum(areaName, url):
    '''
    获取当前页面
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    pageNum = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    pageNum = json.loads(pageNum)  # json数据
    pageNum = pageNum['totalPage']

    geventList = []
    for i in range(1, int(pageNum) + 1):
        newurl = url + "pg%d/" % i
        g = gevent.spawn(gethouseInfo, areaName, newurl)
        geventList.append(g)
    gevent.joinall(geventList)

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    time.clock()
    print(areaDict)
    # 多线程
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=getPageNum, args=(areaName, url))
        # 开启
        threadList.append(t)
        t.start()

    # 保证线程都结束
    for i in threadList:
        i.join()

    print(time.clock())
AI 代码解读

5 多进程加协程


import gevent
from gevent import monkey
gevent.monkey.patch_all()
import json

import lxml
from lxml import etree
import requests
import multiprocessing
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

  # 非阻塞IO
def getArea(url):
    '''
    获取区域名和链接
    :param url: 种子
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # 存储地址和链接
    areaDict = {}
    for area in areaList:
        # 区名
        areaName = area.xpath('./text()')[0]
        # url
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    获取房子信息
    :param areaname: 地区名
    :param url: 区域的url
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # 概述
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # url
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # 房子信息
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]
        # 位置信息
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]
        # 总价
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # 平方价
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        print(areaName)
        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
            f.flush()


def getPageNum(areaName, url):

    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    pageNum = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    pageNum = json.loads(pageNum)  # json数据
    pageNum = pageNum['totalPage']

    geventList = []
    for i in range(1, int(pageNum) + 1):
        newurl = url + "pg%d/" % i
        g = gevent.spawn(gethouseInfo, areaName, newurl)
        geventList.append(g)
    gevent.joinall(geventList)

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    time.clock()
    print(areaDict)
    # 多线程
    processList = []
    for areaName, url in areaDict.items():
        # 开启多进程
        p = multiprocessing.Process(target=getPageNum,args=(areaName, url))
        processList.append(p)
        p.start()

    # 保证进程都结束
    for i in processList:
        i.join()
    print(time.clock())
AI 代码解读
目录
打赏
0
0
0
0
2
分享
相关文章
|
13天前
|
【Java并发】【线程池】带你从0-1入门线程池
欢迎来到我的技术博客!我是一名热爱编程的开发者,梦想是编写高端CRUD应用。2025年我正在沉淀中,博客更新速度加快,期待与你一起成长。 线程池是一种复用线程资源的机制,通过预先创建一定数量的线程并管理其生命周期,避免频繁创建/销毁线程带来的性能开销。它解决了线程创建成本高、资源耗尽风险、响应速度慢和任务执行缺乏管理等问题。
127 60
【Java并发】【线程池】带你从0-1入门线程池
如何区分进程、线程和协程?看这篇就够了!
本课程主要探讨操作系统中的进程、线程和协程的区别。进程是资源分配的基本单位,具有独立性和隔离性;线程是CPU调度的基本单位,轻量且共享资源,适合并发执行;协程更轻量,由程序自身调度,适合I/O密集型任务。通过学习这些概念,可以更好地理解和应用它们,以实现最优的性能和资源利用。
81 11
构建高效的Python网络爬虫:从入门到实践
本文旨在通过深入浅出的方式,引导读者从零开始构建一个高效的Python网络爬虫。我们将探索爬虫的基本原理、核心组件以及如何利用Python的强大库进行数据抓取和处理。文章不仅提供理论指导,还结合实战案例,让读者能够快速掌握爬虫技术,并应用于实际项目中。无论你是编程新手还是有一定基础的开发者,都能在这篇文章中找到有价值的内容。
Java中的多线程编程:从入门到实践####
本文将深入浅出地探讨Java多线程编程的核心概念、应用场景及实践技巧。不同于传统的摘要形式,本文将以一个简短的代码示例作为开篇,直接展示多线程的魅力,随后再详细解析其背后的原理与实现方式,旨在帮助读者快速理解并掌握Java多线程编程的基本技能。 ```java // 简单的多线程示例:创建两个线程,分别打印不同的消息 public class SimpleMultithreading { public static void main(String[] args) { Thread thread1 = new Thread(() -> System.out.prin
Python爬虫定义入门知识
Python爬虫是用于自动化抓取互联网数据的程序。其基本概念包括爬虫、请求、响应和解析。常用库有Requests、BeautifulSoup、Scrapy和Selenium。工作流程包括发送请求、接收响应、解析数据和存储数据。注意事项包括遵守Robots协议、避免过度请求、处理异常和确保数据合法性。Python爬虫强大而灵活,但使用时需遵守法律法规。
|
4月前
|
Java中的多线程编程入门
【10月更文挑战第29天】在Java的世界中,多线程就像是一场精心编排的交响乐。每个线程都是乐团中的一个乐手,他们各自演奏着自己的部分,却又和谐地共同完成整场演出。本文将带你走进Java多线程的世界,让你从零基础到能够编写基本的多线程程序。
52 1
Java多线程编程的艺术:从入门到精通####
【10月更文挑战第21天】 本文将深入探讨Java多线程编程的核心概念,通过生动实例和实用技巧,引导读者从基础认知迈向高效并发编程的殿堂。我们将一起揭开线程管理的神秘面纱,掌握同步机制的精髓,并学习如何在实际项目中灵活运用这些知识,以提升应用性能与响应速度。 ####
67 3
Python中实现简单爬虫的入门指南
【10月更文挑战第22天】本文将带你进入Python爬虫的世界,从基础概念到实战操作,一步步指导你如何使用Python编写一个简单的网络爬虫。我们将不展示代码示例,而是通过详细的步骤描述和逻辑讲解,帮助你理解爬虫的工作原理和开发过程。无论你是编程新手还是有一定经验的开发者,这篇文章都将为你打开一扇通往数据收集新世界的大门。
Java中的多线程编程:从入门到精通
本文将带你深入了解Java中的多线程编程。我们将从基础概念开始,逐步深入探讨线程的创建、启动、同步和通信等关键知识点。通过阅读本文,你将能够掌握Java多线程编程的基本技能,为进一步学习和应用打下坚实的基础。
|
5月前
|
Linux查找占用的端口,并杀死进程的简单方法
通过上述步骤和命令,您能够迅速识别并根据实际情况管理Linux系统中占用特定端口的进程。为了获得更全面的服务器管理技巧和解决方案,提供了丰富的资源和专业服务,是您提升运维技能的理想选择。
209 1

热门文章

最新文章

相关实验场景

更多
AI助理

你好,我是AI助理

可以解答问题、推荐解决方案等