Web Scraping Basics (8): Scraping with Threads, Processes, and Coroutines

Summary: This post scrapes Lianjia's Hangzhou second-hand housing listings (https://hz.lianjia.com/ershoufang/) in five ways: with multiple threads, with gevent coroutines, with multiple processes, with threads plus coroutines, and with processes plus coroutines.

1 Multi-threaded scraping


import lxml
from lxml import etree
import requests
import threading
import time

rlock = threading.RLock()  # reentrant lock shared by the worker threads
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    Fetch the district names and their listing URLs.
    :param url: seed URL
    :return: dict mapping district name -> district URL
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # store district name and URL
    areaDict = {}
    for area in areaList:
        # district name
        areaName = area.xpath('./text()')[0]
        # district URL
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        # e.g. 西湖 https://hz.lianjia.com/ershoufang/xihu/  -> store in the dict
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    Scrape every listing on one district page and append it to a text file.
    :param areaName: district name
    :param url: district listing URL
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # listing title
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # listing URL
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # house details (community name + layout/size text)
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # position info (floor/year + neighborhood)
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # total price (unit: 万 = 10,000 CNY)
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # price per square meter
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        # print(title, houseurl, houseInfo, positionInfo, totalPrice, unitPrice)

        # the lock keeps file writes from interleaving across threads
        with rlock:
            print(areaName)
            with open(areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
                f.flush()

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    startTime = time.time()  # time.clock() was removed in Python 3.8
    print(areaDict)
    # multi-threading: one thread per district
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=gethouseInfo, args=(areaName, url))
        # start the thread
        threadList.append(t)
        t.start()

    # wait until every thread has finished
    for i in threadList:
        i.join()
    print(time.time() - startTime)
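One thread per district is fine for a dozen districts, but the thread count grows with the URL list. As a minimal sketch (not part of the original script, and assuming getArea and gethouseInfo above are already defined), the same fan-out can be written with a bounded pool from the standard library:

# Sketch: the same multi-threaded crawl with a fixed-size thread pool.
from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_with_thread_pool(seed="https://hz.lianjia.com/ershoufang/", workers=5):
    areaDict = getArea(seed)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(gethouseInfo, name, url) for name, url in areaDict.items()]
        for fut in as_completed(futures):
            fut.result()  # re-raise any exception that happened inside a worker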

2 Coroutine-based scraping (gevent)

import gevent
from gevent import monkey
gevent.monkey.patch_all()   # patch as early as possible, before the other imports
import lxml
from lxml import etree
import requests
import threading
import time

rlock = threading.RLock()  # reentrant lock
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    Fetch the district names and their listing URLs.
    :param url: seed URL
    :return: dict mapping district name -> district URL
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # store district name and URL
    areaDict = {}
    for area in areaList:
        # district name
        areaName = area.xpath('./text()')[0]
        # district URL
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    Scrape every listing on one district page and append it to a text file.
    :param areaName: district name
    :param url: district listing URL
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # listing title
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # listing URL
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # house details (community name + layout/size text)
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # position info (floor/year + neighborhood)
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # total price (unit: 万 = 10,000 CNY)
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # price per square meter
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        # print(title, houseurl, houseInfo, positionInfo, totalPrice, unitPrice)

        # note: the ./hz directory must exist before writing
        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
            f.flush()

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    startTime = time.time()  # time.clock() was removed in Python 3.8
    print(areaDict)
    # coroutines: one greenlet per district
    # gevent.monkey.patch_all()  # non-blocking IO; if patching here does not work, do it at the very top as above
    geventList = []
    for k, v in areaDict.items():
        g = gevent.spawn(gethouseInfo, k, v)
        geventList.append(g)
    gevent.joinall(geventList)
    print(time.time() - startTime)
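gevent.spawn creates one greenlet per district with no upper bound. As a hedged sketch (not in the original post), gevent's own pool can cap the number of concurrent requests while reusing getArea and gethouseInfo defined above:

# Sketch: cap concurrent greenlets with gevent.pool.Pool.
from gevent.pool import Pool

def crawl_with_gevent_pool(seed="https://hz.lianjia.com/ershoufang/", size=10):
    pool = Pool(size)  # at most `size` greenlets run at once
    areaDict = getArea(seed)
    jobs = [pool.spawn(gethouseInfo, name, url) for name, url in areaDict.items()]
    gevent.joinall(jobs)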

3 Multi-process scraping

import lxml
from lxml import etree
import requests

import multiprocessing
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    Fetch the district names and their listing URLs.
    :param url: seed URL
    :return: dict mapping district name -> district URL
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # store district name and URL
    areaDict = {}
    for area in areaList:
        # district name
        areaName = area.xpath('./text()')[0]
        # district URL
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    Scrape every listing on one district page and append it to a text file.
    :param areaName: district name
    :param url: district listing URL
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # listing title
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # listing URL
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # house details (community name + layout/size text)
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # position info (floor/year + neighborhood)
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # total price (unit: 万 = 10,000 CNY)
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # price per square meter
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

        # note: the ./hz directory must exist before writing
        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
            f.flush()

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    startTime = time.time()  # time.clock() was removed in Python 3.8
    print(areaDict)
    # multi-processing: one process per district
    processList = []
    for areaName, url in areaDict.items():
        t = multiprocessing.Process(target=gethouseInfo, args=(areaName, url))  # create a process
        # start it
        processList.append(t)
        t.start()

    # wait until every process has finished
    for i in processList:
        i.join()
    print(time.time() - startTime)
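Creating one Process per district works, but a fixed-size pool spreads the districts over a known number of workers. A minimal sketch (an assumption, not from the original post), reusing getArea and gethouseInfo defined above:

# Sketch: the same fan-out with a fixed-size process pool.
from multiprocessing import Pool

def crawl_with_process_pool(seed="https://hz.lianjia.com/ershoufang/", workers=4):
    areaDict = getArea(seed)
    with Pool(processes=workers) as pool:
        # starmap unpacks each (areaName, url) pair into gethouseInfo's arguments
        pool.starmap(gethouseInfo, areaDict.items())

# On Windows (spawn start method) call this under the if __name__ == '__main__':
# guard, just like the manual Process version above.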

4 Threads combined with coroutines

import gevent
from gevent import monkey
gevent.monkey.patch_all()  # patch as early as possible, before the other imports
import json

import lxml
from lxml import etree
import requests
import threading
import time

rlock = threading.RLock()  # reentrant lock shared by the worker threads
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    Fetch the district names and their listing URLs.
    :param url: seed URL
    :return: dict mapping district name -> district URL
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # store district name and URL
    areaDict = {}
    for area in areaList:
        # district name
        areaName = area.xpath('./text()')[0]
        # district URL
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    Scrape every listing on one page and append it to a text file.
    :param areaName: district name
    :param url: listing page URL
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # listing title
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # listing URL
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # house details (community name + layout/size text)
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # position info (floor/year + neighborhood)
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # total price (unit: 万 = 10,000 CNY)
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # price per square meter
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        # print(title, houseurl, houseInfo, positionInfo, totalPrice, unitPrice)

        # the lock keeps file writes from interleaving across threads
        with rlock:
            print(areaName)
            with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
                f.flush()

def getPageNum(areaName, url):
    '''
    Read the total page count for a district, then spawn one greenlet per page.
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    pageNum = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    pageNum = json.loads(pageNum)  # the attribute value is JSON
    pageNum = pageNum['totalPage']

    geventList = []
    for i in range(1, int(pageNum) + 1):
        newurl = url + "pg%d/" % i
        g = gevent.spawn(gethouseInfo, areaName, newurl)
        geventList.append(g)
    gevent.joinall(geventList)

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    startTime = time.time()  # time.clock() was removed in Python 3.8
    print(areaDict)
    # one thread per district; each thread spawns greenlets per page
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=getPageNum, args=(areaName, url))
        # start the thread
        threadList.append(t)
        t.start()

    # wait until every thread has finished
    for i in threadList:
        i.join()

    print(time.time() - startTime)
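getPageNum relies on Lianjia embedding the pagination state as JSON in the page-data attribute of the pager div. A small standalone illustration (the attribute value below is a hypothetical example, not scraped data) of how totalPage turns into per-page URLs:

# Illustration: turning a page-data attribute value into page URLs.
import json

page_data = '{"totalPage":100,"curPage":1}'  # hypothetical attribute value
total_page = json.loads(page_data)['totalPage']
page_urls = ["https://hz.lianjia.com/ershoufang/xihu/" + "pg%d/" % i
             for i in range(1, int(total_page) + 1)]
print(total_page, page_urls[:3])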

5 Processes combined with coroutines


import gevent
from gevent import monkey
gevent.monkey.patch_all()  # patch as early as possible, before the other imports
import json

import lxml
from lxml import etree
import requests
import multiprocessing
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    Fetch the district names and their listing URLs.
    :param url: seed URL
    :return: dict mapping district name -> district URL
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # store district name and URL
    areaDict = {}
    for area in areaList:
        # district name
        areaName = area.xpath('./text()')[0]
        # district URL
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    Scrape every listing on one page and append it to a text file.
    :param areaName: district name
    :param url: listing page URL
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # listing title
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # listing URL
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # house details (community name + layout/size text)
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]
        # position info (floor/year + neighborhood)
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]
        # total price (unit: 万 = 10,000 CNY)
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # price per square meter
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        print(areaName)
        # note: the ./hz directory must exist before writing
        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
            f.flush()


def getPageNum(areaName, url):
    '''
    Read the total page count for a district, then spawn one greenlet per page.
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    pageNum = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    pageNum = json.loads(pageNum)  # the attribute value is JSON
    pageNum = pageNum['totalPage']

    geventList = []
    for i in range(1, int(pageNum) + 1):
        newurl = url + "pg%d/" % i
        g = gevent.spawn(gethouseInfo, areaName, newurl)
        geventList.append(g)
    gevent.joinall(geventList)

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    startTime = time.time()  # time.clock() was removed in Python 3.8
    print(areaDict)
    # one process per district; each process spawns greenlets per page
    processList = []
    for areaName, url in areaDict.items():
        # create a process
        p = multiprocessing.Process(target=getPageNum, args=(areaName, url))
        processList.append(p)
        p.start()

    # wait until every process has finished
    for i in processList:
        i.join()
    print(time.time() - startTime)
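The original scripts timed each variant with time.clock(), which was removed in Python 3.8. A small helper like the sketch below (an assumption, not part of the post) gives a consistent wall-clock measurement for comparing the five approaches:

# Sketch: a reusable timer based on time.perf_counter().
import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    start = time.perf_counter()
    try:
        yield
    finally:
        print("%s took %.2f s" % (label, time.perf_counter() - start))

# Hypothetical usage around any of the variants above:
# with timed("processes + coroutines"):
#     ...start the processes and join them...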