爬虫入门之线程进程协程抓取方法(八)

简介: 1 多线程抓取import lxmlfrom lxml import etreeimport requestsimport threadingimport timerlock = threading.

1 多线程抓取


import lxml
from lxml import etree
import requests
import threading
import time

rlock = threading.RLock()  # 递归锁
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    获取区域名和链接
    :param url: 种子
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # 存储地址和链接
    areaDict = {}
    for area in areaList:
        # 区名
        areaName = area.xpath('./text()')[0]
        # url
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        # 西湖 https://hz.lianjia.com/ershoufang/xihu/  将其变成字典
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    获取房子信息
    :param areaname: 地区名
    :param url: 区域的url
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # 概述
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # url
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # 房子信息
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # 位置信息
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # 总价
        # /html/body/div[4]/div[1]/ul/li[1]/div[1]/div[6]/div[1]/span
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # 平方价
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        # print(title, houseurl, houseInfo, positionInfo, totalPrice, unitPrice)

        with rlock:
            print(areaName)
            with open(areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
                f.flush()

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    time.clock()
    print(areaDict)
    # 多线程
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=gethouseInfo, args=(areaName, url))
        # 开启
        threadList.append(t)
        t.start()

    # 保证线程都结束
    for i in threadList:
        i.join()
    print(time.clock())

2 多协程抓取

import gevent
from gevent import monkey
gevent.monkey.patch_all()   #有些需要刚开始进行初始化
import lxml
from lxml import etree
import requests
import threading
import time

rlock = threading.RLock()  # 递归锁
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    获取区域名和链接
    :param url: 种子
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # 存储地址和链接
    areaDict = {}
    for area in areaList:
        # 区名
        areaName = area.xpath('./text()')[0]
        # url
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    获取房子信息
    :param areaname: 地区名
    :param url: 区域的url
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # 概述
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # url
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # 房子信息
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # 位置信息
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # 总价
        # /html/body/div[4]/div[1]/ul/li[1]/div[1]/div[6]/div[1]/span
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # 平方价
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        # print(title, houseurl, houseInfo, positionInfo, totalPrice, unitPrice)

        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
            f.flush()

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    time.clock()
    print(areaDict)
    # 多协程
    # gevent.monkey.patch_all()  # 非阻塞io  如果此处不行则需要在最上方导入
    geventList = []
    for k, v in areaDict.items():
        g = gevent.spawn(gethouseInfo, k, v)
        geventList.append(g)
    gevent.joinall(geventList)
    print(time.clock())

3 多进程抓取

import lxml
from lxml import etree
import requests

import multiprocessing
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getArea(url):
    '''
    获取区域名和链接
    :param url: 种子
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # 存储地址和链接
    areaDict = {}
    for area in areaList:
        # 区名
        areaName = area.xpath('./text()')[0]
        # url
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    获取房子信息
    :param areaname: 地区名
    :param url: 区域的url
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # 概述
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # url
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # 房子信息
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # 位置信息
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # 总价
        # /html/body/div[4]/div[1]/ul/li[1]/div[1]/div[6]/div[1]/span
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # 平方价
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
            f.flush()

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    time.clock()
    print(areaDict)
    # 多进程
    processList = []
    for areaName, url in areaDict.items():
        t = multiprocessing.Process(target=gethouseInfo, args=(areaName, url)) #开启多进程
        # 开启
        processList.append(t)
        t.start()

    # 保证线程都结束
    for i in processList:
        i.join()
    print(time.clock())

4 多线程加协程

import gevent
from gevent import monkey
gevent.monkey.patch_all()
import json

import lxml
from lxml import etree
import requests
import threading
import time

rlock = threading.RLock()  # 递归锁
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

  # 非阻塞IO
def getArea(url):
    '''
    获取区域名和链接
    :param url: 种子
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # 存储地址和链接
    areaDict = {}
    for area in areaList:
        # 区名
        areaName = area.xpath('./text()')[0]
        # url
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    获取房子信息
    :param areaname: 地区名
    :param url: 区域的url
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # 概述
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # url
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # 房子信息
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]

        # 位置信息
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]

        # 总价
        # /html/body/div[4]/div[1]/ul/li[1]/div[1]/div[6]/div[1]/span
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # 平方价
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        # print(title, houseurl, houseInfo, positionInfo, totalPrice, unitPrice)

        with rlock:
            print(areaName)
            with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
                f.flush()

def getPageNum(areaName, url):
    '''
    获取当前页面
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    pageNum = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    pageNum = json.loads(pageNum)  # json数据
    pageNum = pageNum['totalPage']

    geventList = []
    for i in range(1, int(pageNum) + 1):
        newurl = url + "pg%d/" % i
        g = gevent.spawn(gethouseInfo, areaName, newurl)
        geventList.append(g)
    gevent.joinall(geventList)

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    time.clock()
    print(areaDict)
    # 多线程
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=getPageNum, args=(areaName, url))
        # 开启
        threadList.append(t)
        t.start()

    # 保证线程都结束
    for i in threadList:
        i.join()

    print(time.clock())

5 多进程加协程


import gevent
from gevent import monkey
gevent.monkey.patch_all()
import json

import lxml
from lxml import etree
import requests
import multiprocessing
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

  # 非阻塞IO
def getArea(url):
    '''
    获取区域名和链接
    :param url: 种子
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # 存储地址和链接
    areaDict = {}
    for area in areaList:
        # 区名
        areaName = area.xpath('./text()')[0]
        # url
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict

def gethouseInfo(areaName, url):
    '''
    获取房子信息
    :param areaname: 地区名
    :param url: 区域的url
    :return:
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)

    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    for house in sellList:
        # 概述
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # url
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # 房子信息
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]
        # 位置信息
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]
        # 总价
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # 平方价
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        print(areaName)
        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
            f.flush()


def getPageNum(areaName, url):

    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    pageNum = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    pageNum = json.loads(pageNum)  # json数据
    pageNum = pageNum['totalPage']

    geventList = []
    for i in range(1, int(pageNum) + 1):
        newurl = url + "pg%d/" % i
        g = gevent.spawn(gethouseInfo, areaName, newurl)
        geventList.append(g)
    gevent.joinall(geventList)

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    time.clock()
    print(areaDict)
    # 多线程
    processList = []
    for areaName, url in areaDict.items():
        # 开启多进程
        p = multiprocessing.Process(target=getPageNum,args=(areaName, url))
        processList.append(p)
        p.start()

    # 保证进程都结束
    for i in processList:
        i.join()
    print(time.clock())
相关文章
|
3月前
|
数据采集 存储 前端开发
动态渲染爬虫:Selenium抓取京东关键字搜索结果
动态渲染爬虫:Selenium抓取京东关键字搜索结果
|
3月前
|
数据采集 存储 前端开发
Java爬虫性能优化:多线程抓取JSP动态数据实践
Java爬虫性能优化:多线程抓取JSP动态数据实践
|
2月前
|
存储 Oracle Java
|
3月前
|
数据采集 存储 JSON
地区电影市场分析:用Python爬虫抓取猫眼/灯塔专业版各地区票房
地区电影市场分析:用Python爬虫抓取猫眼/灯塔专业版各地区票房
|
6月前
|
数据采集 Web App开发 JavaScript
基于Selenium的Python爬虫抓取动态App图片
基于Selenium的Python爬虫抓取动态App图片
483 68
|
3月前
|
数据采集 存储 XML
Python爬虫XPath实战:电商商品ID的精准抓取策略
Python爬虫XPath实战:电商商品ID的精准抓取策略
|
4月前
|
Java
创建线程的方法
Java中实现多线程有四种方式:1. 继承Thread类,简单但占用继承机会,耦合度高;2. 实现Runnable接口,推荐方式,任务与线程解耦,支持Lambda;3. 实现Callable接口配合FutureTask,可获取返回值和异常;4. 使用线程池(ExecutorService),企业推荐,管理线程生命周期,提升性能,支持多种线程池类型。
132 1
|
6月前
|
数据采集
Haskell编程中,利用HTTP爬虫实现IP抓取
以上就是利用Haskell编写IP抓取爬虫的详细步骤。希望这篇文章的演示对于理解在Haskell这种函数式编程语言中如何实现网络爬虫有所帮助,而其中的网络访问、标签解析和列表处理等技术在许多其他的问题中都有广泛的应用。
157 26
|
5月前
|
Java 数据挖掘 调度
Java 多线程创建零基础入门新手指南:从零开始全面学习多线程创建方法
本文从零基础角度出发,深入浅出地讲解Java多线程的创建方式。内容涵盖继承`Thread`类、实现`Runnable`接口、使用`Callable`和`Future`接口以及线程池的创建与管理等核心知识点。通过代码示例与应用场景分析,帮助读者理解每种方式的特点及适用场景,理论结合实践,轻松掌握Java多线程编程 essentials。
378 5
|
6月前
|
数据采集 存储 前端开发
Python爬虫自动化:批量抓取网页中的A链接
Python爬虫自动化:批量抓取网页中的A链接