# 1 多线程抓取 (multi-threaded scraping)
import lxml
from lxml import etree
import requests
import threading
import time
# Reentrant lock; gethouseInfo holds it while printing and appending to a file.
rlock = threading.RLock()

# HTTP headers sent with every request: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getArea(url):
    '''
    Fetch the seed page and collect the area filter links.

    :param url: seed URL whose page contains the ``data-role="ershoufang"`` area list
    :return: dict mapping area name -> absolute URL of that area's listing page
    '''
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from blocking the script forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    areaDict = {}
    for area in areaList:
        names = area.xpath('./text()')
        hrefs = area.xpath('./@href')
        if not names or not hrefs:
            # An anchor without text or href cannot be used; skip it instead of
            # raising IndexError and losing every other area.
            continue
        areaName = names[0]
        # urljoin resolves root-relative, page-relative and absolute hrefs
        # against the seed URL; for the hz.lianjia.com seed and root-relative
        # hrefs this yields the same "https://hz.lianjia.com" + href as before.
        areaurl = urljoin(url, hrefs[0])
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict
def gethouseInfo(areaName, url):
    '''
    Scrape one listing page and append its houses to ``<areaName>.txt``.

    Each ``li[@class="clear"]`` under ``ul[@class='sellListContent']`` becomes
    one output line: the ``str()`` of the tuple
    ``(title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)``.

    :param areaName: area name; also the base name of the output file
    :param url: URL of the listing page to fetch
    :return: None
    '''
    def first(node, path):
        # First XPath match, or '' when there is none, so that one listing with
        # a missing sub-element does not raise IndexError and abort the page.
        found = node.xpath(path)
        return found[0] if found else ''

    # A timeout keeps a stalled connection from blocking this worker forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    rows = []
    for house in sellList:
        title = first(house, './/div[@class="title"]/a/text()')
        if not title:
            # Without a title link the entry is not a usable listing.
            continue
        houseurl = first(house, './/div[@class="title"]/a/@href')
        houseInfo = (first(house, './/div[@class="houseInfo"]/a/text()')
                     + first(house, './/div[@class="houseInfo"]/text()'))
        positionInfo = (first(house, './/div[@class="positionInfo"]/text()')
                        + first(house, './/div[@class="positionInfo"]/a/text()'))
        price = first(house, './/div[@class="totalPrice"]/span/text()')
        totalPrice = price + '万' if price else ''
        unitPrice = first(house, './/div[@class="unitPrice"]/span/text()')
        rows.append(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
    if not rows:
        return
    with rlock:
        # One progress line per house, as before.
        for _ in rows:
            print(areaName)
        # Open the file once per page instead of once per house; closing the
        # file flushes it, so no explicit flush is needed.
        with open(areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.writelines(rows)
if __name__ == '__main__':
    # Script entry point: fetch the area list, then scrape each area's first
    # listing page in its own thread and print the elapsed time.
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the documented
    # replacement for measuring elapsed time.
    startTime = time.perf_counter()
    print(areaDict)
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=gethouseInfo, args=(areaName, url))
        threadList.append(t)
        t.start()
    # Wait for every worker before reporting the elapsed time.
    for t in threadList:
        t.join()
    print(time.perf_counter() - startTime)
# 2 多协程抓取 (multi-coroutine scraping with gevent)
import gevent
from gevent import monkey
gevent.monkey.patch_all()
import lxml
from lxml import etree
import requests
import threading
import time
# Reentrant lock; not referenced by the functions in this section.
rlock = threading.RLock()

# HTTP headers sent with every request: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getArea(url):
    '''
    Fetch the seed page and collect the area filter links.

    :param url: seed URL whose page contains the ``data-role="ershoufang"`` area list
    :return: dict mapping area name -> absolute URL of that area's listing page
    '''
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from blocking the script forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    areaDict = {}
    for area in areaList:
        names = area.xpath('./text()')
        hrefs = area.xpath('./@href')
        if not names or not hrefs:
            # An anchor without text or href cannot be used; skip it instead of
            # raising IndexError and losing every other area.
            continue
        areaName = names[0]
        # urljoin resolves root-relative, page-relative and absolute hrefs
        # against the seed URL; for the hz.lianjia.com seed and root-relative
        # hrefs this yields the same "https://hz.lianjia.com" + href as before.
        areaurl = urljoin(url, hrefs[0])
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict
def gethouseInfo(areaName, url):
    '''
    Scrape one listing page and append its houses to ``./hz/<areaName>.txt``.

    Each ``li[@class="clear"]`` under ``ul[@class='sellListContent']`` becomes
    one output line: the ``str()`` of the tuple
    ``(title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)``.

    :param areaName: area name; also the base name of the output file
    :param url: URL of the listing page to fetch
    :return: None
    '''
    import os

    def first(node, path):
        # First XPath match, or '' when there is none, so that one listing with
        # a missing sub-element does not raise IndexError and abort the page.
        found = node.xpath(path)
        return found[0] if found else ''

    # A timeout keeps a stalled connection from blocking this greenlet forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    rows = []
    for house in sellList:
        title = first(house, './/div[@class="title"]/a/text()')
        if not title:
            # Without a title link the entry is not a usable listing.
            continue
        houseurl = first(house, './/div[@class="title"]/a/@href')
        houseInfo = (first(house, './/div[@class="houseInfo"]/a/text()')
                     + first(house, './/div[@class="houseInfo"]/text()'))
        positionInfo = (first(house, './/div[@class="positionInfo"]/text()')
                        + first(house, './/div[@class="positionInfo"]/a/text()'))
        price = first(house, './/div[@class="totalPrice"]/span/text()')
        totalPrice = price + '万' if price else ''
        unitPrice = first(house, './/div[@class="unitPrice"]/span/text()')
        rows.append(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
    if not rows:
        return
    # open() does not create missing directories; make sure ./hz exists.
    os.makedirs("./hz", exist_ok=True)
    # Open the file once per page instead of once per house; closing the file
    # flushes it, so no explicit flush is needed.
    with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
        f.writelines(rows)
if __name__ == '__main__':
    # Script entry point: fetch the area list, then scrape each area's first
    # listing page in its own greenlet and print the elapsed time.
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the documented
    # replacement for measuring elapsed time.
    startTime = time.perf_counter()
    print(areaDict)
    geventList = [gevent.spawn(gethouseInfo, k, v) for k, v in areaDict.items()]
    # Block until every greenlet has finished.
    gevent.joinall(geventList)
    print(time.perf_counter() - startTime)
# 3 多进程抓取 (multi-process scraping)
import lxml
from lxml import etree
import requests
import multiprocessing
import time
# HTTP headers sent with every request: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getArea(url):
    '''
    Fetch the seed page and collect the area filter links.

    :param url: seed URL whose page contains the ``data-role="ershoufang"`` area list
    :return: dict mapping area name -> absolute URL of that area's listing page
    '''
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from blocking the script forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    areaDict = {}
    for area in areaList:
        names = area.xpath('./text()')
        hrefs = area.xpath('./@href')
        if not names or not hrefs:
            # An anchor without text or href cannot be used; skip it instead of
            # raising IndexError and losing every other area.
            continue
        areaName = names[0]
        # urljoin resolves root-relative, page-relative and absolute hrefs
        # against the seed URL; for the hz.lianjia.com seed and root-relative
        # hrefs this yields the same "https://hz.lianjia.com" + href as before.
        areaurl = urljoin(url, hrefs[0])
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict
def gethouseInfo(areaName, url):
    '''
    Scrape one listing page and append its houses to ``./hz/<areaName>.txt``.

    Each ``li[@class="clear"]`` under ``ul[@class='sellListContent']`` becomes
    one output line: the ``str()`` of the tuple
    ``(title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)``.

    :param areaName: area name; also the base name of the output file
    :param url: URL of the listing page to fetch
    :return: None
    '''
    import os

    def first(node, path):
        # First XPath match, or '' when there is none, so that one listing with
        # a missing sub-element does not raise IndexError and abort the page.
        found = node.xpath(path)
        return found[0] if found else ''

    # A timeout keeps a stalled connection from blocking this process forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    rows = []
    for house in sellList:
        title = first(house, './/div[@class="title"]/a/text()')
        if not title:
            # Without a title link the entry is not a usable listing.
            continue
        houseurl = first(house, './/div[@class="title"]/a/@href')
        houseInfo = (first(house, './/div[@class="houseInfo"]/a/text()')
                     + first(house, './/div[@class="houseInfo"]/text()'))
        positionInfo = (first(house, './/div[@class="positionInfo"]/text()')
                        + first(house, './/div[@class="positionInfo"]/a/text()'))
        price = first(house, './/div[@class="totalPrice"]/span/text()')
        totalPrice = price + '万' if price else ''
        unitPrice = first(house, './/div[@class="unitPrice"]/span/text()')
        rows.append(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
    if not rows:
        return
    # open() does not create missing directories; exist_ok=True also tolerates
    # another worker process creating ./hz at the same moment.
    os.makedirs("./hz", exist_ok=True)
    # Open the file once per page instead of once per house; closing the file
    # flushes it, so no explicit flush is needed.
    with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
        f.writelines(rows)
if __name__ == '__main__':
    # Script entry point: fetch the area list, then scrape each area's first
    # listing page in its own process and print the elapsed time.
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the documented
    # replacement for measuring elapsed time.
    startTime = time.perf_counter()
    print(areaDict)
    processList = []
    for areaName, url in areaDict.items():
        p = multiprocessing.Process(target=gethouseInfo, args=(areaName, url))
        processList.append(p)
        p.start()
    # Wait for every worker process before reporting the elapsed time.
    for p in processList:
        p.join()
    print(time.perf_counter() - startTime)
# 4 多线程加协程 (threads combined with coroutines)
import gevent
from gevent import monkey
gevent.monkey.patch_all()
import json
import lxml
from lxml import etree
import requests
import threading
import time
# Reentrant lock; gethouseInfo holds it while printing and appending to a file.
rlock = threading.RLock()

# HTTP headers sent with every request: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getArea(url):
    '''
    Fetch the seed page and collect the area filter links.

    :param url: seed URL whose page contains the ``data-role="ershoufang"`` area list
    :return: dict mapping area name -> absolute URL of that area's listing page
    '''
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from blocking the script forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    areaDict = {}
    for area in areaList:
        names = area.xpath('./text()')
        hrefs = area.xpath('./@href')
        if not names or not hrefs:
            # An anchor without text or href cannot be used; skip it instead of
            # raising IndexError and losing every other area.
            continue
        areaName = names[0]
        # urljoin resolves root-relative, page-relative and absolute hrefs
        # against the seed URL; for the hz.lianjia.com seed and root-relative
        # hrefs this yields the same "https://hz.lianjia.com" + href as before.
        areaurl = urljoin(url, hrefs[0])
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict
def gethouseInfo(areaName, url):
    '''
    Scrape one listing page and append its houses to ``./hz/<areaName>.txt``.

    Each ``li[@class="clear"]`` under ``ul[@class='sellListContent']`` becomes
    one output line: the ``str()`` of the tuple
    ``(title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)``.

    :param areaName: area name; also the base name of the output file
    :param url: URL of the listing page to fetch
    :return: None
    '''
    import os

    def first(node, path):
        # First XPath match, or '' when there is none, so that one listing with
        # a missing sub-element does not raise IndexError and abort the page.
        found = node.xpath(path)
        return found[0] if found else ''

    # A timeout keeps a stalled connection from blocking this worker forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    rows = []
    for house in sellList:
        title = first(house, './/div[@class="title"]/a/text()')
        if not title:
            # Without a title link the entry is not a usable listing.
            continue
        houseurl = first(house, './/div[@class="title"]/a/@href')
        houseInfo = (first(house, './/div[@class="houseInfo"]/a/text()')
                     + first(house, './/div[@class="houseInfo"]/text()'))
        positionInfo = (first(house, './/div[@class="positionInfo"]/text()')
                        + first(house, './/div[@class="positionInfo"]/a/text()'))
        price = first(house, './/div[@class="totalPrice"]/span/text()')
        totalPrice = price + '万' if price else ''
        unitPrice = first(house, './/div[@class="unitPrice"]/span/text()')
        rows.append(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
    if not rows:
        return
    # open() does not create missing directories; make sure ./hz exists.
    os.makedirs("./hz", exist_ok=True)
    with rlock:
        # One progress line per house, as before.
        for _ in rows:
            print(areaName)
        # Open the file once per page instead of once per house; closing the
        # file flushes it, so no explicit flush is needed.
        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.writelines(rows)
def getPageNum(areaName, url):
    '''
    Read the area's total page count and scrape every page concurrently.

    The count is the ``totalPage`` key of the JSON stored in the pagination
    box's ``page-data`` attribute. One greenlet running ``gethouseInfo`` is
    spawned per page URL (``<url>pg<N>/``), and this function returns once all
    of them have finished.

    :param areaName: area name, passed through to ``gethouseInfo``
    :param url: URL of the area's first listing page
    :return: None
    '''
    # A timeout keeps a stalled connection from blocking this worker forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    pageData = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')
    # If the page has no pagination box, scrape a single page instead of
    # raising IndexError and skipping the whole area.
    totalPage = int(json.loads(pageData[0])['totalPage']) if pageData else 1
    # The page suffix must follow exactly one '/'.
    baseurl = url if url.endswith('/') else url + '/'
    geventList = [
        gevent.spawn(gethouseInfo, areaName, "%spg%d/" % (baseurl, i))
        for i in range(1, totalPage + 1)
    ]
    gevent.joinall(geventList)
if __name__ == '__main__':
    # Script entry point: fetch the area list, then run getPageNum for each
    # area in its own thread and print the elapsed time.
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the documented
    # replacement for measuring elapsed time.
    startTime = time.perf_counter()
    print(areaDict)
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=getPageNum, args=(areaName, url))
        threadList.append(t)
        t.start()
    # Wait for every worker before reporting the elapsed time.
    for t in threadList:
        t.join()
    print(time.perf_counter() - startTime)
# 5 多进程加协程 (processes combined with coroutines)
import gevent
from gevent import monkey
gevent.monkey.patch_all()
import json
import lxml
from lxml import etree
import requests
import multiprocessing
import time
# HTTP headers sent with every request: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getArea(url):
    '''
    Fetch the seed page and collect the area filter links.

    :param url: seed URL whose page contains the ``data-role="ershoufang"`` area list
    :return: dict mapping area name -> absolute URL of that area's listing page
    '''
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from blocking the script forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    areaDict = {}
    for area in areaList:
        names = area.xpath('./text()')
        hrefs = area.xpath('./@href')
        if not names or not hrefs:
            # An anchor without text or href cannot be used; skip it instead of
            # raising IndexError and losing every other area.
            continue
        areaName = names[0]
        # urljoin resolves root-relative, page-relative and absolute hrefs
        # against the seed URL; for the hz.lianjia.com seed and root-relative
        # hrefs this yields the same "https://hz.lianjia.com" + href as before.
        areaurl = urljoin(url, hrefs[0])
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict
def gethouseInfo(areaName, url):
    '''
    Scrape one listing page and append its houses to ``./hz/<areaName>.txt``.

    Each ``li[@class="clear"]`` under ``ul[@class='sellListContent']`` becomes
    one output line: the ``str()`` of the tuple
    ``(title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)``.

    :param areaName: area name; also the base name of the output file
    :param url: URL of the listing page to fetch
    :return: None
    '''
    import os

    def first(node, path):
        # First XPath match, or '' when there is none, so that one listing with
        # a missing sub-element does not raise IndexError and abort the page.
        found = node.xpath(path)
        return found[0] if found else ''

    # A timeout keeps a stalled connection from blocking this greenlet forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")
    rows = []
    for house in sellList:
        title = first(house, './/div[@class="title"]/a/text()')
        if not title:
            # Without a title link the entry is not a usable listing.
            continue
        houseurl = first(house, './/div[@class="title"]/a/@href')
        houseInfo = (first(house, './/div[@class="houseInfo"]/a/text()')
                     + first(house, './/div[@class="houseInfo"]/text()'))
        positionInfo = (first(house, './/div[@class="positionInfo"]/text()')
                        + first(house, './/div[@class="positionInfo"]/a/text()'))
        price = first(house, './/div[@class="totalPrice"]/span/text()')
        totalPrice = price + '万' if price else ''
        unitPrice = first(house, './/div[@class="unitPrice"]/span/text()')
        rows.append(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')
    if not rows:
        return
    # One progress line per house, as before.
    for _ in rows:
        print(areaName)
    # open() does not create missing directories; exist_ok=True also tolerates
    # another worker process creating ./hz at the same moment.
    os.makedirs("./hz", exist_ok=True)
    # Open the file once per page instead of once per house; closing the file
    # flushes it, so no explicit flush is needed.
    with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
        f.writelines(rows)
def getPageNum(areaName, url):
    '''
    Read the area's total page count and scrape every page concurrently.

    The count is the ``totalPage`` key of the JSON stored in the pagination
    box's ``page-data`` attribute. One greenlet running ``gethouseInfo`` is
    spawned per page URL (``<url>pg<N>/``), and this function returns once all
    of them have finished.

    :param areaName: area name, passed through to ``gethouseInfo``
    :param url: URL of the area's first listing page
    :return: None
    '''
    # A timeout keeps a stalled connection from blocking this process forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    pageData = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')
    # If the page has no pagination box, scrape a single page instead of
    # raising IndexError and skipping the whole area.
    totalPage = int(json.loads(pageData[0])['totalPage']) if pageData else 1
    # The page suffix must follow exactly one '/'.
    baseurl = url if url.endswith('/') else url + '/'
    geventList = [
        gevent.spawn(gethouseInfo, areaName, "%spg%d/" % (baseurl, i))
        for i in range(1, totalPage + 1)
    ]
    gevent.joinall(geventList)
if __name__ == '__main__':
    # Script entry point: fetch the area list, then run getPageNum for each
    # area in its own process and print the elapsed time.
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the documented
    # replacement for measuring elapsed time.
    startTime = time.perf_counter()
    print(areaDict)
    processList = []
    for areaName, url in areaDict.items():
        p = multiprocessing.Process(target=getPageNum, args=(areaName, url))
        processList.append(p)
        p.start()
    # Wait for every worker process before reporting the elapsed time.
    for p in processList:
        p.join()
    print(time.perf_counter() - startTime)