使用多线程抓取某招聘网站的职位信息,运行环境为 Python 3,话不多说,直接上代码。
首先导入需要用到的包,一个都不能少:
import threading
import requests
from pyquery import PyQuery as pq
import json
from pymongo import MongoClient
from fake_useragent import UserAgent
import time
from config import *
我这里使用 MongoDB 存储数据;fake_useragent 是一个用来随机生成 User-Agent 的第三方库,可按需使用。`from config import *` 导入的是我自己写的配置文件(内容见文末),大家可以换成自己的配置文件。
# Read the MongoDB connection settings once from the config class
# (``zhilianzhaopin`` comes from ``from config import *``). The original
# called ``().mogodb()``, i.e. a method on an empty tuple, which raises
# AttributeError at import time.
_mongo_settings = zhilianzhaopin().mogodb()
client = MongoClient(_mongo_settings["IP"], _mongo_settings["port"])
# Shared generator of random User-Agent header values.
u = UserAgent()
class data():
    """Scrape a range of Zhaopin search-result pages into MongoDB.

    Each instance handles the page indices ``startpage`` (inclusive) to
    ``page`` (exclusive) for one city code and one industry code. It relies
    on the module-level ``client`` (MongoClient) and ``u`` (UserAgent).
    """

    # Seconds to wait for any single HTTP response; without a timeout a
    # stalled connection would block the calling thread forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, startpage, page, citynum, hangyenum):
        """Store the page range and the city / industry codes to query."""
        self.startpage = startpage
        self.page = page
        self.citynum = citynum
        self.hangyenum = hangyenum

    def datature(self):
        """Scrape every page in the configured range, one page at a time.

        A page that fails is reported and skipped; the loop then moves on
        to the next page. (The original recursed into ``datature`` from the
        ``except`` handler inside the loop, so every later page was scraped
        again once the recursion returned, and recursion depth was
        unbounded.) ``self.startpage`` always ends up pointing at the next
        page that has not been attempted yet.
        """
        # NOTE(review): a start of 0 is bumped to 1, so page index 0
        # (offset start=0) is never requested - confirm this is intended.
        if self.startpage == 0:
            self.startpage = 1
        for i in range(self.startpage, self.page):
            try:
                self._scrape_page(i)
                print(i)
            except Exception as exc:
                # Deliberately best-effort: report the failing page together
                # with the cause, then continue. ``Exception`` (not a bare
                # ``except``) keeps Ctrl-C / SystemExit working.
                print("请求次数过多或程序出错错误页:", i, exc)
            self.startpage = i + 1

    def _fetch_proxy(self):
        """Ask the proxy service for an ``ip:port`` pair and build the proxies dict."""
        # Placeholder URL from the original post: replace with a real proxy API.
        yu = requests.get('这里是一个代理url', timeout=self.REQUEST_TIMEOUT).text.strip()
        # Raises ValueError when the reply is not of the form ip:port.
        ip, port = yu.split(':')[:2]
        proxy_http = "http://{0}:{1}".format(ip, port)
        # NOTE(review): only the "http" scheme is mapped while the URLs
        # requested below are https; requests picks a proxy by URL scheme,
        # so confirm whether an "https" entry is wanted as well.
        return {"http": proxy_http}

    def _scrape_page(self, i):
        """Fetch search-result page ``i`` and store every job listed on it."""
        headers = {
            'User-Agent': u.random,
        }
        proxy_dict = self._fetch_proxy()
        print(proxy_dict)
        url = 'https://fe-api.zhaopin.com/c/i/sou?start={0}&pageSize=60&cityId={1}&industry={2}&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kt=3&_v=0.64455588&x-zp-page-request-id=2fb2f4e53284471aa21894bf291db027-1543801568582-396367'.format(str(i * 60), self.citynum, self.hangyenum)
        print(self.startpage, self.page)
        r = requests.get(url, headers=headers, proxies=proxy_dict,
                         timeout=self.REQUEST_TIMEOUT)
        dict1 = json.loads(r.text)
        for j in dict1['data']['results']:
            self._scrape_job(j['positionURL'], headers, proxy_dict)

    def _scrape_job(self, href, headers, proxy_dict):
        """Download one job detail page, parse it and insert it into MongoDB."""
        print(href)
        r1 = requests.get(href, headers=headers, proxies=proxy_dict,
                          timeout=self.REQUEST_TIMEOUT)
        doc = pq(r1.text)
        title = doc('.new-info h1.l.info-h3').text()
        xinzhi = doc('.new-info div.l.info-money strong').text()
        company = doc('li.clearfix div.company.l a').text()
        jianjie = doc('li.clearfix div.info-three.l').text()
        zhiweixinxi = doc('div.l.pos-info-in div.pos-info-tit p.r ').text()
        zhiweimiaoshu = doc('div.responsibility.pos-common div.pos-ul').text()
        gongsigaikuang = doc('div.intro-content div[align="left"]').text()
        lianxifangshi = doc('div.intro-content div[style="font-family: 宋体;"]').text()

        # The summary line must split into exactly four space-separated
        # fields; anything else (or a missing title) is not stored.
        list1 = jianjie.split(' ')
        if title == "" or len(list1) != 4:
            return
        dizhi, jingyan, xueli, zhaopinrenshu = list1

        post = {
            'title': title,
            'xinzhi': xinzhi,
            # Key spelling kept as in the original so that documents already
            # stored keep the same field name.
            'conmpany': company,
            'dizhi': dizhi,
            'jingyan': jingyan,
            'xueli': xueli,
            'zhaopinrenshu': zhaopinrenshu,
            # Only the first space-separated token is kept.
            'zhiweixinxi': zhiweixinxi.split(' ')[0],
            'zhiweimiaoshu': zhiweimiaoshu,
            'gongsigaikuang': gongsigaikuang,
            'lianxifangshi': lianxifangshi,
            'html': href,
        }
        # Placeholder database name from the original post; the collection
        # is "zhaopin".
        db = client.自己的数据库名1
        db.zhaopin.insert_one(post)
下面使用多线程来执行上面这段请求与解析的逻辑:
def action(a, b, c, d):
    """Thread entry point: scrape pages ``a`` up to ``b`` for city ``c`` and industry ``d``.

    ``a`` and ``b`` are converted with ``int`` before being handed to
    ``data``; ``c`` and ``d`` are passed through unchanged.
    """
    first_page = int(a)
    last_page = int(b)
    data(first_page, last_page, c, d).datature()
if __name__ == "__main__":
    # Sizes of the index -> name tables in the config class: cities are
    # numbered 0..26 and industries 0..12. The original loops used
    # range(26) / range(12) and therefore never reached the last entry of
    # either table.
    CITY_COUNT = 27
    INDUSTRY_COUNT = 13
    # Number of equally sized page slices; one extra thread takes the rest.
    SLICE_COUNT = 10

    while True:
        print('88888888此程序启动为默认程序8888888888')
        for city_index in range(CITY_COUNT):
            citynum = zhilianzhaopin().sheng(city_index)["citynum"]
            for industry_index in range(INDUSTRY_COUNT):
                hangyenum = zhilianzhaopin().hangye(industry_index)["hangyenum"]
                print('点击查看多少页,默认程序是101页:', end='')
                print('https://sou.zhaopin.com/?jl={0}&in={1}'.format(citynum, hangyenum))
                page = zhilianzhaopin().zongyeshu(101)[1]
                time.sleep(5)
                fenyepage = int(page) // SLICE_COUNT
                duandianpage = fenyepage * SLICE_COUNT
                if fenyepage != 0:
                    # Distinct loop variable: the original reused ``i`` from
                    # the city loop here.
                    workers = [
                        threading.Thread(
                            target=action,
                            args=(k * fenyepage, (k + 1) * fenyepage, citynum, hangyenum),
                        )
                        for k in range(SLICE_COUNT)
                    ]
                    # Last thread covers the pages left over after the
                    # equal slices.
                    workers.append(threading.Thread(
                        target=action,
                        args=(duandianpage, page, citynum, hangyenum),
                    ))
                    for t in workers:
                        t.start()
                    # Wait for this batch before starting the next
                    # city/industry pair; the original never joined, so the
                    # number of live threads grew without bound.
                    for t in workers:
                        t.join()
                else:
                    # Fewer than SLICE_COUNT pages: scrape in this thread.
                    action(1, page, citynum, hangyenum)
这下面是配置文件,可供参考:
class zhilianzhaopin():
    """Configuration for the scraper: MongoDB settings and lookup tables.

    On a missing or invalid setting the methods print a message and return
    ``None`` rather than raising.
    """

    # index -> (city name, city code used in the search URL)
    _CITIES = {
        0: ('北京', '530'), 1: ('上海', '538'), 2: ('深圳', '765'),
        3: ('广州', '763'), 4: ('天津', '537'), 5: ('成都', '801'),
        6: ('杭州', '653'), 7: ('武汉', '736'), 8: ('大连', '600'),
        9: ('长春', '631'), 10: ('南京', '635'), 11: ('济南', '702'),
        12: ('青岛', '703'), 13: ('苏州', '639'), 14: ('沈阳', '599'),
        15: ('西安', '854'), 16: ('郑州', '719'), 17: ('长沙', '749'),
        18: ('重庆', '551'), 19: ('哈尔滨', '622'), 20: ('无锡', '636'),
        21: ('宁波', '654'), 22: ('福州', '681'), 23: ('厦门', '682'),
        24: ('石家庄', '515'), 25: ('合肥', '664'), 26: ('惠州', '773'),
    }

    # index -> (industry name, industry code used in the search URL)
    _INDUSTRIES = {
        0: ('互联网/IT', '10100'), 1: ('金融', '10200'),
        2: ('房地产/建筑', '10800'), 3: ('商业服务', '10900'),
        4: ('贸易/批发/零售', '10300'), 5: ('教育/艺术', '10400'),
        6: ('服务业', '10000'), 7: ('文化/传媒/娱乐', '11300'),
        8: ('制造业', '10500'), 9: ('物流运输', '11500'),
        10: ('能源/环保', '11600'), 11: ('政府/非盈利', '11100'),
        12: ('农林牧渔', '11400'),
    }

    def mogodb(self):
        """Return ``{"IP": ..., "port": ...}`` for MongoDB, or ``None`` if unset.

        Both values are placeholders to be filled in by the user. The
        original left ``port=`` with no value, which is a syntax error; an
        empty string is used instead so the validation below reports it.
        """
        # MongoDB host
        IP = ''
        # MongoDB port
        port = ''
        a = {"IP": IP, "port": port}
        if a["IP"] != "" and a["port"] != "":
            return a
        print("配置文件mongodb参数缺失")
        return None

    # City lookup
    def sheng(self, cityxuhao):
        """Return ``{"citynum": code}`` for the city at index ``cityxuhao``.

        Prints the city name. Raises ``KeyError`` for an index that is not
        in the table.
        """
        city, code = self._CITIES[cityxuhao]
        print(city)
        if code != "":
            return {'citynum': code}
        print("配置文件省会名参数出错")
        return None

    # Industry lookup
    def hangye(self, hangyexuhao):
        """Return ``{"hangyenum": code}`` for the industry at index ``hangyexuhao``.

        Prints the industry name. Raises ``KeyError`` for an index that is
        not in the table.
        """
        hangye, code = self._INDUSTRIES[hangyexuhao]
        print(hangye)
        if code != "":
            return {"hangyenum": code}
        print("配置文件行业中参数缺失")
        return None

    # Total number of pages
    def zongyeshu(self, page):
        """Return ``(1, int(page))``, or ``None`` when ``page`` is not an integer.

        The original retried with ``self.zongyeshu()`` (no argument), which
        raised ``TypeError``; retrying the same fixed argument cannot
        succeed anyway, so the invalid value is reported and ``None`` is
        returned.
        """
        startpage = 1
        try:
            page = int(page)
        except (TypeError, ValueError):
            print('输入有误!!!')
            return None
        return startpage, page
有啥可以帮助的可以联系1307761253