1. 数据采集
首先,我们需要通过网络爬虫技术从招聘网站上获取数据。爬虫可以自动地访问网站并抓取所需的数据,例如职位信息、公司信息、薪资水平等。在选择爬虫工具时,需要考虑目标网站的结构和反爬虫机制,确保能够稳定高效地获取数据。
import csv
import time

import execjs
import requests

from storage.csv2mysql import sync_data2db

# Liepin PC search endpoint used by every request below.
API_URL = "https://api-c.liepin.com/api/com.liepin.searchfront4c.pc-search-job"

# Industry filter codes accepted by the search API; one crawl pass per code.
INDUSTRY_CODES = [
    "H01$H0001", "H01$H0002", "H01$H0003", "H01$H0004", "H01$H0005",
    "H01$H0006", "H01$H0007", "H01$H0008", "H01$H0009", "H01$H00010",
    "H02$H0018", "H02$H0019", "H03$H0022", "H03$H0023", "H03$H0024",
    "H03$H0025", "H04$H0030", "H04$H0031", "H04$H0032", "H05$H05",
    "H06$H06", "H07$H07", "H08$H08",
]


def read_js_code():
    """Compile demo.js and call its ``r(32)`` to obtain a fresh ``ckId`` token.

    Returns:
        The token string produced by the site's own JavaScript; the search
        API rejects requests without it.
    """
    # `with` guarantees the handle is closed (the original leaked it).
    with open('/Users/shareit/workspace/chart_show/demo.js', encoding='utf-8') as f:
        txt = f.read()
    js_code = execjs.compile(txt)
    return js_code.call('r', 32)


def post_data():
    """Crawl the first 10 result pages for every industry code and persist them.

    Side effects: issues HTTP POSTs to Liepin and writes rows to the database
    via ``parse_data`` -> ``sync_data2db``. Sleeps 2s between requests to
    stay below the site's rate limits.
    """
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
    }
    for industry in INDUSTRY_CODES:
        print("-------{}---------".format(industry))
        for page in range(10):
            print("------------第{}页-----------".format(page))
            data = {
                "data": {
                    "mainSearchPcConditionForm": {
                        # city/dq 410 plus workYearCode "1": fresh-graduate
                        # postings for one fixed region — TODO confirm mapping.
                        "city": "410", "dq": "410", "pubTime": "",
                        "currentPage": page, "pageSize": 40, "key": "",
                        "suggestTag": "", "workYearCode": "1", "compId": "",
                        "compName": "", "compTag": "", "industry": industry,
                        "salary": "", "jobKind": "", "compScale": "",
                        "compKind": "", "compStage": "", "eduLevel": "",
                    },
                    "passThroughForm": {
                        "scene": "page",
                        "skId": "z33lm3jhwza7k1xjvcyn8lb8e9ghxx1b",
                        "fkId": "z33lm3jhwza7k1xjvcyn8lb8e9ghxx1b",
                        # Fresh token per request. (The original also called
                        # read_js_code() once at the top of post_data and
                        # discarded the result — removed as dead work.)
                        "ckId": read_js_code(),
                        'sfrom': 'search_job_pc',
                    },
                },
            }
            response = requests.post(url=API_URL, json=data, headers=headers)
            time.sleep(2)  # throttle between pages
            parse_data(response)


def parse_data(response):
    """Extract ``jobCardList`` from a search response and persist it.

    Best-effort: a malformed, empty, or blocked response is reported and
    skipped rather than aborting the whole crawl.
    """
    try:
        job_cards = response.json()['data']['data']['jobCardList']
    except (ValueError, KeyError, TypeError) as e:
        # The original swallowed every exception silently; at least log it.
        print("skip page: {!r}".format(e))
        return
    sync_data2db(job_cards)


if __name__ == '__main__':
    post_data()
2. 数据预处理
获取到的原始数据往往杂乱无章,需要进行预处理才能进行后续的分析工作。预处理包括数据清洗、去重、缺失值处理、数据格式转换等环节,以确保数据的质量和一致性。在这一阶段,还可以利用自然语言处理技术对文本数据进行分词、词性标注等操作,为后续的分析提供更多维度的信息。然后将数据加载到 Hive 中进行分析。
-- Hive table holding the job postings exported by the crawler as CSV.
-- One column per field kept from Liepin's jobCardList payload.
CREATE TABLE mydb.data (
    id              INT,
    title           STRING,
    city            STRING,
    salary          STRING,
    campus_job_kind STRING,
    labels          STRING,
    compName        STRING,
    compIndustry    STRING,
    compScale       STRING
)
COMMENT '数据表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Replace the table contents with the CSV already uploaded to HDFS.
LOAD DATA INPATH '/file.csv' OVERWRITE INTO TABLE mydb.data;
3. 数据分析
有了清洗和存储好的数据,接下来就是进行数据分析。数据分析的方法多种多样,可以根据具体的需求选择合适的分析技术和模型。常见的数据分析技术包括统计分析、机器学习、文本挖掘等。通过对
招聘数据的分析,我们可以发现人才市场的热点行业、热门职位、薪资水平等信息,为企业招聘决策提供参考。
def _query_all(sql):
    """Run *sql* on the shared ``connection`` and return all rows.

    Rows are dicts keyed by column alias (DictCursor assumed — confirm
    against the connection setup). Returns an empty list on failure so the
    dashboard renders with empty data instead of crashing.
    """
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()
    except Exception as e:  # best-effort dashboard query; log and degrade
        print(e)
        return []


def city_count_from_db():
    """Return up to 10 ``(city, posting_count)`` pairs for the city pie chart."""
    rows = _query_all(
        "select * from (select city,count(1) cnt FROM data group by city)a limit 10")
    return [(row['city'], row['cnt']) for row in rows]


def salary_avg_from_db():
    """Return ``(cities, avg_salaries)`` for up to 20 cities.

    Averages are truncated to int for chart display.
    """
    rows = _query_all(
        "select * from (select city,avg(salary) avg FROM data group by city)a limit 20")
    x = [row['city'] for row in rows]
    y = [int(row['avg']) for row in rows]
    return x, y


def salary_industry_from_db():
    """Return ``(industries, avg_salaries)`` for up to 20 industries."""
    rows = _query_all(
        "select * from (select compIndustry,avg(salary) avg FROM data group by compIndustry)a limit 20")
    x = [row['compIndustry'] for row in rows]
    y = [int(row['avg']) for row in rows]
    return x, y


def salary_title_from_db():
    """Return ``(titles, counts)`` for the 10 most frequent job titles.

    NOTE(review): despite the name, this aggregates posting *counts* per
    title, not salaries — kept as-is because callers depend on the name.
    """
    rows = _query_all(
        "select title,count(1) cnt from data group by title order by cnt desc limit 10")
    x = [row['title'] for row in rows]
    y = [int(row['cnt']) for row in rows]
    return x, y


def comany_from_db():
    """Return up to 10 ``(company_name, posting_count)`` pairs.

    NOTE(review): "comany" is a typo for "company", preserved because the
    name is the public interface.
    """
    rows = _query_all(
        "select compName,count(1) cnt FROM data group by compName order by cnt desc limit 10")
    return [(row['compName'], row['cnt']) for row in rows]
4. 数据可视化
最后,为了更直观地展示分析结果,我们利用Django框架搭建了数据可视化的平台。Django是一个高效的Web开发框架,通过它可以快速构建出美观、易用的数据可视化界面。在可视化界面上,我们可以展示招聘数据的各种统计图表、热点地图、词云等,帮助用户更直观地理解数据背后的信息。
def bar_chart(request):
    """Render the dashboard page with five embedded pyecharts charts.

    Pulls pre-aggregated stats (salary by city, postings by title, salary by
    industry, top-10 cities, top-10 companies) and passes each chart's
    embeddable snippet to the ``charts/bar_chart.html`` template under the
    keys the template expects (``line``/``bar``/``pie``/``line1``/``pie1``).
    """
    # Line chart: average graduate salary per city.
    salary_line = Line()
    cities, city_salaries = salary_avg()
    salary_line.add_xaxis(cities)
    salary_line.add_yaxis("全国应届毕业生就业城市薪资分布图", city_salaries)

    # Line chart: posting counts per job title; labels rotated to fit.
    title_line = Line().set_global_opts(
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=90)),
    )
    titles, title_counts = title_count()
    title_line.add_xaxis(titles)
    title_line.add_yaxis("全国应届毕业生就业岗位分布图", title_counts)

    # Bar chart: average salary per industry; labels rotated to fit.
    industry_bar = Bar().set_global_opts(
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=90)),
    )
    industries, industry_salaries = industry_avg()
    industry_bar.add_xaxis(industries)
    industry_bar.add_yaxis("全国应届毕业生就业领域薪资分布图", industry_salaries)

    # Pie charts: top-10 cities and top-10 employers by posting count.
    # (Locals renamed from `tuple`/`tuple1`, which shadowed the builtin.)
    city_pie = Pie()
    city_pie.add("全国应届毕业生就业城市top10", city_top())

    company_pie = Pie()
    company_pie.add("全国应届毕业生就业公司top10", comany_count())

    # render_embed() yields the standalone HTML/JS snippet for each chart.
    return render(request, 'charts/bar_chart.html', {
        'line': salary_line.render_embed(),
        'bar': industry_bar.render_embed(),
        'pie': city_pie.render_embed(),
        'line1': title_line.render_embed(),
        'pie1': company_pie.render_embed(),
    })
下面是数据分析的展示结果,喜欢的可以加个收藏点个关注哦,更多毕设相关内容,小编将持续分享哦