1. 用到的技术
爬虫 ——> xpath
数据可视化 ——> matplotlib
2. 爬虫
1. 正常爬取
# -*- coding:UTF-8 -*-
"""Sequential crawler for the 17173 game ranking list.

Walks the ranking list pages, follows every game's detail page and appends
one CSV row per game to ``gameRank.csv``.

Target site: https://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-1.html
"""
import csv
import urllib.parse

import requests
from lxml import etree

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
}
# Value stored for any field that is missing on the detail page.
PLACEHOLDER = "暂无"
# XPath prefix of the info list on a game's detail page.
INFO_LIST = '//ul[@class="list-mater-info"]'
OUTPUT_FILE = "gameRank.csv"
# Number of ranking list pages to walk (pages are numbered from 1).
PAGE_COUNT = 70
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def _first(nodes, default=PLACEHOLDER):
    """Return the first item of an XPath result list, or *default* if empty."""
    return nodes[0] if nodes else default


def parse_html(url):
    """Download *url* and return the response body as text.

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    return resp.text


def fetch(home_url):
    """Parse one ranking list page and process every game listed on it.

    For each entry the rank, detail-page link, game name and vote count are
    read from the list page and handed to ``fetch_child``.
    """
    home_page = etree.HTML(parse_html(home_url))
    entries = home_page.xpath('//div[@class="main-c1"]/div/div[2]/div/div[2]/ul/li')
    for li in entries:
        # Skip the advertisement slot. xpath() returns a list of strings, so
        # the attribute has to be compared as the string "4"; comparing the
        # list itself with the integer 4 can never be true.
        if "4" in li.xpath('./@data-index'):
            continue

        ranks = li.xpath('./div/div[1]/em/text()')
        links = li.xpath('./div/div[2]/div/a/@href')
        names = li.xpath('./div/div[2]/div/a/text()')
        vote_texts = li.xpath('./div/div[3]/text()')
        # Entries without the expected structure cannot be a ranked game.
        if not (ranks and links and names and vote_texts):
            continue

        rank = ranks[0]
        child_url = "https:" + links[0]
        name = names[0]
        votes = vote_texts[0].strip()
        try:
            fetch_child(child_url, rank, name, votes)
        except requests.RequestException as exc:
            # One unreachable detail page should not abort the whole crawl.
            print(name, "failed:", exc)


def fetch_child(child_url, rank, name, votes):
    """Parse a game's detail page and write the combined record.

    Reads label, type, language, developer, registration info and operator
    from the detail page. Every field that is absent keeps the placeholder
    value. The record is passed to ``writer`` together with *rank*, *name*
    and *votes* from the list page.
    """
    child_page = etree.HTML(parse_html(child_url))
    game_label = PLACEHOLDER
    game_type = PLACEHOLDER
    game_language = PLACEHOLDER
    game_developer = PLACEHOLDER
    game_registered = PLACEHOLDER
    game_operator = PLACEHOLDER

    # A missing info list means the game page does not exist and the site
    # redirected somewhere else; keep the placeholders in that case.
    if child_page.xpath(INFO_LIST + '/li'):
        labels = child_page.xpath('//div[@class="box-mater-cate"]/a/text()')
        game_label = "|".join(labels) or PLACEHOLDER

        game_type = _first(child_page.xpath(INFO_LIST + '/li[1]/a/text()'))

        languages = child_page.xpath(INFO_LIST + '/li[2]/div/span/text()')
        game_language = "".join(languages) or PLACEHOLDER

        game_developer = _first(child_page.xpath(INFO_LIST + '/li[3]/span[2]/text()'))

        # The registration entry is either plain text in a second <span> or
        # a link whose last query-string value carries the URL-encoded text.
        if len(child_page.xpath(INFO_LIST + '/li[4]/span')) == 2:
            game_registered = _first(child_page.xpath(INFO_LIST + '/li[4]/span[2]/text()'))
        else:
            hrefs = child_page.xpath(INFO_LIST + '/li[4]/a/@href')
            if hrefs:
                game_registered = urllib.parse.unquote(hrefs[0].rsplit("=", 1)[-1])

        game_operator = _first(child_page.xpath(INFO_LIST + '/li[5]//span[2]/text()'))

    msg = [rank, name, votes, game_label, game_type, game_language,
           game_developer, game_registered, game_operator]
    writer(msg)


def writer(msg):
    """Append one record to the CSV output file.

    Args:
        msg: sequence whose first nine items are rank, name, votes, label,
            type, language, developer, registration info and operator.
    """
    # errors="replace" keeps a single character that GBK cannot encode from
    # aborting the crawl.
    with open(OUTPUT_FILE, mode="a", encoding="GBK", errors="replace", newline="") as f:
        print(msg[1], "开始")
        # csv.writer quotes values containing commas or quotes, which plain
        # string concatenation would turn into extra columns.
        csv.writer(f, lineterminator="\n").writerow(msg[:9])
        print(msg[1], "over!")


if __name__ == '__main__':
    for page in range(1, PAGE_COUNT + 1):
        url = f"https://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-{page}.html"
        fetch(url)
2. 异步协程爬取
"""Asynchronous (coroutine-based) crawler for the 17173 game ranking list.

Downloads ranking list pages, fetches all detail pages of one list page
concurrently and appends one CSV row per game to ``gameRank.csv``.

Target site: https://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-1.html
"""
import asyncio
import csv
import io
import urllib.parse

import aiofiles
import aiohttp
from lxml import etree

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
}
# Value stored for any field that is missing on the detail page.
PLACEHOLDER = "暂无"
# XPath prefix of the info list on a game's detail page.
INFO_LIST = '//ul[@class="list-mater-info"]'
# Number of ranking list pages to walk.
PAGE_COUNT = 10

# Kept for backward compatibility: main() called without an argument reads
# the list page URL from this module-level name.
home_url = None


def _first(nodes, default=PLACEHOLDER):
    """Return the first item of an XPath result list, or *default* if empty."""
    return nodes[0] if nodes else default


async def parse_html(url):
    """Download *url* and return the response body as text."""
    async with aiohttp.ClientSession() as session:
        # NOTE (from the original author): a proxy is passed like in
        # requests, via the `proxy` argument of get(); the value is a string
        # and only http proxies are accepted, https raises an error.
        async with session.get(url, headers=HEADERS) as resp:
            return await resp.text()


async def fetch(child_url, rank, name, votes):
    """Parse a game's detail page and append the combined record to the CSV.

    Reads label, type, language, developer, registration info and operator
    from the detail page; every field that is absent keeps the placeholder
    value. *rank*, *name* and *votes* come from the list page.
    """
    child_page = etree.HTML(await parse_html(child_url))
    game_label = PLACEHOLDER
    game_type = PLACEHOLDER
    game_language = PLACEHOLDER
    game_developer = PLACEHOLDER
    game_registered = PLACEHOLDER
    game_operator = PLACEHOLDER

    # A missing info list means the game page does not exist and the site
    # redirected somewhere else; keep the placeholders in that case.
    if child_page.xpath(INFO_LIST + '/li'):
        labels = child_page.xpath('//div[@class="box-mater-cate"]/a/text()')
        game_label = "|".join(labels) or PLACEHOLDER

        game_type = _first(child_page.xpath(INFO_LIST + '/li[1]/a/text()'))

        languages = child_page.xpath(INFO_LIST + '/li[2]/div/span/text()')
        game_language = "".join(languages) or PLACEHOLDER

        game_developer = _first(child_page.xpath(INFO_LIST + '/li[3]/span[2]/text()'))

        # The registration entry is either plain text in a second <span> or
        # a link whose last query-string value carries the URL-encoded text.
        if len(child_page.xpath(INFO_LIST + '/li[4]/span')) == 2:
            game_registered = _first(child_page.xpath(INFO_LIST + '/li[4]/span[2]/text()'))
        else:
            hrefs = child_page.xpath(INFO_LIST + '/li[4]/a/@href')
            if hrefs:
                game_registered = urllib.parse.unquote(hrefs[0].rsplit("=", 1)[-1])

        game_operator = _first(child_page.xpath(INFO_LIST + '/li[5]//span[2]/text()'))

    row = [rank, name, votes, game_label, game_type, game_language,
           game_developer, game_registered, game_operator]
    # Progress/debug output of the scraped record.
    print(*row)

    # Format the row with the csv module so values containing commas or
    # quotes are quoted instead of silently shifting the columns.
    buffer = io.StringIO()
    csv.writer(buffer, lineterminator="\n").writerow(row)

    # errors="replace" keeps a single character that GBK cannot encode from
    # failing the task.
    async with aiofiles.open("gameRank.csv", mode="a", encoding="GBK",
                             errors="replace", newline="") as f:
        print(name, "开始")
        await f.write(buffer.getvalue())
        print(name, "over!")


async def main(page_url=None):
    """Crawl one ranking list page and all game detail pages linked from it.

    Args:
        page_url: URL of the list page. When omitted, the module-level
            ``home_url`` is used (the original calling convention).

    Raises:
        ValueError: if neither *page_url* nor ``home_url`` is set.
    """
    if page_url is None:
        page_url = home_url
    if page_url is None:
        raise ValueError("no list page URL given")

    home_page = etree.HTML(await parse_html(page_url))
    entries = home_page.xpath('//div[@class="main-c1"]/div/div[2]/div/div[2]/ul/li')

    names = []
    jobs = []
    for li in entries:
        # Skip the advertisement slot. xpath() returns a list of strings, so
        # the attribute has to be compared as the string "4"; comparing the
        # list itself with the integer 4 can never be true.
        if "4" in li.xpath('./@data-index'):
            continue

        ranks = li.xpath('./div/div[1]/em/text()')
        links = li.xpath('./div/div[2]/div/a/@href')
        titles = li.xpath('./div/div[2]/div/a/text()')
        vote_texts = li.xpath('./div/div[3]/text()')
        # Entries without the expected structure cannot be a ranked game.
        if not (ranks and links and titles and vote_texts):
            continue

        names.append(titles[0])
        jobs.append(fetch("https:" + links[0], ranks[0], titles[0],
                          vote_texts[0].strip()))

    # gather() accepts an empty job list (asyncio.wait raises ValueError on
    # one) and hands back task exceptions instead of leaving them unretrieved.
    results = await asyncio.gather(*jobs, return_exceptions=True)
    for title, result in zip(names, results):
        if isinstance(result, Exception):
            print(title, "failed:", repr(result))


if __name__ == '__main__':
    # Create a dedicated event loop and make sure it is closed at the end.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # List pages are numbered from 1 (see the target URL above); the
        # original loop started at page 0.
        for page in range(1, PAGE_COUNT + 1):
            url = f"https://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-{page}.html"
            loop.run_until_complete(main(url))
    finally:
        loop.close()
3. 数据可视化
"""Charts for the scraped game ranking data (``gameRank.csv``).

The CSV is expected to provide the columns ``type``, ``developer``,
``popularity`` and ``game_name``, which are the ones read below.
"""
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Apply the ggplot look first so that the style sheet cannot override the
# explicit font settings that follow.
plt.style.use('ggplot')
# Fonts able to render Chinese text: prefer KaiTi, then fall back to SimHei
# and FangSong.
mpl.rcParams['font.sans-serif'] = ['KaiTi', 'SimHei', 'FangSong']
# Font size
mpl.rcParams['font.size'] = 12
# Render the minus sign correctly with these fonts
mpl.rcParams['axes.unicode_minus'] = False

# Placeholder the crawler stores for unknown values.
PLACEHOLDER = "暂无"

# Load the scraped data
csv_data = pd.read_csv("../reptile/gameRank.csv", encoding="GBK")


def _count_by(column, frame=None):
    """Return a frame with *column* and a ``size`` column (rows per value)."""
    frame = csv_data if frame is None else frame
    return frame.groupby(column, as_index=False).size()


def _top_developers(limit=20):
    """Return the *limit* developers with the most games, largest first.

    Rows whose developer is the placeholder are left out. The original code
    removed one hard-coded index label (1046) after sorting, presumably that
    placeholder row; filtering by value does not depend on the dataset's
    row numbering and cannot raise KeyError.
    """
    known = csv_data[csv_data["developer"] != PLACEHOLDER]
    counts = _count_by("developer", known)
    return counts.sort_values(by='size', ascending=False).head(limit)


def _barh_with_labels(names, values, title):
    """Draw a horizontal bar chart with the value printed next to each bar."""
    fig, ax = plt.subplots()
    bars = ax.barh(range(len(names)), values, color='#ff9999')
    # Value labels at the end of every bar
    for rect in bars:
        width = rect.get_width()
        ax.text(width, rect.get_y() + rect.get_height() / 2,
                '%d' % int(width), ha='left', va='center')
    # One y tick per bar, labelled with the bar's name
    ax.set_yticks(range(len(names)))
    ax.set_yticklabels(names)
    plt.title(title)
    plt.show()


def picture01():
    """1 - Bar chart: number of games per game type."""
    counts = _count_by("type")
    game_types = counts["type"].values
    numbers = counts["size"].values

    plt.bar(game_types, numbers, facecolor='#ff9999', edgecolor='white')
    # Print each count centred on top of its bar
    for game_type, number in zip(game_types, numbers):
        plt.text(game_type, number, number, ha='center', va='bottom')
    # Vertical tick labels so the type names do not overlap
    plt.xticks(rotation=270)
    plt.xlabel("type")
    plt.ylabel("number")
    plt.title("1-各游戏类型条形图")
    plt.show()


def picture02():
    """2 - Pie chart: share of each game type."""
    # Equal aspect ratio keeps the pie circular instead of elliptical
    plt.axes(aspect='equal')
    counts = _count_by("type")
    game_types = counts["type"].values
    numbers = counts["size"].values
    # Fraction of all games that belongs to each type
    shares = numbers / numbers.sum()
    plt.pie(
        x=shares,  # data to plot
        labels=game_types,  # game type labels
        autopct='%.2f%%',  # percentage format, two decimals
        pctdistance=0.8,  # distance of percentage labels from the centre
        labeldistance=1.05,  # distance of type labels from the centre
        startangle=180,  # starting angle of the first wedge
        radius=1.1,  # pie radius
        counterclock=False,  # draw wedges clockwise
        wedgeprops={'linewidth': 1.5, 'edgecolor': 'green'},  # wedge borders
        textprops={'fontsize': 5, 'color': 'black'},  # label text style
    )
    plt.title('2-各大游戏类型分布')
    plt.show()


def picture03():
    """3 - Line chart: the 20 developers with the most games."""
    top = _top_developers(20)
    developers = top["developer"].values
    numbers = top["size"].values
    plt.xticks(rotation=270)
    plt.title("3-各游戏隶属厂商TOP20")
    plt.plot(developers, numbers)
    plt.show()


def picture04():
    """4 - Radar chart: the 10 game types with the most games."""
    counts = _count_by("type")
    top = counts.sort_values(by='size', ascending=False).head(10)
    labels = top["type"].values
    values = top["size"].values

    # One spoke per type, evenly spread around the circle
    tick_angles = np.linspace(0, 2 * np.pi, len(values), endpoint=False)
    # Repeat the first point at the end so the outline is closed
    closed_values = np.concatenate((values, [values[0]]))
    closed_angles = np.concatenate((tick_angles, [tick_angles[0]]))

    fig = plt.figure()
    # Polar axes
    ax = fig.add_subplot(111, polar=True)
    # Outline
    ax.plot(closed_angles, closed_values, 'o-', linewidth=2)
    # Filled area
    ax.fill(closed_angles, closed_values, alpha=0.25)
    # Label every spoke with its game type
    ax.set_thetagrids(tick_angles * 180 / np.pi, labels)
    # Radial range: at least 0-500 as before, widened when a count would
    # otherwise be clipped
    ax.set_ylim(0, max(500, values.max() * 1.1))
    plt.title('4-最火游戏类型top10')
    ax.grid(True)
    plt.show()


def picture05():
    """5 - Horizontal bar chart: the 20 games with the highest popularity."""
    top = csv_data.sort_values(by='popularity', ascending=False).head(20)
    # Reversed so that the most popular game ends up at the top of the chart
    names = top["game_name"].values[::-1]
    popularity = top["popularity"].values[::-1]
    _barh_with_labels(names, popularity, "5-最火游戏top20")


def picture06():
    """6 - Horizontal bar chart: the 20 developers with the most games."""
    top = _top_developers(20)
    # Reversed so that the largest developer ends up at the top of the chart
    developers = top["developer"].values[::-1]
    numbers = top["size"].values[::-1]
    _barh_with_labels(developers, numbers, "6-公司游戏版权数量TOP20")


if __name__ == '__main__':
    # Only the first chart is drawn by default; call picture02() ...
    # picture06() here to render the other charts.
    picture01()