1. 用到的技术

爬虫 ——> xpath

数据可视化 ——> matplotlib

2. 爬虫

1. 正常爬取

# -*- coding:UTF-8 -*-
import requests
from lxml import etree
import urllib
"""
正常爬取
爬取17173游戏排行榜前1500名
目的网站：
http://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-1.html
"""
def parse_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. The
            original call passed no timeout, so a stalled connection would
            block the whole crawl indefinitely.

    Returns:
        The decoded response body (``resp.text``).
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    return resp.text
"""
解析主页面 
获取主页面需要信息（排名，子页面链接，游戏名称，票数）
"""
def fetch(home_url):
    """Parse one ranking page and crawl the detail page of every game on it.

    For each list item the rank, detail-page link, game name and vote count
    are extracted and handed to ``fetch_child``.

    Args:
        home_url: URL of a ranking list page.
    """
    home_page = etree.HTML(parse_html(home_url))
    lis = home_page.xpath('//div[@class="main-c1"]/div/div[2]/div/div[2]/ul/li')
    for li in lis:
        # Skip the advertisement slot. xpath() returns a list of attribute
        # strings, so the original comparison `== 4` could never be true.
        if "4" in li.xpath('./@data-index'):
            continue
        # Fields taken from the list page: rank, detail link, name, votes.
        rank = li.xpath('./div/div[1]/em/text()')[0]
        # The href is prefixed with the scheme here before being requested.
        child_url = "https:" + li.xpath('./div/div[2]/div/a/@href')[0]
        name = li.xpath('./div/div[2]/div/a/text()')[0]
        votes = li.xpath('./div/div[3]/text()')[0].strip()
        fetch_child(child_url, rank, name, votes)
"""
解析子页面 
获取子页面需要信息（标签，类型，语言，开发商，注册，运营商）
"""
def fetch_child(child_url, rank, name, votes):
    """Parse a game's detail page and append the combined record to the CSV.

    Extracts tags, type, language, developer, registration info and operator
    from the detail page. Any field that cannot be found keeps the
    placeholder "暂无" (the language field is the joined text and may be
    empty). The record is then passed to ``writer``.

    Args:
        child_url: URL of the game's detail page.
        rank: Rank text taken from the list page.
        name: Game name taken from the list page.
        votes: Vote count text taken from the list page.
    """
    # Imported locally: the file only has a bare `import urllib`, which does
    # not by itself guarantee that the `urllib.parse` submodule is loaded.
    from urllib.parse import unquote

    placeholder = "暂无"

    def first_or_placeholder(values):
        # XPath queries return lists; an empty list means the node is absent.
        return values[0] if values else placeholder

    child_page = etree.HTML(parse_html(child_url))
    game_label = placeholder
    game_type = placeholder
    game_language = placeholder
    game_developer = placeholder
    game_registered = placeholder
    game_operator = placeholder

    # If the info list is missing, the game page presumably does not exist
    # and the request ended up on some other page; keep the placeholders.
    if child_page.xpath('//ul[@class="list-mater-info"]/li'):
        game_label = "|".join(child_page.xpath('//div[@class="box-mater-cate"]/a/text()'))
        game_type = first_or_placeholder(
            child_page.xpath('//ul[@class="list-mater-info"]/li[1]/a/text()'))
        game_language = "".join(child_page.xpath('//ul[@class="list-mater-info"]/li[2]/div/span/text()'))
        game_developer = first_or_placeholder(
            child_page.xpath('//ul[@class="list-mater-info"]/li[3]/span[2]/text()'))

        # Registration info is either plain text in a second <span> or a
        # link whose last "="-separated segment is URL-encoded text.
        if len(child_page.xpath('//ul[@class="list-mater-info"]/li[4]/span')) == 2:
            game_registered = first_or_placeholder(
                child_page.xpath('//ul[@class="list-mater-info"]/li[4]/span[2]/text()'))
        else:
            hrefs = child_page.xpath('//ul[@class="list-mater-info"]/li[4]/a/@href')
            if hrefs:
                game_registered = unquote(hrefs[0].rsplit("=", 1)[-1])

        game_operator = first_or_placeholder(
            child_page.xpath('//ul[@class="list-mater-info"]/li[5]//span[2]/text()'))

    msg = [rank, name, votes, game_label, game_type, game_language, game_developer, game_registered, game_operator]
    writer(msg)
"""
信息写入文件
"""
def writer(msg):
    """Append one game record to ``gameRank.csv`` as a CSV row.

    Args:
        msg: Sequence whose first nine items are rank, name, votes, tags,
            type, language, developer, registration info and operator.

    The row is produced with ``csv.writer`` so that a field containing a
    comma or a quote is quoted instead of silently shifting the columns, as
    happened with the original manual string concatenation.
    """
    import csv  # local import keeps this block self-contained

    with open("gameRank.csv", mode="a", encoding="GBK", newline="") as f:
        print(msg[1], "开始")
        # "\n" keeps the line ending the original code wrote.
        csv.writer(f, lineterminator="\n").writerow(msg[:9])
        print(msg[1], "over！")
if __name__ == '__main__':
    # Crawl ranking pages 1 through 70, one after another.
    for page_no in range(1, 71):
        fetch(f"https://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-{page_no}.html")

2. 异步协程爬取

from lxml import etree
import urllib
import aiohttp
import aiofiles
import asyncio
"""
使用异步协程进行爬取
爬取17173游戏排行榜前1500名
目的网站：
http://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-1.html
"""
async def parse_html(url):
    """Download *url* with aiohttp and return the response body as text.

    A new ``aiohttp.ClientSession`` is opened for every call and closed
    again once the body has been read.
    """
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
    }
    async with aiohttp.ClientSession() as client:
        # Author's note: as with requests, a proxy can be supplied to get()
        # through the `proxy` argument; its value is a string and only the
        # http scheme is supported (https raises an error). No proxy is
        # used here.
        async with client.get(url, headers=request_headers) as response:
            body = await response.text()
    return body
"""
函数目标 : 获取需要的信息
"""
async def fetch(child_url, rank, name, votes):
    """Crawl one game's detail page and append the combined record to the CSV.

    Extracts tags, type, language, developer, registration info and operator
    from the detail page. Any field that cannot be found keeps the
    placeholder "暂无" (the language field is the joined text and may be
    empty). The record is printed and appended to ``gameRank.csv``.

    Args:
        child_url: URL of the game's detail page.
        rank: Rank text taken from the list page.
        name: Game name taken from the list page.
        votes: Vote count text taken from the list page.
    """
    # Imported locally: the file only has a bare `import urllib`, which does
    # not by itself guarantee that `urllib.parse` is loaded; csv/io are used
    # to build a correctly quoted row.
    import csv
    import io
    from urllib.parse import unquote

    placeholder = "暂无"

    def first_or_placeholder(values):
        # XPath queries return lists; an empty list means the node is absent.
        return values[0] if values else placeholder

    child_page = etree.HTML(await parse_html(child_url))
    game_label = placeholder
    game_type = placeholder
    game_language = placeholder
    game_developer = placeholder
    game_registered = placeholder
    game_operator = placeholder

    # If the info list is missing, the game page presumably does not exist
    # and the request ended up on some other page; keep the placeholders.
    if child_page.xpath('//ul[@class="list-mater-info"]/li'):
        game_label = "|".join(child_page.xpath('//div[@class="box-mater-cate"]/a/text()'))
        game_type = first_or_placeholder(
            child_page.xpath('//ul[@class="list-mater-info"]/li[1]/a/text()'))
        game_language = "".join(child_page.xpath('//ul[@class="list-mater-info"]/li[2]/div/span/text()'))
        game_developer = first_or_placeholder(
            child_page.xpath('//ul[@class="list-mater-info"]/li[3]/span[2]/text()'))

        # Registration info is either plain text in a second <span> or a
        # link whose last "="-separated segment is URL-encoded text.
        if len(child_page.xpath('//ul[@class="list-mater-info"]/li[4]/span')) == 2:
            game_registered = first_or_placeholder(
                child_page.xpath('//ul[@class="list-mater-info"]/li[4]/span[2]/text()'))
        else:
            hrefs = child_page.xpath('//ul[@class="list-mater-info"]/li[4]/a/@href')
            if hrefs:
                game_registered = unquote(hrefs[0].rsplit("=", 1)[-1])

        game_operator = first_or_placeholder(
            child_page.xpath('//ul[@class="list-mater-info"]/li[5]//span[2]/text()'))

    record = [rank, name, votes, game_label, game_type, game_language,
              game_developer, game_registered, game_operator]
    # Debug output of the full record.
    print(*record)

    # Build the row with csv.writer so fields containing commas or quotes
    # are quoted instead of shifting the columns.
    row_buffer = io.StringIO()
    csv.writer(row_buffer, lineterminator="\n").writerow(record)

    async with aiofiles.open("gameRank.csv", mode="a", encoding="GBK", newline="") as f:
        print(name, "开始")
        await f.write(row_buffer.getvalue())
        print(name, "over！")
async def main():
    """Crawl one ranking page and all of its game detail pages concurrently.

    Reads the page address from the module-level ``home_url``, schedules one
    ``fetch`` task per game on the page, and waits for all of them.
    """
    tasks = []
    home_page = etree.HTML(await parse_html(home_url))
    lis = home_page.xpath('//div[@class="main-c1"]/div/div[2]/div/div[2]/ul/li')
    for li in lis:
        # Skip the advertisement slot. xpath() returns a list of attribute
        # strings, so the original comparison `== 4` could never be true.
        if "4" in li.xpath('./@data-index'):
            continue
        # Fields taken from the list page: rank, detail link, name, votes.
        rank = li.xpath('./div/div[1]/em/text()')[0]
        a = "https:" + li.xpath('./div/div[2]/div/a/@href')[0]
        name = li.xpath('./div/div[2]/div/a/text()')[0]
        votes = li.xpath('./div/div[3]/text()')[0].strip()
        tasks.append(asyncio.ensure_future(fetch(a, rank, name, votes)))
    # asyncio.wait() raises ValueError when given an empty collection, which
    # happens when a page yields no list items.
    if tasks:
        await asyncio.wait(tasks)
if __name__ == '__main__':
    # Create the event loop explicitly: calling asyncio.get_event_loop()
    # without a running loop is deprecated in current Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # Ranking pages are numbered from 1 (the documented start URL ends in
        # "-1.html"); the original range(10) began at a page 0.
        for i in range(1, 11):
            # main() reads this module-level name.
            home_url = f"https://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-{i}.html"
            loop.run_until_complete(main())
    finally:
        # Close the loop even if a page fails.
        loop.close()

3. 数据可视化

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
# Apply the ggplot style sheet first. A style sheet overwrites every rcParam
# it defines (ggplot sets font.size, among others), so calling it after the
# overrides below - as the original did - discarded the font size of 12.
plt.style.use('ggplot')
# Fonts able to render Chinese text: KaiTi is preferred, then SimHei, then
# FangSong.
mpl.rcParams['font.sans-serif'] = ['KaiTi', 'SimHei', 'FangSong']
# Base font size.
mpl.rcParams['font.size'] = 12
# Render the minus sign correctly with the fonts above.
mpl.rcParams['axes.unicode_minus'] = False
# Load the scraped data.
# NOTE(review): the plotting functions read columns named "type",
# "developer", "popularity" and "game_name", while the crawler code appends
# rows without a header line - presumably a header row was added to the file
# by hand; confirm.
csv_data = pd.read_csv("../reptile/gameRank.csv", encoding="GBK")
# 1-游戏类型分布-柱状图
def picture01():
    """Chart 1: bar chart of the number of games per game type."""
    # One row per type, with the number of games in the "size" column.
    counts_by_type = csv_data.groupby("type", as_index=False).size()
    type_names = counts_by_type["type"].values
    game_counts = counts_by_type["size"].values

    plt.bar(type_names, game_counts, facecolor='#ff9999', edgecolor='white')
    # Write each count just above its bar, centred horizontally.
    for type_name, count in zip(type_names, game_counts):
        plt.text(type_name, count, count, ha='center', va='bottom')

    # Rotate the tick labels so they read vertically.
    plt.xticks(rotation=270)
    plt.xlabel("type")
    plt.ylabel("number")
    plt.title("1-各游戏类型条形图")
    plt.show()
# 2-游戏类型分布-饼图
def picture02():
    """Chart 2: pie chart of each game type's share of all games."""
    # One row per type, with the number of games in the "size" column.
    counts_by_type = csv_data.groupby("type", as_index=False).size()
    type_names = counts_by_type["type"].values
    game_counts = counts_by_type["size"].values
    # Fraction of all games that belongs to each type.
    shares = game_counts / game_counts.sum()

    # Equal aspect ratio keeps the pie circular instead of elliptical.
    plt.axes(aspect='equal')
    plt.pie(
        x=shares,
        labels=type_names,
        autopct='%.2f%%',  # percentage labels with two decimals
        pctdistance=0.8,  # distance of the percentage labels from the centre
        labeldistance=1.05,  # distance of the type labels from the centre
        startangle=180,
        radius=1.1,
        counterclock=False,  # draw the wedges clockwise
        wedgeprops={'linewidth': 1.5, 'edgecolor': 'green'},
        textprops={'fontsize': 5, 'color': 'black'},
    )
    plt.title('2-各大游戏类型分布')
    plt.show()
# 3-游戏隶属厂商TOP20-折线图
def picture03():
    """Chart 3: line chart of the 20 developers with the most games."""
    # One row per developer, with the number of games in the "size" column.
    group_dev = csv_data.groupby("developer", as_index=False).size()
    # Exclude games without a known developer. "暂无" is the placeholder the
    # crawler writes for a missing developer.
    # NOTE(review): the original dropped the hard-coded row label 1046,
    # presumably this placeholder group in one particular dataset; that
    # raises KeyError as soon as the data changes - confirm the intent.
    group_dev = group_dev[group_dev["developer"] != "暂无"]
    group_dev = group_dev.sort_values(by='size', axis=0, ascending=False)[0:20]
    type_dev = group_dev["developer"].values
    number_dev = group_dev["size"].values
    x = type_dev
    y = number_dev
    # Rotate the tick labels so they read vertically.
    plt.xticks(rotation=270)
    plt.title("3-各游戏隶属厂商TOP20")
    plt.plot(x, y)
    plt.show()
# 4-最火游戏类型top10-雷达图
def picture04():
    """Chart 4: radar chart of the 10 game types with the most games."""
    # Count games per type and keep the ten largest groups.
    counts_by_type = csv_data.groupby("type", as_index=False).size()
    top_types = counts_by_type.sort_values(by='size', axis=0, ascending=False)[0:10]
    axis_labels = top_types["type"].values
    counts = top_types["size"].values

    # One evenly spaced angle (in radians) per data point.
    thetas = np.linspace(0, 2 * np.pi, len(counts), endpoint=False)

    # Repeat the first point at the end so the outline closes.
    counts = np.concatenate((counts, [counts[0]]))
    thetas = np.concatenate((thetas, [thetas[0]]))
    axis_labels = np.concatenate((axis_labels, [axis_labels[0]]))

    figure = plt.figure()
    # Polar axes turn the line plot into a radar chart.
    radar = figure.add_subplot(111, polar=True)
    radar.plot(thetas, counts, 'o-', linewidth=2)
    radar.fill(thetas, counts, alpha=0.25)
    # Put a label at the angle of every data point (angles in degrees).
    radar.set_thetagrids(thetas * 180 / np.pi, axis_labels)
    # Fixed radial range of the chart.
    radar.set_ylim(0, 500)
    plt.title('4-最火游戏类型top10')
    radar.grid(True)
    plt.show()
# 5-最火游戏top20-横向柱状图
def picture05():
    """Chart 5: horizontal bar chart of the 20 games with the highest popularity."""
    top_games = csv_data.sort_values(by='popularity', axis=0, ascending=False)[0:20]
    # Reverse both arrays so the largest value ends up at the top of the chart.
    game_names = top_games["game_name"].values[::-1]
    popularity = top_games["popularity"].values[::-1]

    fig, ax = plt.subplots()
    positions = range(len(game_names))
    bars = ax.barh(positions, popularity, color='#ff9999')
    # Print each value at the right end of its bar, vertically centred.
    for bar in bars:
        width = bar.get_width()
        middle = bar.get_y() + bar.get_height() / 2
        ax.text(width, middle, '%d' % int(width), ha='left', va='center')

    # Label every bar with its game name.
    ax.set_yticks(positions)
    ax.set_yticklabels(game_names)
    plt.title("5-最火游戏top20")
    plt.show()
# 6-公司游戏版权数量TOP20-横向柱状图
def picture06():
    """Chart 6: horizontal bar chart of the 20 developers with the most games."""
    # One row per developer, with the number of games in the "size" column.
    group_dev = csv_data.groupby("developer", as_index=False).size()
    # Exclude games without a known developer. "暂无" is the placeholder the
    # crawler writes for a missing developer.
    # NOTE(review): the original dropped the hard-coded row label 1046,
    # presumably this placeholder group in one particular dataset; that
    # raises KeyError as soon as the data changes - confirm the intent.
    group_dev = group_dev[group_dev["developer"] != "暂无"]
    group_dev = group_dev.sort_values(by='size', axis=0, ascending=False)[0:20]
    # Reverse both arrays so the largest value ends up at the top of the chart.
    y = group_dev["developer"].values[::-1]
    x = group_dev["size"].values[::-1]

    fig, ax = plt.subplots()
    b = ax.barh(range(len(y)), x, color='#ff9999')
    # Print each count at the right end of its bar, vertically centred.
    for rect in b:
        w = rect.get_width()
        ax.text(w, rect.get_y() + rect.get_height() / 2, '%d' % int(w), ha='left', va='center')
    # Label every bar with its developer name.
    ax.set_yticks(range(len(y)))
    ax.set_yticklabels(y)
    plt.title("6-公司游戏版权数量TOP20")
    plt.show()
if __name__ == '__main__':
    # Only one chart is drawn per run; uncomment the call for the chart you
    # want to display.
    picture01()
    # picture02()
    # picture03()
    # picture04()
    # picture05()
    # picture06()

Python爬虫学习——简单爬虫+可视化

1. 用到的技术

2. 爬虫

1. 正常爬取

2. 异步协程爬取

3. 数据可视化

热门文章

最新文章

相关课程

相关电子书

相关实验场景

推荐镜像

探索云世界

热门

云计算

大数据

云原生

人工智能

数据库

开发与运维

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

阿里云MVP

话题

直播

下载

镜像站

技术资料

插件

Python爬虫学习——简单爬虫+可视化

1. 用到的技术

2. 爬虫

1. 正常爬取

2. 异步协程爬取

3. 数据可视化

热门文章

最新文章

相关课程

相关电子书

相关实验场景

推荐镜像