数据整理
# Notebook setup: library imports plus matplotlib configuration.
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
# IPython magic: render figures inline (only valid in a notebook/IPython session).
%matplotlib inline
plt.rcParams['font.sans-serif']=['Microsoft YaHei'] # font used so that Chinese labels render correctly
plt.rcParams['axes.unicode_minus']=False # make the minus sign render correctly
from datetime import datetime
plt.figure(figsize=(16,10))
import pyecharts.options as opts
from pyecharts.charts import Line
from pyecharts.faker import Faker
from pyecharts.charts import Bar
import os
from pyecharts.options.global_options import ThemeType
# Load the raw CSV file into a DataFrame and preview its first rows.
ssqdata=pd.read_csv("getssq.csv")
ssqdata.head()
1、修改表头
更改表头并去除无用数据:
# Column names applied to the six columns that are kept.
ssqnames=['date','id','numbers','total','first','second']
# Round-trip through an Excel file: to_excel() writes the DataFrame index as
# the first sheet column by default, so reading back only columns B:G skips it
# and keeps the next six columns, which are renamed via `names`.
# NOTE(review): presumably any columns after G are the unused data being
# dropped -- confirm against the layout of getssq.csv.
ssqdata.to_excel('getssqv2.xlsx')
ssqdatav2=pd.read_excel("getssqv2.xlsx",usecols='B:G',names=ssqnames)
ssqdatav2.head()
更改日期类型:
# Parse the 'date' column with pd.to_datetime (replacing the column in place),
# then show the resulting column dtypes.
ssqdatav2['date']=pd.to_datetime(ssqdatav2['date'])
ssqdatav2.dtypes
2、增加时间列
# Add helper time columns derived from the 'date' column.
# The .dt accessor only works when 'date' already has a datetime64[ns] dtype.


def _month_to_season(month):
    """Return the label this notebook assigns to a month number.

    Months up to 3 map to 'spring', up to 6 to 'summer', up to 9 to
    'autumn', and anything else to 'winter'.
    """
    if month <= 3:
        return 'spring'
    if month <= 6:
        return 'summer'
    if month <= 9:
        return 'autumn'
    return 'winter'


_date_col = ssqdatav2['date']
ssqdatav2['ssqyear'] = _date_col.dt.year
ssqdatav2['ssqmonth'] = _date_col.dt.month  # month number
# The quarter is an integer, so it has to be converted to str before it can
# be concatenated with the 'Q' prefix.
ssqdatav2['ssqquarter'] = 'Q' + _date_col.dt.quarter.astype(str)
# Vectorized formatting instead of a per-row strftime call.
ssqdatav2['ssqym'] = _date_col.dt.strftime('%Y%m')
ssqdatav2['ssqyq'] = _date_col.dt.to_period('Q')
ssqdatav2['ssqseason'] = ssqdatav2['ssqmonth'].apply(_month_to_season)
ssqdatav2.head()
3、分割获奖号码
# Split the 'numbers' string into one column per number.
import re

# Target columns n01..n07 for the parts produced by the split.
numnames = [f'n{i:02d}' for i in range(1, 8)]
# Column names for the first-prize breakdown (defined here, not used in this cell).
fpnames = ['fpcounts'] + [f'p{i:02d}' for i in range(1, 5)]

# NOTE(review): the original comment says the separator is TWO spaces, while
# the literal below is a single space -- the whitespace may have been collapsed
# when this notebook was exported; confirm against the data.
_number_parts = ssqdatav2['numbers'].str.split(' ', expand=True)
# NOTE(review): replace() is called without arguments; what it does depends on
# the installed pandas version -- confirm the intent.
ssqdatav2[numnames] = _number_parts.replace()
ssqdatav2.head(2)
4、使用正则表达式获取汉字
# Helper used to break down the 'first' (first prize) column.
# Matches a single character in the range U+4E00..U+9FA5.
_HAN_CHAR = re.compile("[\u4e00-\u9fa5]")


def fpp(x):
    """Return the Chinese characters found in ``x``, or "待定" for short input.

    A value of length two or less yields the placeholder "待定"
    (presumably such values carry no text to extract -- verify against the
    data). For longer values every character in U+4E00..U+9FA5 is kept in
    order and everything else, digits included, is dropped.
    """
    if len(x) > 2:
        return ''.join(_HAN_CHAR.findall(x))
    return "待定"


# Apply fpp() to every value of the 'first' column.
ssqdatav2['fpprovince'] = ssqdatav2['first'].apply(fpp)
ssqdatav2.head(310)
# Show the current column dtypes.
ssqdatav2.dtypes


# 'total' holds the sales amount as text; convert it into a numeric column.
def t2f(x):
    """Convert a text amount to a float by joining all of its digit runs.

    Every non-digit character is discarded and the remaining digits are
    concatenated before conversion.

    NOTE(review): a decimal point is discarded as well, so "12.5" becomes
    125.0 -- confirm that the amounts are always whole numbers.

    Raises:
        ValueError: if ``x`` contains no digits at all.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    return float(''.join(re.findall(r'\d+', x)))


ssqdatav2['total2'] = ssqdatav2['total'].apply(t2f)
5、绘制透视图
# Pivot-style overview: one line plot of per-value row counts for each of the
# columns n01..n07, laid out on a 2x4 grid of axes.
fig, axes = plt.subplots(2, 4, figsize=(10, 9))
ssqdatav2['ncount'] = 1  # constant helper column that is counted per group

# axes.flat walks the grid row by row; zip() stops after the seven columns,
# so the eighth panel stays empty.
_number_columns = ['n01', 'n02', 'n03', 'n04', 'n05', 'n06', 'n07']
for _column, _axis in zip(_number_columns, axes.flat):
    ssqdatav2.groupby([_column])['ncount'].count().plot(ax=_axis)
显示每个号码当中购买频率分布: