# 独家 | 教你实现数据集多维可视化（附代码）

“一图胜千言。”

“一张图最大的价值在于它使我们注意到我们没有预料到的东西。”

— John Tukey

import pandas as pd

import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

import matplotlib as mpl

import numpy as np

import seaborn as sns

%matplotlib inline

# Load both datasets. The red-wine file name is assumed to mirror the
# white-wine one (NOTE(review): confirm the actual path); without this load
# every later reference to `red_wine` raises NameError.
red_wine = pd.read_csv('winequality-red.csv', sep=';')
white_wine = pd.read_csv('winequality-white.csv', sep=';')

# Ordered quality classes used for the categorical `quality_label` column.
QUALITY_LEVELS = ['low', 'medium', 'high']


def _quality_label(score):
    """Bucket a numeric quality score: <=5 'low', <=7 'medium', else 'high'."""
    if score <= 5:
        return 'low'
    if score <= 7:
        return 'medium'
    return 'high'


for _frame, _wine_type in ((red_wine, 'red'), (white_wine, 'white')):
    # store wine type as an attribute
    _frame['wine_type'] = _wine_type
    # bucket wine quality scores into qualitative quality labels
    _frame['quality_label'] = pd.Categorical(
        _frame['quality'].apply(_quality_label),
        categories=QUALITY_LEVELS,
    )

# merge red and white wine datasets
wines = pd.concat([red_wine, white_wine])

# re-shuffle records just to randomize data points; the fixed seed keeps the
# shuffled order reproducible
wines = wines.sample(frac=1, random_state=42).reset_index(drop=True)

The wine quality dataset

# Descriptive statistics (rounded to two decimals) for a subset of
# attributes, placed side by side for red and white wines.
subset_attributes = [
    'residual sugar',
    'total sulfur dioxide',
    'sulphates',
    'alcohol',
    'volatile acidity',
    'quality',
]
rs = red_wine[subset_attributes].describe().round(2)
ws = white_wine[subset_attributes].describe().round(2)

pd.concat(
    [rs, ws],
    axis=1,
    keys=['Red Wine Statistics', 'White Wine Statistics'],
)

# Draw one histogram per numeric column of the combined dataset.
_hist_style = dict(
    bins=15,
    color='steelblue',
    edgecolor='black',
    linewidth=1.0,
    xlabelsize=8,
    ylabelsize=8,
    grid=False,
)
wines.hist(**_hist_style)

# The layout rectangle deliberately extends past the figure (upper bounds 1.2).
plt.tight_layout(rect=(0, 0, 1.2, 1.2))

# Histogram of a single continuous attribute (sulphates).
fig = plt.figure(figsize=(6, 4))
title = fig.suptitle("Sulphates Content in Wine", fontsize=14)

# The axes has to be created before it can be labelled or drawn on.
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel("Sulphates")
ax.set_ylabel("Frequency")

# Annotate the plot with the mean sulphates value.
ax.text(1.2, 800, r'$\mu$=' + str(round(wines['sulphates'].mean(), 2)),
        fontsize=12)

freq, bins, patches = ax.hist(wines['sulphates'], color='steelblue', bins=15,
                              edgecolor='black', linewidth=1)

# Density Plot of a single continuous attribute (sulphates).
fig = plt.figure(figsize=(6, 4))
title = fig.suptitle("Sulphates Content in Wine", fontsize=14)

# The axes has to be created before it can be labelled or drawn on.
ax1 = fig.add_subplot(1, 1, 1)
ax1.set_xlabel("Sulphates")
# A kernel density estimate plots density, not counts; this label also
# matches the faceted density plots further down.
ax1.set_ylabel("Density")

# `fill=True` is the current spelling of the deprecated `shade=True`.
sns.kdeplot(wines['sulphates'], ax=ax1, fill=True, color='steelblue')

# Correlation Matrix Heatmap
f, ax = plt.subplots(figsize=(10, 6))

# Restrict to numeric columns explicitly: `wines` also carries the string
# `wine_type` and categorical `quality_label` columns, which recent pandas
# versions refuse to correlate instead of silently dropping.
corr = wines.select_dtypes(include='number').corr()

hm = sns.heatmap(corr.round(2), annot=True, ax=ax, cmap="coolwarm", fmt='.2f',
                 linewidths=.05)

t = f.suptitle('Wine Attributes Correlation Heatmap', fontsize=14)

# Scatter Plot: sulphates on the x axis against alcohol on the y axis.
_scatter_x = wines['sulphates']
_scatter_y = wines['alcohol']
plt.scatter(_scatter_x, _scatter_y, alpha=0.4, edgecolors='w')

plt.xlabel('Sulphates')
plt.ylabel('Alcohol')
plt.title('Wine Sulphates - Alcohol Content', y=1.05)

# Joint Plot: scatter with a regression fit plus marginal distributions.
# `height` replaces the removed `size` argument of seaborn's figure-level
# functions.
jp = sns.jointplot(x='sulphates', y='alcohol', data=wines,
                   kind='reg', space=0, height=5, ratio=4)

# Using subplots or facets along with Bar Plots
fig = plt.figure(figsize=(10, 4))
title = fig.suptitle("Wine Type - Quality", fontsize=14)

# red wine - wine quality (left panel); the axes must be created first
ax1 = fig.add_subplot(1, 2, 1)
ax1.set_title("Red Wine")
ax1.set_xlabel("Quality")
ax1.set_ylabel("Frequency")
rw_q = red_wine['quality'].value_counts()
rw_q = (list(rw_q.index), list(rw_q.values))
# Both panels share the same y range so the bar heights are comparable.
ax1.set_ylim([0, 2500])
ax1.tick_params(axis='both', which='major', labelsize=8.5)
bar1 = ax1.bar(rw_q[0], rw_q[1], color='red',
               edgecolor='black', linewidth=1)

# white wine - wine quality (right panel)
ax2 = fig.add_subplot(1, 2, 2)
ax2.set_title("White Wine")
ax2.set_xlabel("Quality")
ax2.set_ylabel("Frequency")
ww_q = white_wine['quality'].value_counts()
ww_q = (list(ww_q.index), list(ww_q.values))
ax2.set_ylim([0, 2500])
ax2.tick_params(axis='both', which='major', labelsize=8.5)
bar2 = ax2.bar(ww_q[0], ww_q[1], color='white',
               edgecolor='black', linewidth=1)

# Multi-bar Plot: quality counts, one bar per wine type within each score.
_type_palette = {"red": "#FF9999", "white": "#FFE888"}
cp = sns.countplot(
    x="quality",
    hue="wine_type",
    data=wines,
    palette=_type_palette,
)

# facets with histograms: one panel per wine type
fig = plt.figure(figsize=(10, 4))
title = fig.suptitle("Sulphates Content in Wine", fontsize=14)

# Left panel: red wine. The axes must be created before it is used.
ax1 = fig.add_subplot(1, 2, 1)
ax1.set_title("Red Wine")
ax1.set_xlabel("Sulphates")
ax1.set_ylabel("Frequency")
# Both panels share the same y range so the counts are comparable.
ax1.set_ylim([0, 1200])
ax1.text(1.2, 800, r'$\mu$=' + str(round(red_wine['sulphates'].mean(), 2)),
         fontsize=12)
r_freq, r_bins, r_patches = ax1.hist(red_wine['sulphates'], color='red',
                                     bins=15, edgecolor='black', linewidth=1)

# Right panel: white wine.
ax2 = fig.add_subplot(1, 2, 2)
ax2.set_title("White Wine")
ax2.set_xlabel("Sulphates")
ax2.set_ylabel("Frequency")
ax2.set_ylim([0, 1200])
ax2.text(0.8, 800, r'$\mu$=' + str(round(white_wine['sulphates'].mean(), 2)),
         fontsize=12)
w_freq, w_bins, w_patches = ax2.hist(white_wine['sulphates'], color='white',
                                     bins=15, edgecolor='black', linewidth=1)

# facets with density plots: one panel per wine type
fig = plt.figure(figsize=(10, 4))
title = fig.suptitle("Sulphates Content in Wine", fontsize=14)

# Left panel: red wine. The axes must be created before it is used.
ax1 = fig.add_subplot(1, 2, 1)
ax1.set_title("Red Wine")
ax1.set_xlabel("Sulphates")
ax1.set_ylabel("Density")
# The red-wine curve was missing, leaving this panel empty. It mirrors the
# white-wine call below; colour 'r' is assumed from the red/yellow scheme
# used for the two wine types elsewhere in this script.
sns.kdeplot(red_wine['sulphates'], ax=ax1, fill=True, color='r')

# Right panel: white wine.
ax2 = fig.add_subplot(1, 2, 2)
ax2.set_title("White Wine")
ax2.set_xlabel("Sulphates")
ax2.set_ylabel("Density")
# `fill=True` is the current spelling of the deprecated `shade=True`.
sns.kdeplot(white_wine['sulphates'], ax=ax2, fill=True, color='y')

# Using multiple Histograms: both wine types overlaid on a single axes
fig = plt.figure(figsize=(6, 4))
title = fig.suptitle("Sulphates Content in Wine", fontsize=14)

# The axes must be created before it is used.
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel("Sulphates")
ax.set_ylabel("Frequency")

# Draw each wine type directly with matplotlib. This avoids the deprecated
# `sns.distplot` and the throw-away FacetGrid figure that previously had to
# be closed by a hard-coded figure number.
for _wine_type, _color in (("red", "r"), ("white", "y")):
    _values = wines.loc[wines['wine_type'] == _wine_type, 'sulphates']
    # Semi-transparent bars keep the overlapping distribution visible.
    ax.hist(_values, bins=15, alpha=0.4, color=_color, label=_wine_type)

ax.legend(title='Wine Type')

# Box Plots: alcohol distribution for every quality score.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
f.suptitle('Wine Quality - Alcohol Content', fontsize=14)

sns.boxplot(data=wines, x="quality", y="alcohol", ax=ax)

_label_style = dict(size=12, alpha=0.8)
ax.set_xlabel("Wine Quality", **_label_style)
ax.set_ylabel("Wine Alcohol %", **_label_style)

# Violin Plots: sulphates distribution for every quality score.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
f.suptitle('Wine Quality - Sulphates Content', fontsize=14)

sns.violinplot(data=wines, x="quality", y="sulphates", ax=ax)

_label_style = dict(size=12, alpha=0.8)
ax.set_xlabel("Wine Quality", **_label_style)
ax.set_ylabel("Wine Sulphates", **_label_style)

# Scatter Plot with Hue for visualizing data in 3-D
cols = ['density', 'residual sugar', 'total sulfur dioxide', 'fixed acidity',
        'wine_type']

# `height` replaces the removed `size` argument of `pairplot`.
pp = sns.pairplot(wines[cols], hue='wine_type', height=1.8, aspect=1.8,
                  palette={"red": "#FF9999", "white": "#FFE888"},
                  plot_kws=dict(edgecolor="black", linewidth=0.5))

fig = pp.fig
t = fig.suptitle('Wine Attributes Pairwise Plots', fontsize=14)

# Visualizing 3-D numeric data with Scatter Plots
fig = plt.figure(figsize=(8, 6))
# A 3-D axes is required for the three-argument scatter and `set_zlabel`.
ax = fig.add_subplot(111, projection='3d')

xs = wines['residual sugar']
ys = wines['fixed acidity']
zs = wines['alcohol']
ax.scatter(xs, ys, zs, s=50, alpha=0.6, edgecolors='w')

ax.set_xlabel('Residual Sugar')
ax.set_ylabel('Fixed Acidity')
ax.set_zlabel('Alcohol')

# Visualizing 3-D numeric data with a bubble chart:
# x/y positions carry two attributes, the marker area carries the third.
_bubble_area = wines['residual sugar'] * 25
plt.scatter(
    wines['fixed acidity'],
    wines['alcohol'],
    s=_bubble_area,
    alpha=0.4,
    edgecolors='w',
)

plt.xlabel('Fixed Acidity')
plt.ylabel('Alcohol')
plt.title('Wine Alcohol Content - Fixed Acidity - Residual Sugar', y=1.05)

# Visualizing 3-D categorical data using bar plots
# leveraging the concepts of hue and facets
# `catplot` is the current name of the removed `factorplot`.
fc = sns.catplot(x="quality", hue="wine_type", col="quality_label",
                 data=wines, kind="count",
                 palette={"red": "#FF9999", "white": "#FFE888"})

# Visualizing 3-D mix data using scatter plots
# leveraging the concepts of hue for categorical dimension
# `height` replaces the removed `size` argument of `pairplot`.
jp = sns.pairplot(wines, x_vars=["sulphates"], y_vars=["alcohol"], height=4.5,
                  hue="wine_type",
                  palette={"red": "#FF9999", "white": "#FFE888"},
                  plot_kws=dict(edgecolor="k", linewidth=0.5))

# Relationships/correlations can also be viewed per wine type by fitting a
# regression line for each hue level.
_type_palette = {"red": "#FF9999", "white": "#FFE888"}
_marker_outline = dict(edgecolor="k", linewidth=0.5)
lp = sns.lmplot(
    data=wines,
    x='sulphates',
    y='alcohol',
    hue='wine_type',
    palette=_type_palette,
    fit_reg=True,
    legend=True,
    scatter_kws=_marker_outline,
)

# Visualizing 3-D mix data using kernel density plots
# leveraging the concepts of hue for categorical dimension
# The white-wine call was cut off mid-statement (a syntax error). Its
# colormap is not shown in the source; a yellow map is assumed here to match
# the yellow hue used for white wine elsewhere (NOTE(review): confirm).
# `x=`/`y=` and `fill=` are the current spellings of the positional pair and
# `shade=`; `thresh=0.05` leaves the lowest contour level unfilled, which is
# what `shade_lowest=False` requested.
ax = sns.kdeplot(x=white_wine['sulphates'], y=white_wine['alcohol'],
                 cmap="YlOrBr", fill=True, thresh=0.05)
ax = sns.kdeplot(x=red_wine['sulphates'], y=red_wine['alcohol'],
                 cmap="Reds", fill=True, thresh=0.05)

# Visualizing 3-D mix data using violin plots
# leveraging the concepts of hue and axes for > 1 categorical dimensions
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))
f.suptitle('Wine Type - Quality - Acidity', fontsize=14)

# Arguments shared by both violin plots.
_violin_common = dict(x="quality", y="volatile acidity", data=wines,
                      inner="quart", linewidth=1.3)
_label_style = dict(size=12, alpha=0.8)

# Left: all wines together.
sns.violinplot(ax=ax1, **_violin_common)
ax1.set_xlabel("Wine Quality", **_label_style)
ax1.set_ylabel("Wine Volatile Acidity", **_label_style)

# Right: each violin split by wine type.
sns.violinplot(hue="wine_type", split=True,
               palette={"red": "#FF9999", "white": "white"},
               ax=ax2, **_violin_common)
ax2.set_xlabel("Wine Quality", **_label_style)
ax2.set_ylabel("Wine Volatile Acidity", **_label_style)

l = plt.legend(loc='upper right', title='Wine Type')

# Visualizing 3-D mix data using box plots
# leveraging the concepts of hue and axes for > 1 categorical dimensions
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))
f.suptitle('Wine Type - Quality - Alcohol Content', fontsize=14)

# Arguments shared by both box plots.
_box_common = dict(y="alcohol", hue="wine_type", data=wines,
                   palette={"red": "#FF9999", "white": "white"})
_label_style = dict(size=12, alpha=0.8)

# Left: grouped by numeric quality score.
sns.boxplot(x="quality", ax=ax1, **_box_common)
ax1.set_xlabel("Wine Quality", **_label_style)
ax1.set_ylabel("Wine Alcohol %", **_label_style)

# Right: grouped by the bucketed quality class.
sns.boxplot(x="quality_label", ax=ax2, **_box_common)
ax2.set_xlabel("Wine Quality Class", **_label_style)
ax2.set_ylabel("Wine Alcohol %", **_label_style)

l = plt.legend(loc='best', title='Wine Type')


# Visualizing 4-D mix data using scatter plots
# leveraging the concepts of hue and depth
fig = plt.figure(figsize=(8, 6))
t = fig.suptitle('Wine Residual Sugar - Alcohol Content - Acidity - Type',
                 fontsize=14)
# A 3-D axes is required for the three-argument scatter and `set_zlabel`.
ax = fig.add_subplot(111, projection='3d')

xs = list(wines['residual sugar'])
ys = list(wines['alcohol'])
zs = list(wines['fixed acidity'])
colors = ['red' if wt == 'red' else 'yellow' for wt in wines['wine_type']]

# One vectorized call with a per-point colour list replaces the per-point
# loop, whose body had also lost its indentation.
ax.scatter(xs, ys, zs, alpha=0.4, c=colors, edgecolors='none', s=30)

ax.set_xlabel('Residual Sugar')
ax.set_ylabel('Alcohol')
ax.set_zlabel('Fixed Acidity')

# Visualizing 4-D mix data using bubble plots
# leveraging the concepts of hue and size
size = wines['residual sugar'] * 25
fill_colors = ['#FF9999' if wt == 'red' else '#FFE888'
               for wt in wines['wine_type']]
edge_colors = ['red' if wt == 'red' else 'orange'
               for wt in wines['wine_type']]

plt.scatter(wines['fixed acidity'], wines['alcohol'], s=size,
            alpha=0.4, color=fill_colors, edgecolors=edge_colors)

plt.xlabel('Fixed Acidity')
plt.ylabel('Alcohol')
# The call was left unclosed (a syntax error); `y=1.05` follows the other
# `plt.title` calls in this script.
plt.title('Wine Alcohol Content - Fixed Acidity - Residual Sugar - Type',
          y=1.05)

# Visualizing 5-D mix data using bubble charts
# leveraging the concepts of hue, size and depth
fig = plt.figure(figsize=(8, 6))
t = fig.suptitle('Wine Residual Sugar - Alcohol Content - Acidity - Total Sulfur Dioxide - Type', fontsize=14)
# A 3-D axes is required for the three-argument scatter and `set_zlabel`.
ax = fig.add_subplot(111, projection='3d')

xs = list(wines['residual sugar'])
ys = list(wines['alcohol'])
zs = list(wines['fixed acidity'])
# Marker area encodes total sulfur dioxide.
ss = list(wines['total sulfur dioxide'])
colors = ['red' if wt == 'red' else 'yellow' for wt in wines['wine_type']]

# One vectorized call with per-point colours and sizes replaces the
# per-point loop, whose body had also lost its indentation.
ax.scatter(xs, ys, zs, alpha=0.4, c=colors, edgecolors='none', s=ss)

ax.set_xlabel('Residual Sugar')
ax.set_ylabel('Alcohol')
ax.set_zlabel('Fixed Acidity')

# Visualizing 6-D mix data using scatter charts
# leveraging the concepts of hue, size, depth and shape
fig = plt.figure(figsize=(8, 6))
t = fig.suptitle('Wine Residual Sugar - Alcohol Content - Acidity - Total Sulfur Dioxide - Type - Quality', fontsize=14)
# A 3-D axes is required for the three-argument scatter and `set_zlabel`.
ax = fig.add_subplot(111, projection='3d')

# Marker shape encodes the quality class: ',' for high, 'x' for medium and
# 'o' for everything else.
_quality = wines['quality_label']
_marker_groups = (
    (',', _quality == 'high'),
    ('x', _quality == 'medium'),
    ('o', ~_quality.isin(['high', 'medium'])),
)

# A scatter call takes a single marker, so draw one batch per marker
# instead of one call per data point. Batching changes only the drawing
# order of the points, not their position, colour, size or shape.
for mark, _mask in _marker_groups:
    _group = wines[_mask]
    if _group.empty:
        continue
    colors = ['red' if wt == 'red' else 'yellow' for wt in _group['wine_type']]
    ax.scatter(_group['residual sugar'], _group['alcohol'],
               _group['fixed acidity'], alpha=0.4, c=colors,
               edgecolors='none', s=_group['total sulfur dioxide'],
               marker=mark)

ax.set_xlabel('Residual Sugar')
ax.set_ylabel('Alcohol')
ax.set_zlabel('Fixed Acidity')

1.考虑形状和Y轴，与低品质的葡萄酒相比，中高品质的葡萄酒酒精含量更高。

2.考虑色调和大小，白葡萄酒的总二氧化硫含量比红葡萄酒要高。

3.考虑深度和色调，与红葡萄酒相比，白葡萄酒具有较低的固定酸度。

4.考虑色调和X轴，红葡萄酒与白葡萄酒相比残糖量较低。

5.考虑色调和形状，白葡萄酒与红葡萄酒相比似乎具有更高的品质（可能是由于白葡萄酒的样本量较大）。

# Visualizing 6-D mix data using scatter charts
# leveraging the concepts of hue, facets and size


def _sized_scatter(x, y, sulfur, **kwargs):
    """Scatter x against y with marker area equal to twice ``sulfur``."""
    plt.scatter(x, y, s=sulfur * 2, **kwargs)


# `height` replaces the removed `size` argument of `FacetGrid`.
g = sns.FacetGrid(wines, row='wine_type', col="quality", hue='quality_label',
                  height=4)

# Name the size column as a third mapped variable so each facet receives
# only its own rows; a fixed `s=` built from the whole frame does not line
# up with the subset of points drawn in a facet.
g.map(_sized_scatter, "residual sugar", "alcohol", "total sulfur dioxide",
      alpha=0.5, edgecolor='k', linewidth=0.5)

fig = g.fig
fig.set_size_inches(18, 8)
fig.suptitle('Wine Type - Sulfur Dioxide - Residual Sugar - Alcohol - Quality Class - Quality Rating', fontsize=14)

l = g.add_legend(title='Wine Quality Class')

