import pandas as pd
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("./data/HR.csv")
#获得以department分组后的索引值的数组dict
dp_indices = df.groupby("department").indices
sales_values = df["left"].iloc[dp_indices["sales"]].values
technical_values = df["left"].iloc[dp_indices["technical"]].values
print(ss.ttest_ind(sales_values,technical_values)[1])
dp_keys = list(dp_indices.keys())
dp_t_mat = np.zeros([len(dp_keys),len(dp_keys)])
for i in range(len(dp_keys)):
for j in range(len(dp_keys)):
p_value = ss.ttest_ind(df["left"].iloc[dp_indices[dp_keys[i]]].values,
df["left"].iloc[dp_indices[dp_keys[j]]].values)[1]
dp_t_mat[i][j] = p_value
sns.heatmap(dp_t_mat,xticklabels=dp_keys,yticklabels=dp_keys)
plt.show()
颜色越深的地方t值越接近于0,也就代表颜色越深的地方,二者的离职率是有显著差异的,而颜色淡的地方代表离职率没有显著差异
piv_tb = pd.pivot_table(df,values="left",index=["promotion_last_5years","salary"],
columns=["Work_accident"],aggfunc=np.mean)
print(piv_tb)
sns.heatmap(piv_tb,vmin=0,vmax=1)
plt.show()