Chp7-4
2019 年 12 月 23 日
# In [32]:
import random

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats

# Seed the generator so the synthetic data (and the saved figure) are the
# same on every Restart & Run All instead of changing with each execution.
random.seed(0)

N_POINTS = 40   # number of samples in each coordinate list
SPREAD = 0.25   # standard deviation of every Gaussian draw

# x1/x2 are drawn around -1 and y1/y2 around +1.  Paired up below they form
# four blobs centred near (-1, 1), (-1, -1), (1, -1) and (1, 1).
# Note: the lists are reused across blobs (e.g. x1 is the x-coordinate of two
# of them), so the blobs are not statistically independent of each other.
x1 = [random.gauss(-1, SPREAD) for _ in range(N_POINTS)]
x2 = [random.gauss(-1, SPREAD) for _ in range(N_POINTS)]
y1 = [random.gauss(1, SPREAD) for _ in range(N_POINTS)]
y2 = [random.gauss(1, SPREAD) for _ in range(N_POINTS)]

# Draw every blob with the same green circle marker on one set of axes.
fig, ax = plt.subplots(figsize=(8, 6))
for xs, ys in [(x1, y1), (x1, x2), (y1, x1), (y1, y2)]:
    ax.plot(xs, ys, 'og')
ax.set_xlim([-2.5, 2.5])
ax.set_ylim([-2.5, 2.5])
fig.savefig('cluster')
# In [41]:
# Redraw the four synthetic blobs, then mark the mean point of each blob
# (mean of its x list, mean of its y list) with a large red star.
cluster_pairs = [(x1, y1), (x1, x2), (y1, x1), (y1, y2)]

fig, ax = plt.subplots(figsize=(8, 6))

# Sample points first, so the stars are drawn on top of them.
for xs, ys in cluster_pairs:
    ax.plot(xs, ys, 'og')

ax.set_xlim([-2.5, 2.5])
ax.set_ylim([-2.5, 2.5])

for xs, ys in cluster_pairs:
    ax.plot(np.mean(xs), np.mean(ys), '*r', markersize=15)

fig.savefig('cluster2')
# In [51]:
# Load the TMDB 5000 movies table and summarise its numeric columns.
# A raw string keeps the Windows backslashes literal; the resulting path is
# exactly the same as before, but without invalid escape sequences such as
# "\P" and "\S" that newer Python versions warn about.
my_data = pd.read_csv(r'C:\Python\Scripts\my_data\tmdb_5000_movies.csv')
my_data.describe()

# Out[51]:
#              budget             id   popularity       revenue      runtime  \
# count  4.803000e+03    4803.000000  4803.000000  4.803000e+03  4801.000000
# mean   2.904504e+07   57165.484281    21.492301  8.226064e+07   106.875859
# std    4.072239e+07   88694.614033    31.816650  1.628571e+08    22.611935
# min    0.000000e+00       5.000000     0.000000  0.000000e+00     0.000000
# 25%    7.900000e+05    9014.500000     4.668070  0.000000e+00    94.000000
# 50%    1.500000e+07   14629.000000    12.921594  1.917000e+07   103.000000
# 75%    4.000000e+07   58610.500000    28.313505  9.291719e+07   118.000000
# max    3.800000e+08  459488.000000   875.581305  2.787965e+09   338.000000
#
#        vote_average    vote_count
# count   4803.000000   4803.000000
# mean       6.092172    690.217989
# std        1.194612   1234.585891
# min        0.000000      0.000000
# 25%        5.600000     54.000000
# 50%        6.200000    235.000000
# 75%        6.800000    737.000000
# max       10.000000  13752.000000

# In [57]:
from sklearn.cluster import KMeans

# Cluster the movies into three groups on budget, popularity and revenue.
# NOTE(review): the features are used on their raw scales; given the ranges
# in the summary above, budget/revenue presumably dominate the distance
# computation over popularity -- consider standardising first.
features = my_data[['budget', 'popularity', 'revenue']]

# n_init is pinned to 10 (the historical default) so that results do not
# shift with the library's newer default of n_init='auto'.
km = KMeans(n_clusters=3, n_init=10, random_state=1)
km.fit(features)

# Attach each row's cluster label as a new 'cluster' column.
my_cl = pd.DataFrame(data=km.labels_, columns=['cluster'])
X = pd.concat([features, my_cl], axis=1)
X.head(5)

# Out[57]:
#       budget  popularity     revenue  cluster
# 0  237000000  150.437577  2787965087        2
# 1  300000000  139.082615   961000000        2
# 2  245000000  107.376788   880674609        2
# 3  250000000  112.312950  1084939099        2
# 4  260000000   43.926995   284139100        0

# In [58]:
# Per-cluster mean of every feature.
X.groupby('cluster').mean()

# Out[58]:
#                budget  popularity       revenue
# cluster
# 0        7.318659e+07   45.302377  2.566544e+08
# 1        1.721542e+07   14.292629  2.707764e+07
# 2        1.496765e+08  110.824122  8.091626e+08

# In [126]:
# Importing Axes3D registers the '3d' projection on matplotlib < 3.2 and is
# harmless on newer versions; keeping it in this cell removes the reliance
# on an import made elsewhere in the kernel session.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

x = X['budget']
y = X['popularity']
z = X['revenue']

# Dictionary mapping each of the three cluster labels to a scatter marker colour.
palette = {0: "red", 1: "green", 2: "blue"}
# Give every sample the plot colour that corresponds to its cluster label.
colors = X['cluster'].map(palette).tolist()

fig = plt.figure(figsize=(12, 10))
# fig.gca(projection='3d') no longer accepts keyword arguments in recent
# matplotlib releases; add_subplot creates the same single 3-D axes.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, color=colors)
# Restrict the displayed budget and revenue ranges.
ax.set_xlim(0, 2e8)
ax.set_zlim(0, 1e9)
ax.set_xlabel('budget', size=15)
ax.set_ylabel('popularity', size=15)
ax.set_zlabel('revenue', size=15)
fig.savefig('cluster3')