- 用PCA对小麦数据的特征进行降维,降到二维
- 将用PCA降维后的数据送入ANN(人工神经网络)进行训练
- 用训练好的ANN进行分类
将txt文件转成pandas表格
from sklearn.decomposition import PCA from sklearn.linear_model import LogisticRegression import matplotlib.pyplot as plt import numpy import pandas as pd import numpy as np
# Load the seeds dataset: one sample per line, seven float features plus a
# 1-based class label, all separated by tabs.
rows = []
with open('seeds_dataset.txt', 'r') as fp:  # 'with' guarantees the file is closed even on error
    for line in fp:
        line = line.strip('\n')  # drop the trailing newline
        rows.append(line.split('\t'))  # fields are tab-separated (the old comment wrongly said spaces)
ls = numpy.array(rows, dtype=float)  # convert the list of string rows to a float ndarray
print(ls)

# Wrap the array in a DataFrame so columns can be selected with .loc below
ls_pd = pd.DataFrame(ls)
把数据打乱
from sklearn.utils import shuffle

# Shuffle the rows so the later sequential (shuffle=False) train/test split
# is not ordered by class.
# FIX: pass a fixed random_state so runs are reproducible; previously every
# run produced a different split and therefore different metrics.
ls_pd = shuffle(ls_pd, random_state=0)

# Column 7 holds the class label (values 1-3); columns 0-6 are the features.
y = ls_pd.loc[:, 7]
对数据进行预处理
from sklearn.preprocessing import StandardScaler sc = StandardScaler() data_std = sc.fit_transform(ls_pd.loc[:,1:6])
PCA特征降维
def plot_PCA(*data):
    """Project the samples to 2-D with PCA, scatter-plot them by class
    label, and return the projected points.

    Parameters: data = (X, y), where X is the standardized feature matrix
    and y holds the integer class labels aligned with X's rows.
    Returns: the (n_samples, 2) PCA-transformed array.

    (The original docstring mentioned KernelPCA and kept an unused list of
    kernel names; plain PCA is what is actually used.)
    """
    X, y = data
    # One RGB colour per class label (supports up to 10 classes)
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))
    pca = PCA(n_components=2)
    pca.fit(X)
    X_r = pca.transform(X)  # project the data onto the first two principal components
    for label, color in zip(np.unique(y), colors):
        position = y == label  # boolean mask selecting this class's samples
        plt.scatter(X_r[position, 0], X_r[position, 1],
                    label="target= %d" % label, color=color)
    plt.suptitle("PCA")
    plt.show()
    return X_r


reduced_x = plot_PCA(data_std, y)
进行one-hot编码
y_onehot = list() for value in y: letter = [0 for _ in range(3)] letter[int(value)-1] = 1 y_onehot.append(letter) print(y_onehot)
对数据集进行分割
from sklearn.model_selection import train_test_split x_train, x_test = train_test_split(reduced_x, test_size=0.2, shuffle = False) y_train, y_test = train_test_split(y, test_size=0.2, shuffle = False) y_train_onehot, y_test_onehot = train_test_split(y_onehot, test_size=0.2, shuffle = False) y_onehot = np.array(y_onehot) y_train_onehot, y_test_onehot = train_test_split(y_onehot, test_size=0.2, shuffle = False)
ANN分类
from tensorflow import keras
建立模型
model = keras.Sequential([ keras.layers.Dense(500,activation='relu',input_shape=[2]),#输入特征数目为4 keras.layers.Dense(500,activation='relu'), keras.layers.Dense(250,activation='relu'), keras.layers.Dense(250,activation='relu'), keras.layers.Dense(3, activation='softmax')])#输出的类别为3个,所以输出层3个节点
编译模型
# 编译模型,定义损失函数loss,采用的优化器optimizer为Adam model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
拟合模型
model.fit(x_train,y_train_onehot,batch_size = 32,epochs=20)#训练1000个批次,每个批次数据量为126
用模型进行预测
# Predict class probabilities for the test set; argmax(axis=1) returns, for
# each row, the index of the largest probability — i.e. the predicted class.
y_pre=model.predict(x_test).argmax(axis=1)
# Shift the 0-based argmax indices back to the dataset's 1-based labels (1-3)
y_pre = y_pre + 1
模型评估
import numpy as np import matplotlib.pyplot as plt from sklearn.metrics import accuracy_score, recall_score acu = accuracy_score(y_test, y_pre) # 准确率 recall = recall_score(y_test, y_pre, average="macro") # 召回率
画出分类结果
from matplotlib.colors import ListedColormap import matplotlib.pyplot as plt import numpy as np def plot_decision_regions(X, y, classifier,test_idx = None, resolution=0.02): #setup marker generator and colormap markers = ('s','x','o','^','v') colors = ('red','blue','lightgreen','gray','cyan') cmap = ListedColormap(colors[: len(np.unique(y))]) # plot the decision surface x1_min, x1_max = X[:,0].min() -1, X[:,0].max()+1 x2_min, x2_max = X[:,1].min() -1, X[:,1].max()+1 # X[:,k] 冒号左边表示行范围,读取所有行,冒号右边表示列范围,读取第K列 xx1, xx2 = np.meshgrid(np.arange(x1_min,x1_max,resolution), np.arange(x2_min,x2_max,resolution)) #arange(start,end,step) 返回一个一维数组 #meshgrid(x,y)产生一个以x为行,y为列的矩阵 #xx1是一个(305*235)大小的矩阵 xx1.ravel()是将所有的行放在一个行里面的长度71675的一维数组 #xx2同理 Z=classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T).argmax(axis=1) + 1 #np.array([xx1.ravel(), xx2.ravel()]) 生成了一个 (2*71675)的矩阵 # xx1.ravel() = (1,71675) #xx1.shape = (305,205) 将Z重新调整为(305,205)的格式 Z = Z.reshape(xx1.shape) plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) plt.xlim(xx1.min(), xx1.max()) plt.ylim(xx2.min(), xx2.max()) # plot class samples print(np.unique(y)) # idx = 0,1 cl = -1 1 for idx, cl in enumerate(np.unique(y)): plt.scatter(x=X[y==cl, 0], y=X[y==cl, 1], alpha=0.8, c=cmap(idx), marker = markers[idx],label = cl) #highlight test samples #增加的模块 if test_idx: X_test, y_test = X[test_idx:,:],y[test_idx:] plt.scatter(X_test[:,0],X_test[:,1],c='',edgecolors='0', alpha=1.0, linewidths=1,marker='o', s=55, label='test set')
plot_decision_regions(reduced_x, y, classifier=model, test_idx=167) plt.legend(loc='upper left') plt.tight_layout() #紧凑显示图片,居中显示;避免出现叠影 plt.show()
处理后的结果如下图: