07 数据归一化处理
import numpy as np import matplotlib.pyplot as plt
最值归一化Normalization
# --- Min-max normalization of a 1-D vector ---
x = np.random.randint(1, 100, size=100)
x_low, x_high = x.min(), x.max()
(x - x_low) / (x_high - x_low)  # min-max scaling: smallest entry -> 0, largest -> 1

# --- Min-max normalization of a matrix, applied to each column independently ---
# Cast to float first so the scaled values are not truncated to integers.
X = np.random.randint(0, 100, (50, 2)).astype(float)
col_low = X.min(axis=0)
col_high = X.max(axis=0)
X = (X - col_low) / (col_high - col_low)

plt.scatter(X[:, 0], X[:, 1])
plt.show()
# Inspect the statistics of the min-max normalized matrix.
# The "Out[..]" values are the outputs recorded in the original session.
X[:, 0].mean()  # mean of the first column
# Out[13]:
# 0.55073684210526319

X[:, 0].std()  # standard deviation of the first column
# Out[14]:
# 0.29028548370502699

X[:, 1].mean()  # mean of the second column
# Out[15]:
# 0.50515463917525782

X[:, 1].std()  # standard deviation of the second column
# Out[16]:
# 0.29547909688276441
均值方差归一化Standardization
# Standardization: subtract each column's mean and divide by its standard deviation.
X2 = np.random.randint(0, 100, (50, 2)).astype(float)
for col in range(X2.shape[1]):
    column = X2[:, col]
    # The right-hand side is fully evaluated before the column is overwritten.
    X2[:, col] = (column - column.mean()) / column.std()

plt.scatter(X2[:, 0], X2[:, 1])
plt.show()
# Inspect the statistics of the standardized matrix.
# The "Out[..]" values are the outputs recorded in the original session.
X2[:, 0].mean()  # mean of the first column
# Out[24]:
# -3.9968028886505634e-17

X2[:, 0].std()  # standard deviation of the first column
# Out[25]:
# 0.99999999999999989

X2[:, 1].mean()  # mean of the second column
# Out[26]:
# -3.552713678800501e-17

X2[:, 1].std()  # standard deviation of the second column
# Out[27]:
# 1.0
注意对测试数据集的归一化方法:由于测试数据集模拟的是真实环境中的数据,在实际应用中可能一次只有一个样本,此时对其自身求均值和标准差没有意义,所以测试数据应减去训练数据的均值,再除以训练数据的标准差,即 (X_test - mean_train) / std_train。
08 Scikit-Learn中的Scaler
Scikit-Learn为数据归一化操作提供了专门的类(Scaler)。和kNN类的对象一样,需要先对训练数据执行fit操作,再调用transform执行归一化操作,如下:
数据准备
# Load the iris dataset and hold out 20% of the samples as a test set.
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X, y = iris.data, iris.target

# A fixed random_state is passed so the split uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)
scikit-learn中的StandardScaler
# Standardize the iris features with scikit-learn's StandardScaler, then
# compare kNN accuracy on standardized versus raw test data.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data only.
standardScaler = StandardScaler()
standardScaler.fit(X_train)
# Out:
# StandardScaler(copy=True, with_mean=True, with_std=True)

# Inspect the statistics the scaler computed during fit.
standardScaler.mean_
# Out[9]:
# array([ 5.83416667, 3.0825 , 3.70916667, 1.16916667])

standardScaler.scale_  # standard deviation
# Out[10]:
# array([ 0.81019502, 0.44076874, 1.76295187, 0.75429833])

X_train = standardScaler.transform(X_train)
# Once the training data is standardized, the test data must be transformed
# with the same fitted scaler.
X_test_standard = standardScaler.transform(X_test)

knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
# Out[17]:
# KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
#                      metric_params=None, n_jobs=1, n_neighbors=3, p=2,
#                      weights='uniform')

knn_clf.score(X_test_standard, y_test)
# Out[18]:
# 1.0

# Deliberately wrong, kept as a demonstration: the classifier was trained on
# standardized features, so scoring it on the raw test set gives a misleading
# result. Test data must be scaled the same way as the training data.
knn_clf.score(X_test, y_test)
# Out[19]:
# 0.33333333333333331