# 机器学习之PyTorch和Scikit-Learn第4章 构建优秀的训练数据集 - 数据预处理Part 2

## 将数据集划分为训练集和测试集

df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/'
    'machine-learning-databases/wine/wine.data',
    header=None
)


>>> df_wine = pd.read_csv('https://archive.ics.uci.edu/'
...                       'ml/machine-learning-databases/'
...                       'wine/wine.data', header=None)
>>> df_wine.columns = ['Class label', 'Alcohol',
...                    'Malic acid', 'Ash',
...                    'Alcalinity of ash', 'Magnesium',
...                    'Total phenols', 'Flavanoids',
...                    'Nonflavanoid phenols',
...                    'Proanthocyanins',
...                    'Color intensity', 'Hue',
...                    'OD280/OD315 of diluted wines',
...                    'Proline']
>>> print('Class labels', np.unique(df_wine['Class label']))
Class labels [1 2 3]


>>> from sklearn.model_selection import train_test_split
>>> X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
>>> X_train, X_test, y_train, y_test =\
...     train_test_split(X, y,
...                      test_size=0.3,
...                      random_state=0,
...                      stratify=y)


## 使特征处于同一量级

>>> from sklearn.preprocessing import MinMaxScaler
>>> mms = MinMaxScaler()
>>> X_train_norm = mms.fit_transform(X_train)
>>> X_test_norm = mms.transform(X_test)


0.0 -1.46385 0.0
1.0 -0.87831 0.2
2.0 -0.29277 0.4
3.0 0.29277 0.6
4.0 0.87831 0.8
5.0 1.46385 1.0

>>> ex = np.array([0, 1, 2, 3, 4, 5])
>>> print('standardized:', (ex - ex.mean()) / ex.std())
standardized: [-1.46385011  -0.87831007  -0.29277002  0.29277002
0.87831007  1.46385011]
>>> print('normalized:', (ex - ex.min()) / (ex.max() - ex.min()))
normalized: [ 0.  0.2  0.4  0.6  0.8  1. ]


>>> from sklearn.preprocessing import StandardScaler
>>> stdsc = StandardScaler()
>>> X_train_std = stdsc.fit_transform(X_train)
>>> X_test_std = stdsc.transform(X_test)


scikit-learn中还有其它用于特征缩放的高阶方法，如RobustScaler。在处理包含离群数据的小数据集时推荐使用RobustScaler。同样，如果对数据集应用的机器学习算法偏向过拟合，RobustScaler也会是个好选择。RobustScaler对每个特征列独立操作，删除中间值并根据数据集的第一个和第三个四分位（即25%和75%处）缩放数据，这样极值和离群值就不那么显著了。感兴趣的读者可以在scikit-learn的官方文档中阅读RobustScaler的详细介绍：https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html

## 选择有意义的特征

• 采集更多的训练数据
• 通过正则化引入对复杂性的惩罚
• 选择更少参数的更简单模型
• 降低数据的维度

### L1正则化的稀疏解

L1正则化与稀疏性

L1正则为什么会导致稀疏解的数学细节不在本书讨论范畴内。如果读者对此感兴趣，可阅读Trevor Hastie、Robert Tibshirani和Jerome Friedman所著《统计学习基础》（施普林格科学与商业媒体，2009年）的第3.4节中关于L2对比L1正则化的精彩讲解。

>>> from sklearn.linear_model import LogisticRegression
>>> LogisticRegression(penalty='l1',
...                    solver='liblinear',
...                    multi_class='ovr')


>>> lr = LogisticRegression(penalty='l1',
...                         C=1.0,
...                         solver='liblinear',
...                         multi_class='ovr')
>>> # Note that C=1.0 is the default. You can increase
>>> # or decrease it to make the regularization effect
>>> # stronger or weaker, respectively.
>>> lr.fit(X_train_std, y_train)
>>> print('Training accuracy:', lr.score(X_train_std, y_train))
Training accuracy: 1.0
>>> print('Test accuracy:', lr.score(X_test_std, y_test))
Test accuracy: 1.0


>>> lr.intercept_
array([-1.26317363, -1.21537306, -2.37111954])


>>> lr.coef_
array([[ 1.24647953,  0.18050894,  0.74540443, -1.16301108,
0.        ,0.        ,  1.16243821,  0.        ,
0.        ,  0.        , 0.        ,  0.55620267,
2.50890638],
[-1.53919461, -0.38562247, -0.99565934,  0.36390047,
-0.05892612, 0.        ,  0.66710883,  0.        ,
0.        , -1.9318798 , 1.23775092,  0.        ,
-2.23280039],
[ 0.13557571,  0.16848763,  0.35710712,  0.        ,
0.        , 0.        , -2.43804744,  0.        ,
0.        ,  1.56388787, -0.81881015, -0.49217022,
0.        ]])


>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = plt.subplot(111)
>>> colors = ['blue', 'green', 'red', 'cyan',
...           'magenta', 'yellow', 'black',
...           'pink', 'lightgreen', 'lightblue',
...           'gray', 'indigo', 'orange']
>>> weights, params = [], []
>>> for c in np.arange(-4., 6.):
...     lr = LogisticRegression(penalty='l1', C=10.**c,
...                             solver='liblinear',
...                             multi_class='ovr', random_state=0)
...     lr.fit(X_train_std, y_train)
...     weights.append(lr.coef_[1])
...     params.append(10**c)
>>> weights = np.array(weights)
>>> for column, color in zip(range(weights.shape[1]), colors):
...     plt.plot(params, weights[:, column],
...              label=df_wine.columns[column + 1],
...              color=color)
>>> plt.axhline(0, color='black', linestyle='--', linewidth=3)
>>> plt.xlim([10**(-5), 10**5])
>>> plt.ylabel('Weight coefficient')
>>> plt.xlabel('C (inverse regularization strength)')
>>> plt.xscale('log')
>>> plt.legend(loc='upper left')
>>> ax.legend(loc='upper center',
...           bbox_to_anchor=(1.38, 1.03),
...           ncol=1, fancybox=True)
>>> plt.show()


### 序列特征选择算法

SBS算法背后的思想非常简单：SBS依次从全部特征子集中删除特征，直至新特征子空间包含所需的特征数。要决定在各阶段删除哪个特征，我们需要定义一个我们希望最小化的判别函数 $J$。

1. 通过 $k = d$ 初始化算法，其中 $d$ 是完整特征空间 $\mathbf{X}_d$ 的维度。
2. 确定最大化标准的特征 $x^-$：$x^- = \operatorname{argmax} J(\mathbf{X}_k - x)$，其中 $x \in \mathbf{X}_k$。
3. 从特征集中删除特征 $x^-$：$\mathbf{X}_{k-1} = \mathbf{X}_k - x^-$；$k = k - 1$。
4. 如果 $k$ 等于所需特征数则终止，否则回到步骤2。

from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
class SBS:
    """Sequential Backward Selection (SBS).

    Greedily removes one feature at a time from the full feature set
    until only ``k_features`` remain.  At each step it keeps the
    subset that maximizes ``scoring`` on an internal validation split.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Classifier to evaluate feature subsets with (cloned internally).
    k_features : int
        Target number of features to stop at.
    scoring : callable, default=accuracy_score
        Metric ``scoring(y_true, y_pred)`` used to rank subsets.
    test_size : float, default=0.25
        Fraction of the data held out as the internal validation set.
    random_state : int, default=1
        Seed for the internal train/validation split.
    """

    def __init__(self, estimator, k_features,
                 scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        # Clone so that fitting inside SBS never mutates the caller's
        # estimator instance.
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        """Run the backward search on (X, y).

        Records every visited subset in ``self.subsets_`` and its
        validation score in ``self.scores_``; the final subset's column
        indices end up in ``self.indices_`` and its score in
        ``self.k_score_``.  Returns ``self``.
        """
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=self.test_size,
                             random_state=self.random_state)

        dim = X_train.shape[1]
        # Start from the full feature set.
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train,
                                 X_test, y_test, self.indices_)
        self.scores_ = [score]

        while dim > self.k_features:
            scores = []
            subsets = []

            # Evaluate every subset obtained by dropping exactly one
            # feature from the current subset.
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train,
                                         X_test, y_test, p)
                scores.append(score)
                subsets.append(p)

            # Keep the best-scoring (dim - 1)-feature subset.
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1

            self.scores_.append(scores[best])

        self.k_score_ = self.scores_[-1]

        return self

    def transform(self, X):
        """Return X restricted to the finally selected feature columns."""
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit on the given columns and score on the validation split."""
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score


fit方法的while循环内部，对由itertools.combinations函数创建的特征子集逐一进行评估，直至特征子集缩减到所需的维度。在每次迭代中，根据内部创建的测试数据集X_test上的准确度打分，将最佳子集的分数收入列表self.scores_中。稍后我们会使用这些分数评估结果。最终特征子集的列索引赋值给self.indices_，可通过transform方法使用它返回只保留已选特征列的新数组。注意我们并没有在fit方法内部显式地计算判别标准，而只是删除了不在最佳性能特征子集中的特征。

>>> import matplotlib.pyplot as plt
>>> from sklearn.neighbors import KNeighborsClassifier
>>> knn = KNeighborsClassifier(n_neighbors=5)
>>> sbs = SBS(knn, k_features=1)
>>> sbs.fit(X_train_std, y_train)


SBS算法收集每个阶段的最佳特征子集，所以我们进入实现中更有意思的部分，并绘制KNN分类器对验证数据集所计算的分类准确度。代码如下：

>>> k_feat = [len(k) for k in sbs.subsets_]
>>> plt.plot(k_feat, sbs.scores_, marker='o')
>>> plt.ylim([0.7, 1.02])
>>> plt.ylabel('Accuracy')
>>> plt.xlabel('Number of features')
>>> plt.grid()
>>> plt.tight_layout()
>>> plt.show()


>>> k3 = list(sbs.subsets_[10])
>>> print(df_wine.columns[1:][k3])
Index(['Alcohol', 'Malic acid', 'OD280/OD315 of diluted wines'], dtype='object')


>>> knn.fit(X_train_std, y_train)
>>> print('Training accuracy:', knn.score(X_train_std, y_train))
Training accuracy: 0.967741935484
>>> print('Test accuracy:', knn.score(X_test_std, y_test))
Test accuracy: 0.962962962963


>>> knn.fit(X_train_std[:, k3], y_train)
>>> print('Training accuracy:',
...       knn.score(X_train_std[:, k3], y_train))
Training accuracy: 0.951612903226
>>> print('Test accuracy:',
...       knn.score(X_test_std[:, k3], y_test))
Test accuracy: 0.925925925926


scikit-learn中的特征选择算法

## 使用随机森林评估特征重要性

>>> from sklearn.ensemble import RandomForestClassifier
>>> feat_labels = df_wine.columns[1:]
>>> forest = RandomForestClassifier(n_estimators=500,
...                                 random_state=1)
>>> forest.fit(X_train, y_train)
>>> importances = forest.feature_importances_
>>> indices = np.argsort(importances)[::-1]
>>> for f in range(X_train.shape[1]):
...     print("%2d) %-*s %f" % (f + 1, 30,
...                             feat_labels[indices[f]],
...                             importances[indices[f]]))
>>> plt.title('Feature importance')
>>> plt.bar(range(X_train.shape[1]),
...         importances[indices],
...         align='center')
>>> plt.xticks(range(X_train.shape[1]),
...            feat_labels[indices], rotation=90)
>>> plt.xlim([-1, X_train.shape[1]])
>>> plt.tight_layout()
>>> plt.show()
1) Proline                         0.185453
2) Flavanoids                      0.174751
3) Color intensity                 0.143920
4) OD280/OD315 of diluted wines    0.136162
5) Alcohol                         0.118529
6) Hue                             0.058739
7) Total phenols                   0.050872
8) Magnesium                       0.031357
9) Malic acid                      0.025648
10) Proanthocyanins                0.025570
11) Alcalinity of ash              0.022366
12) Nonflavanoid phenols           0.013354
13) Ash                            0.013279


>>> from sklearn.feature_selection import SelectFromModel
>>> sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
>>> X_selected = sfm.transform(X_train)
>>> print('Number of features that meet this threshold',
...       'criterion:', X_selected.shape[1])
Number of features that meet this threshold criterion: 5
>>> for f in range(X_selected.shape[1]):
...     print("%2d) %-*s %f" % (f + 1, 30,
...                             feat_labels[indices[f]],
...                             importances[indices[f]]))
1) Proline                         0.185453
2) Flavanoids                      0.174751
3) Color intensity                 0.143920
4) OD280/OD315 of diluted wines    0.136162
5) Alcohol                         0.118529


## 小结

|
16天前
|

34 1
|
1天前
|

【机器学习】Qwen2大模型原理、训练及推理部署实战
【机器学习】Qwen2大模型原理、训练及推理部署实战
7 0
|
14天前
|

【7月更文挑战第31天】机器学习已深深嵌入日常生活，从智能推荐到自动驾驶皆为其应用。PyTorch作为一个开源库，凭借简洁API、动态计算图及GPU加速能力，降低了学习门槛并提高了开发效率。通过一个使用PyTorch构建简单CNN识别MNIST手写数字的例子，展现了如何快速搭建神经网络。随着技能提升，开发者能运用PyTorch及其丰富的生态系统（如torchvision、torchtext和torchaudio）应对复杂场景，如自然语言处理和强化学习。掌握PyTorch，意味着掌握了数据时代的关键技能。
10 1
|
27天前
|

【7月更文第18天】在当今数据驱动的世界中，机器学习已成为解锁数据潜力的关键。Python凭借其简洁的语法和丰富的库生态，成为数据科学家和机器学习工程师的首选语言。而在Python的众多机器学习库中，Scikit-learn以其全面、高效、易用的特点，被誉为机器学习领域的“瑞士军刀”。本文旨在深入探讨Scikit-learn的核心概念、实用功能，并通过实战代码示例，带你领略其强大之处。
60 12
|
1天前
|

11 0
|
5天前
|

【python机器学习】python电商数据K-Means聚类分析可视化（源码+数据集+报告）【独一无二】
【python机器学习】python电商数据K-Means聚类分析可视化（源码+数据集+报告）【独一无二】
14 0
|
16天前
|

21 0
|
1月前
|

Java中的机器学习模型集成与训练
Java中的机器学习模型集成与训练
25 0
|
1天前
|

AI智能体研发之路-模型篇（五）：pytorch vs tensorflow框架DNN网络结构源码级对比
AI智能体研发之路-模型篇（五）：pytorch vs tensorflow框架DNN网络结构源码级对比
9 1