# 十三、交叉验证和得分方法

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

# BUG FIX: `iris` was referenced below without ever being loaded.
iris = load_iris()
X, y = iris.data, iris.target

classifier = KNeighborsClassifier()


y


import numpy as np
rng = np.random.RandomState(0)

permutation = rng.permutation(len(X))
X, y = X[permutation], y[permutation]
print(y)


# Manual k-fold cross-validation: split the data into k contiguous folds,
# train on k-1 folds and evaluate on the held-out fold.
k = 5
n_samples = len(X)
fold_size = n_samples // k
scores = []
masks = []
for fold in range(k):
    # boolean mask selecting this fold's test samples
    test_mask = np.zeros(n_samples, dtype=bool)
    test_mask[fold * fold_size : (fold + 1) * fold_size] = True
    # store the mask for visualization
    masks.append(test_mask)
    # use the mask to build the train/test split
    X_test, y_test = X[test_mask], y[test_mask]
    X_train, y_train = X[~test_mask], y[~test_mask]
    # fit the classifier on the remaining folds
    classifier.fit(X_train, y_train)
    # compute the score on the held-out fold and record it
    scores.append(classifier.score(X_test, y_test))


import matplotlib.pyplot as plt
%matplotlib inline


print(scores)
print(np.mean(scores))


from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, X, y)
print('Scores on each CV fold: %s' % scores)
print('Mean score: %0.3f' % np.mean(scores))


cross_val_score(classifier, X, y, cv=5)


from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit


StratifiedKFold也消除了我们打乱鸢尾花的需要。 让我们看看在未打乱的鸢尾花数据集上，它生成什么类型的折叠。 每个交叉验证类都是训练和测试索引的集合的生成器：

# Each CV splitter is a generator of (train_indices, test_indices) pairs;
# print the test indices of every stratified fold.
cv = StratifiedKFold(n_splits=5)
for train, test in cv.split(iris.data, iris.target):
    print(test)


def plot_cv(cv, features, labels):
    """Visualize a CV strategy as one row of boolean test-fold masks per split.

    NOTE(review): the original body was truncated; this reconstruction builds
    a True-on-test-indices mask per split and shows the stack with matshow.
    """
    masks = []
    for train, test in cv.split(features, labels):
        # mask is True exactly at this split's test indices
        mask = np.zeros(len(labels), dtype=bool)
        mask[test] = True
        masks.append(mask)
    plt.matshow(masks, cmap='gray_r')
plot_cv(StratifiedKFold(n_splits=5), iris.data, iris.target)


plot_cv(KFold(n_splits=5), iris.data, iris.target)


plot_cv(KFold(n_splits=10), iris.data, iris.target)


plot_cv(ShuffleSplit(n_splits=5, test_size=.2), iris.data, iris.target)


plot_cv(ShuffleSplit(n_splits=20, test_size=.2), iris.data, iris.target)


cv = ShuffleSplit(n_splits=5, test_size=.2)
cross_val_score(classifier, X, y, cv=cv)


# %load solutions/13_cross_validation.py


# 十四、参数选择、验证和测试

## 超参数、过拟合和欠拟合

from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor
# 生成玩具数据集
x = np.linspace(-3, 3, 100)
rng = np.random.RandomState(42)
y = np.sin(4 * x) + x + rng.normal(size=len(x))
X = x[:, np.newaxis]

cv = KFold(shuffle=True)

# Run cross-validation for every hyper-parameter setting.
for n_neighbors in [1, 3, 5, 10, 20]:
    scores = cross_val_score(KNeighborsRegressor(n_neighbors=n_neighbors), X, y, cv=cv)
    print("n_neighbors: %d, average score: %f" % (n_neighbors, np.mean(scores)))


scikit-learn 中有一个函数，称为 validation_curve，用于重现上面的卡通图。 它根据训练和验证误差（使用交叉验证）绘制一个参数，例如邻居的数量：

from sklearn.model_selection import validation_curve
n_neighbors = [1, 3, 5, 10, 20, 50]
train_scores, test_scores = validation_curve(KNeighborsRegressor(), X, y, param_name="n_neighbors",
param_range=n_neighbors, cv=cv)
plt.plot(n_neighbors, train_scores.mean(axis=1), label="train accuracy")
plt.plot(n_neighbors, test_scores.mean(axis=1), label="test accuracy")

plt.ylabel('Accuracy')
plt.xlabel('Number of neighbors')
plt.xlim([50, 0])
plt.legend(loc="best");


from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVR

# Run cross-validation over the full C x gamma grid.
for C in [0.001, 0.01, 0.1, 1, 10]:
    for gamma in [0.001, 0.01, 0.1, 1]:
        scores = cross_val_score(SVR(C=C, gamma=gamma), X, y, cv=cv)
        print("C: %f, gamma: %f, average score: %f" % (C, gamma, np.mean(scores)))


from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]}

grid = GridSearchCV(SVR(), param_grid=param_grid, cv=cv, verbose=3, return_train_score=True)


GridSearchCV的一大优点是它是一个元估计器。 它需要像上面的 SVR 这样的估计器，并创建一个新的估计器，其行为完全相同 - 在这种情况下，就像一个回归器。 所以我们可以调用它的fit来训练：

grid.fit(X, y)


fit所做的比我们上面做的复杂得多。 首先，它使用交叉验证运行相同的循环，来找到最佳参数组合。 一旦它具有最佳组合，它在所有传给fit的数据上再次执行fit（无交叉验证），来使用最佳参数设置构建单个新模型。

grid.predict(X)


print(grid.best_score_)

print(grid.best_params_)


type(grid.cv_results_)

print(grid.cv_results_.keys())

import pandas as pd

cv_results = pd.DataFrame(grid.cv_results_)

cv_results_tiny = cv_results[['param_C', 'param_gamma', 'mean_test_score']]


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]}
cv = KFold(n_splits=10, shuffle=True)

grid = GridSearchCV(SVR(), param_grid=param_grid, cv=cv)

grid.fit(X_train, y_train)
grid.score(X_test, y_test)


grid.best_params_


from sklearn.model_selection import train_test_split, ShuffleSplit

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]}
single_split_cv = ShuffleSplit(n_splits=1)

grid = GridSearchCV(SVR(), param_grid=param_grid, cv=single_split_cv, verbose=3)

grid.fit(X_train, y_train)
grid.score(X_test, y_test)


clf = GridSearchCV(SVR(), param_grid=param_grid)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)


# 十五、估计器流水线

## 特征提取：向量化器

import os

# Read the SMS spam collection: each line is "<label>\t<message>".
with open(os.path.join("datasets", "smsspam", "SMSSpamCollection")) as f:
    lines = [line.strip().split("\t") for line in f.readlines()]
text = [x[1] for x in lines]           # the message bodies
y = [x[0] == "ham" for x in lines]     # True for "ham", False for "spam"

from sklearn.model_selection import train_test_split

text_train, text_test, y_train, y_test = train_test_split(text, y)


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer()
vectorizer.fit(text_train)

X_train = vectorizer.transform(text_train)
X_test = vectorizer.transform(text_test)

clf = LogisticRegression()
clf.fit(X_train, y_train)

clf.score(X_test, y_test)


from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipeline.fit(text_train, y_train)
pipeline.score(text_test, y_test)


# This illustrates a common mistake. Don't use this code!
from sklearn.model_selection import GridSearchCV

vectorizer = TfidfVectorizer()
vectorizer.fit(text_train)

X_train = vectorizer.transform(text_train)
X_test = vectorizer.transform(text_test)

clf = LogisticRegression()
grid = GridSearchCV(clf, param_grid={'C': [.1, 1, 10, 100]}, cv=5)
grid.fit(X_train, y_train)


## 我们哪里做错了？

from sklearn.model_selection import GridSearchCV

pipeline = make_pipeline(TfidfVectorizer(),
LogisticRegression())

grid = GridSearchCV(pipeline,
param_grid={'logisticregression__C': [.1, 1, 10, 100]}, cv=5)

grid.fit(text_train, y_train)
grid.score(text_test, y_test)


from sklearn.model_selection import GridSearchCV

pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())

params = {'logisticregression__C': [.1, 1, 10, 100],
"tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (2, 2)]}
grid = GridSearchCV(pipeline, param_grid=params, cv=5)
grid.fit(text_train, y_train)
print(grid.best_params_)
grid.score(text_test, y_test)


# %load solutions/15A_ridge_grid.py


# 十六、模型评估、得分指标和处理不平衡类别

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
np.set_printoptions(precision=2)

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.datasets import load_digits

# BUG FIX: `digits` was referenced without ever being loaded.
digits = load_digits()

X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=1,
                                                    stratify=y,
                                                    test_size=0.25)

classifier = LinearSVC(random_state=1).fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

print("Accuracy: {}".format(classifier.score(X_test, y_test)))


from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_test_pred)


plt.matshow(confusion_matrix(y_test, y_test_pred), cmap="Blues")
plt.colorbar(shrink=0.8)
plt.xticks(range(10))
plt.yticks(range(10))
plt.xlabel("Predicted label")
plt.ylabel("True label");


Precision = TP / (TP + FP)


Recall = TP / (TP + FN)


F1 得分是二者的调和均值：

F1 = 2 x (precision x recall) / (precision + recall)


from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred))


• 不平衡类别，即一个类可能比另一个类更频繁。
• 非对称成本，即一种错误比另一种更“昂贵”。

np.bincount(y) / y.shape[0]


X, y = digits.data, digits.target == 3


from sklearn.dummy import DummyClassifier
# Baseline: always predict the most frequent class. Pass `strategy` by
# keyword -- positional use is deprecated/removed in modern scikit-learn.
cross_val_score(DummyClassifier(strategy="most_frequent"), X, y)


np.bincount(y) / y.shape[0]


## ROC 曲线

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC  # BUG FIX: SVC was used below but never imported

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Plot one ROC curve per gamma setting of an RBF SVM.
plt.figure()
for gamma in [.05, 0.1, 0.5]:
    svm = SVC(gamma=gamma).fit(X_train, y_train)
    decision_function = svm.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, decision_function)
    acc = svm.score(X_test, y_test)
    auc = roc_auc_score(y_test, decision_function)
    label = "gamma: %0.3f, acc:%.2f auc:%.2f" % (gamma, acc, auc)
    plt.plot(fpr, tpr, label=label, linewidth=3)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (recall)")
plt.legend(loc="best");


from sklearn.model_selection import cross_val_score
cross_val_score(SVC(gamma='auto'), X, y, scoring="roc_auc", cv=5)


## 内建和自定义的得分函数

from sklearn.metrics.scorer import SCORERS
print(SCORERS.keys())


def my_accuracy_scoring(est, X, y):
    """Custom scorer: fraction of samples where est.predict(X) equals y."""
    return np.mean(est.predict(X) == y)

cross_val_score(SVC(), X, y, scoring=my_accuracy_scoring)


ACC = (TP + TN) / n


ACC = T / N


y_true = np.array([0, 0, 0, 1, 1, 1, 1, 1, 2, 2])
y_pred = np.array([0, 1, 1, 0, 1, 1, 2, 2, 2, 2])

confusion_matrix(y_true, y_pred)



# 十七、深入：线性模型

## 用于回归的线性模型

y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_


from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y, true_coefficient = make_regression(n_samples=200, n_features=30, n_informative=10, noise=100, coef=True, random_state=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5, train_size=60, test_size=140)
print(X_train.shape)
print(y_train.shape)


## 线性回归

from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression().fit(X_train, y_train)
print("R^2 on training set: %f" % linear_regression.score(X_train, y_train))
print("R^2 on test set: %f" % linear_regression.score(X_test, y_test))

from sklearn.metrics import r2_score
print(r2_score(np.dot(X, true_coefficient), y))

plt.figure(figsize=(10, 5))
coefficient_sorting = np.argsort(true_coefficient)[::-1]
plt.plot(true_coefficient[coefficient_sorting], "o", label="true")
plt.plot(linear_regression.coef_[coefficient_sorting], "o", label="linear regression")

plt.legend()

from sklearn.model_selection import learning_curve

def plot_learning_curve(est, X, y):
    """Plot mean train/test scores of `est` for growing training-set sizes.

    Uses sklearn's learning_curve over 20 sizes from 10% to 100% of the data.
    """
    training_set_size, train_scores, test_scores = learning_curve(
        est, X, y, train_sizes=np.linspace(.1, 1, 20))
    estimator_name = est.__class__.__name__
    line = plt.plot(training_set_size, train_scores.mean(axis=1), '--',
                    label="training scores " + estimator_name)
    # reuse the training-curve color so the pair reads as one estimator
    plt.plot(training_set_size, test_scores.mean(axis=1), '-',
             label="test scores " + estimator_name, c=line[0].get_color())
    plt.xlabel('Training set size')
    plt.legend(loc='best')
    plt.ylim(-0.1, 1.1)

plt.figure()
plot_learning_curve(LinearRegression(), X, y)


## 岭回归（L2 惩罚）

from sklearn.linear_model import Ridge
ridge_models = {}
training_scores = []
test_scores = []

# Fit a Ridge model per alpha and record train/test scores.
for alpha in [100, 10, 1, .01]:
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    training_scores.append(ridge.score(X_train, y_train))
    test_scores.append(ridge.score(X_test, y_test))
    ridge_models[alpha] = ridge

plt.figure()
plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [100, 10, 1, .01])
plt.xlabel('alpha')
plt.legend(loc="best")

plt.figure(figsize=(10, 5))
plt.plot(true_coefficient[coefficient_sorting], "o", label="true", c='b')

for i, alpha in enumerate([100, 10, 1, .01]):
plt.plot(ridge_models[alpha].coef_[coefficient_sorting], "o", label="alpha = %.2f" % alpha, c=plt.cm.viridis(i / 3.))

plt.legend(loc="best")


plt.figure()
plot_learning_curve(LinearRegression(), X, y)
plot_learning_curve(Ridge(alpha=10), X, y)


## Lasso（L1 惩罚）

Lasso估计器可用于对系数施加稀疏性。 换句话说，如果我们认为许多特征不相关，那么我们会更喜欢它。 这是通过所谓的 l1 惩罚来完成的。

from sklearn.linear_model import Lasso

lasso_models = {}
training_scores = []
test_scores = []

# Fit a Lasso model per alpha and record train/test scores.
for alpha in [30, 10, 1, .01]:
    lasso = Lasso(alpha=alpha).fit(X_train, y_train)
    training_scores.append(lasso.score(X_train, y_train))
    test_scores.append(lasso.score(X_test, y_test))
    lasso_models[alpha] = lasso
plt.figure()
plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [30, 10, 1, .01])
plt.legend(loc="best")

plt.figure(figsize=(10, 5))
plt.plot(true_coefficient[coefficient_sorting], "o", label="true", c='b')

for i, alpha in enumerate([30, 10, 1, .01]):
plt.plot(lasso_models[alpha].coef_[coefficient_sorting], "o", label="alpha = %.2f" % alpha, c=plt.cm.viridis(i / 3.))

plt.legend(loc="best")

plt.figure(figsize=(10, 5))
plot_learning_curve(LinearRegression(), X, y)
plot_learning_curve(Ridge(alpha=10), X, y)
plot_learning_curve(Lasso(alpha=10), X, y)


## 用于分类的线性模型

y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_ > 0


## LinearSVC中C的影响

LinearSVC中，C参数控制模型中的正则化。

from figures import plot_linear_svc_regularization
plot_linear_svc_regularization()


与 Ridge/Lasso 的划分类似，你可以将 penalty 参数设置为 'l1' 来强制系数的稀疏性（类似于 Lasso），或设置为 'l2' 来鼓励更小的系数（类似于 Ridge）。

## 多类线性分类

from sklearn.datasets import make_blobs
plt.figure()
X, y = make_blobs(random_state=42)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(y))

from sklearn.svm import LinearSVC
linear_svm = LinearSVC().fit(X, y)
print(linear_svm.coef_.shape)
print(linear_svm.intercept_.shape)

plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(y))
line = np.linspace(-15, 15)
for coef, intercept in zip(linear_svm.coef_, linear_svm.intercept_):
    # each (coef, intercept) row defines one class's decision boundary:
    # coef[0]*x + coef[1]*y + intercept = 0  =>  y = -(coef[0]*x + intercept)/coef[1]
    plt.plot(line, -(line * coef[0] + intercept) / coef[1])
plt.ylim(-10, 15)
plt.xlim(-10, 8);


from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

# BUG FIX: `digits` was used without being loaded.
digits = load_digits()
X_digits, y_digits = digits.data, digits.target

# split the dataset, apply grid-search



# 十八、深入：决策树与森林

• 动物是大于还是小于一米？
• 较大：动物有角吗？
• 是的：角长是否超过十厘米？
• 不是：动物有项圈吗？
• 较小：动物有两条腿还是四条腿？
• 二：动物有翅膀吗？
• 四：动物有浓密的尾巴吗？

## 决策树回归

from figures import make_dataset
x, y = make_dataset()
X = x.reshape(-1, 1)

plt.figure()
plt.xlabel('Feature X')
plt.ylabel('Target y')
plt.scatter(X, y);

from sklearn.tree import DecisionTreeRegressor

reg = DecisionTreeRegressor(max_depth=5)
reg.fit(X, y)

X_fit = np.linspace(-3, 3, 1000).reshape((-1, 1))
y_fit_1 = reg.predict(X_fit)

plt.figure()
plt.plot(X_fit.ravel(), y_fit_1, color='tab:blue', label="prediction")
plt.plot(X.ravel(), y, 'C7.', label="training data")
plt.legend(loc="best");


## 决策树分类

from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from figures import plot_2d_separator
from figures import cm2

X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

plt.figure()
plot_2d_separator(clf, X, fill=True)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm2, s=60, alpha=.7, edgecolor='k')
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm2, s=60, edgecolor='k');


from figures import plot_tree
max_depth = 3
plot_tree(max_depth=max_depth)


## 随机森林

from figures import plot_forest
max_depth = 3
plot_forest(max_depth=max_depth)


## 通过交叉验证选择最优估计

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits

# BUG FIX: `digits` was used without being loaded.
digits = load_digits()
X, y = digits.data, digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=200)
parameters = {'max_features':['sqrt', 'log2', 10],
'max_depth':[5, 7, 9]}

clf_grid = GridSearchCV(rf, parameters, n_jobs=-1)
clf_grid.fit(X_train, y_train)

clf_grid.score(X_train, y_train)

clf_grid.score(X_test, y_test)


## 另一个选项：梯度提升

from sklearn.ensemble import GradientBoostingRegressor

# BUG FIX: the model was never instantiated -- `clf.fit` previously reused
# whatever estimator `clf` still pointed to from an earlier cell.
clf = GradientBoostingRegressor()
clf.fit(X_train, y_train)

print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))


from sklearn.datasets import load_digits

X_digits, y_digits = digits.data, digits.target

# split the dataset, apply grid-search



## 特征的重要性

RandomForest 和 GradientBoosting 对象在拟合之后都会提供 feature_importances_ 属性。 此属性是这些模型最强大的功能之一。 它们基本上量化了在不同树的节点中，每个特征对表现的贡献程度。

X, y = X_digits[y_digits < 2], y_digits[y_digits < 2]

rf = RandomForestClassifier(n_estimators=300, n_jobs=1)
rf.fit(X, y)
print(rf.feature_importances_)  # one value per feature

plt.figure()
plt.imshow(rf.feature_importances_.reshape(8, 8), cmap=plt.cm.viridis, interpolation='nearest')


# 十九、自动特征选择

## 单变量统计

from sklearn.datasets import load_breast_cancer, load_digits
from sklearn.model_selection import train_test_split

# BUG FIX: `cancer` was used without ever being loaded.
cancer = load_breast_cancer()

# get deterministic random numbers
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
# add noise features to the data:
# the first 30 features are from the dataset, the next 50 are noise
X_w_noise = np.hstack([cancer.data, noise])

X_train, X_test, y_train, y_test = train_test_split(X_w_noise, cancer.target,
                                                    random_state=0, test_size=.5)


from sklearn.feature_selection import SelectPercentile

# use f_classif (the default) and SelectPercentile to select 50% of features:
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
# transform training set:
X_train_selected = select.transform(X_train)

print(X_train.shape)
print(X_train_selected.shape)


from sklearn.feature_selection import f_classif, f_regression, chi2

F, p = f_classif(X_train, y_train)

plt.figure()
plt.plot(p, 'o')


mask = select.get_support()
# 展示掩码。黑色是真，白色是假


from sklearn.linear_model import LogisticRegression

# 转换测试数据
X_test_selected = select.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train, y_train)
print("Score with all features: %f" % lr.score(X_test, y_test))
lr.fit(X_train_selected, y_train)
print("Score with only selected features: %f" % lr.score(X_test_selected, y_test))


## 基于模型的特征选择

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold="median")

select.fit(X_train, y_train)
X_train_rf = select.transform(X_train)
print(X_train.shape)
print(X_train_rf.shape)

# 展示掩码。黑色是真，白色是假

X_test_rf = select.transform(X_test)
LogisticRegression().fit(X_train_rf, y_train).score(X_test_rf, y_test)


## 递归特征消除

from sklearn.feature_selection import RFE
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=40)

select.fit(X_train, y_train)
# 可视化所选特征

X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)

LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)

select.score(X_test, y_test)


import numpy as np

rng = np.random.RandomState(1)

# 在 [0,1] 范围内生成 400 个随机整数
X = rng.randint(0, 2, (200, 2))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)  # XOR creation

plt.scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(y))



# 二十、无监督学习：层次和基于密度的聚类算法

## 单个和完整链接

from sklearn.datasets import load_iris
from figures import cm3

# BUG FIX: `iris` was used without being loaded.
iris = load_iris()
# use only petal length and petal width (columns 2 and 3)
X = iris.data[:, [2, 3]]
y = iris.target
n_samples, n_features = X.shape

plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cm3)


from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram

# BUG FIX: the linkage call was truncated -- the keyword arguments were left
# dangling and `clusters` was never assigned. Complete-linkage on Euclidean
# distances matches the AgglomerativeClustering example below.
clusters = linkage(X,
                   metric='euclidean',
                   method='complete')

dendr = dendrogram(clusters)

plt.ylabel('Euclidean Distance')


from sklearn.cluster import AgglomerativeClustering

# BUG FIX: the constructor call was truncated; restore the missing
# `linkage` argument and close the call.
ac = AgglomerativeClustering(n_clusters=3,
                             affinity='euclidean',
                             linkage='complete')

prediction = ac.fit_predict(X)
print('Cluster labels: %s\n' % prediction)

plt.scatter(X[:, 0], X[:, 1], c=prediction, cmap=cm3)


## 基于密度的聚类 - DBSCAN

• 核心点：核心点是一个点，在其半径epsilon内，至少具有最小数量（MinPts）的其他点。
• 边界点：边界点是一个点，它不是核心点，因为它的邻域中没有足够的MinPts，但位于核心点的半径epsilon内。
• 噪点：所有其他的点，既不是核心点也不是边界点。

DBSCAN 的一个很好的特性是我们不必预先指定多少个簇。 但是，它需要设置其他超参数，例如MinPts的值和半径epsilon

from sklearn.datasets import make_moons
X, y = make_moons(n_samples=400,
noise=0.1,
random_state=1)
plt.scatter(X[:,0], X[:,1])
plt.show()
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=0.2,
min_samples=10,
metric='euclidean')
prediction = db.fit_predict(X)

print("Predicted labels:\n", prediction)

plt.scatter(X[:, 0], X[:, 1], c=prediction, cmap=cm3)


from sklearn.datasets import make_circles

X, y = make_circles(n_samples=1500,
factor=.4,
noise=.05)

plt.scatter(X[:, 0], X[:, 1], c=y);



# 二十一、无监督学习：非线性降维

## 流形学习

PCA 的一个弱点是它无法检测到非线性特征。 已经开发了一组称为流形学习的算法，来解决这个缺陷。流形学习中使用的规范数据集是 S 曲线：

from sklearn.datasets import make_s_curve
X, y = make_s_curve(n_samples=1000)

from mpl_toolkits.mplot3d import Axes3D
ax = plt.axes(projection='3d')

ax.scatter3D(X[:, 0], X[:, 1], X[:, 2], c=y)
ax.view_init(10, -60);


from sklearn.decomposition import PCA
X_pca = PCA(n_components=2).fit_transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y);


from sklearn.manifold import Isomap

iso = Isomap(n_neighbors=15, n_components=2)
X_iso = iso.fit_transform(X)
plt.scatter(X_iso[:, 0], X_iso[:, 1], c=y);


## 数字数据上的流形学习

from sklearn.datasets import load_digits

fig, axes = plt.subplots(2, 5, figsize=(10, 5),
subplot_kw={'xticks':(), 'yticks': ()})
for ax, img in zip(axes.ravel(), digits.images):
ax.imshow(img, interpolation="none", cmap="gray")


# Build a PCA model.
pca = PCA(n_components=2)
pca.fit(digits.data)
# Project the digit data onto the first two principal components.
digits_pca = pca.transform(digits.data)
colors = ["#476A2A", "#7851B8", "#BD3430", "#4A2D4E", "#875525",
          "#A83683", "#4E655E", "#853541", "#3A3120", "#535D8E"]
plt.figure(figsize=(10, 10))
plt.xlim(digits_pca[:, 0].min(), digits_pca[:, 0].max() + 1)
plt.ylim(digits_pca[:, 1].min(), digits_pca[:, 1].max() + 1)
for i in range(len(digits.data)):
    # draw each digit as its class label text instead of a scatter dot
    plt.text(digits_pca[i, 0], digits_pca[i, 1], str(digits.target[i]),
             color=colors[digits.target[i]],
             fontdict={'weight': 'bold', 'size': 9})
plt.xlabel("first principal component")
plt.ylabel("second principal component");


from sklearn.manifold import TSNE
tsne = TSNE(random_state=42)
# use fit_transform: TSNE does not support transforming new data separately
digits_tsne = tsne.fit_transform(digits.data)

plt.figure(figsize=(10, 10))
plt.xlim(digits_tsne[:, 0].min(), digits_tsne[:, 0].max() + 1)
plt.ylim(digits_tsne[:, 1].min(), digits_tsne[:, 1].max() + 1)
for i in range(len(digits.data)):
    # draw each digit as its class label text instead of a scatter dot
    plt.text(digits_tsne[i, 0], digits_tsne[i, 1], str(digits.target[i]),
             color=colors[digits.target[i]],
             fontdict={'weight': 'bold', 'size': 9})


t-SNE 比其他流形学习算法运行时间更长，但结果非常惊人。 请记住，此算法纯粹是无监督的，并且不知道类标签。 它仍然能够很好地分离类别（尽管类 4 和 类 9 已被分成多个分组）。

# %load solutions/21A_isomap_digits.py



# 二十二、无监督学习：异常检测

“异常值是一种数据集中的观测值，似乎与该组数据的其余部分不一致。”-- Johnson 1992

“异常值是一种观测值，与其他观测值有很大差异，引起人们怀疑它是由不同的机制产生的。”-- Outlier/Anomaly Hawkins 1980

## 异常检测设定的类型

• 标签可用于正常和异常数据
• 类似于稀有类挖掘/不平衡分类
• 只有正常的数据可供训练
• 该算法仅学习正常数据
• 没有标签，训练集 = 正常 + 异常数据
• 假设：异常非常罕见
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib
import matplotlib.pyplot as plt


## 生成数据集

from sklearn.datasets import make_blobs

X, y = make_blobs(n_features=2, centers=3, n_samples=500,
random_state=42)

X.shape

plt.figure()
plt.scatter(X[:, 0], X[:, 1])
plt.show()


## 使用密度估计的异常检测

# Modern import path: the private module sklearn.neighbors.kde was
# deprecated and removed; KernelDensity is exposed from sklearn.neighbors.
from sklearn.neighbors import KernelDensity

# Estimate the density with a Gaussian kernel density estimator.
kde = KernelDensity(kernel='gaussian')
kde = kde.fit(X)
kde

kde_X = kde.score_samples(X)
print(kde_X.shape)  # 包含数据的对数似然。 越小样本越罕见

from scipy.stats.mstats import mquantiles
alpha_set = 0.95
tau_kde = mquantiles(kde_X, 1. - alpha_set)

n_samples, n_features = X.shape
X_range = np.zeros((n_features, 2))
X_range[:, 0] = np.min(X, axis=0) - 1.
X_range[:, 1] = np.max(X, axis=0) + 1.

h = 0.1  # step size of the mesh
x_min, x_max = X_range[0]
y_min, y_max = X_range[1]
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))

grid = np.c_[xx.ravel(), yy.ravel()]

Z_kde = kde.score_samples(grid)
Z_kde = Z_kde.reshape(xx.shape)

plt.figure()
c_0 = plt.contour(xx, yy, Z_kde, levels=tau_kde, colors='red', linewidths=3)
plt.clabel(c_0, inline=1, fontsize=15, fmt={tau_kde[0]: str(alpha_set)})
plt.scatter(X[:, 0], X[:, 1])
plt.show()


## 单类 SVM

from sklearn.svm import OneClassSVM

nu = 0.05  # theory says it should be an upper bound of the fraction of outliers
ocsvm = OneClassSVM(kernel='rbf', gamma=0.05, nu=nu)
ocsvm.fit(X)

X_outliers = X[ocsvm.predict(X) == -1]

Z_ocsvm = ocsvm.decision_function(grid)
Z_ocsvm = Z_ocsvm.reshape(xx.shape)

plt.figure()
c_0 = plt.contour(xx, yy, Z_ocsvm, levels=[0], colors='red', linewidths=3)
plt.clabel(c_0, inline=1, fontsize=15, fmt={0: str(alpha_set)})
plt.scatter(X[:, 0], X[:, 1])
plt.scatter(X_outliers[:, 0], X_outliers[:, 1], color='red')
plt.show()


## 支持向量 - 离群点

X_SV = X[ocsvm.support_]
n_SV = len(X_SV)
n_outliers = len(X_outliers)

print('{0:.2f} <= {1:.2f} <= {2:.2f}?'.format(1./n_samples*n_outliers, nu, 1./n_samples*n_SV))


• 绘制单类 SVM 决策函数的级别集，就像我们对真实密度所做的那样。
• 突出支持向量。
plt.figure()
plt.contourf(xx, yy, Z_ocsvm, 10, cmap=plt.cm.Blues_r)
plt.scatter(X[:, 0], X[:, 1], s=1.)
plt.scatter(X_SV[:, 0], X_SV[:, 1], color='orange')
plt.show()


# %load solutions/22_A-anomaly_ocsvm_gamma.py


## 隔离森林

from sklearn.ensemble import IsolationForest

iforest = IsolationForest(n_estimators=300, contamination=0.10)
iforest = iforest.fit(X)

Z_iforest = iforest.decision_function(grid)
Z_iforest = Z_iforest.reshape(xx.shape)

plt.figure()
c_0 = plt.contour(xx, yy, Z_iforest,
levels=[iforest.threshold_],
colors='red', linewidths=3)
plt.clabel(c_0, inline=1, fontsize=15,
fmt={iforest.threshold_: str(alpha_set)})
plt.scatter(X[:, 0], X[:, 1], s=1.)
plt.show()


# %load solutions/22_B-anomaly_iforest_n_trees.py


## 数字数据集上的图解

from sklearn.datasets import load_digits

# BUG FIX: `digits` was used without being loaded.
digits = load_digits()

images = digits.images
labels = digits.target
images.shape

i = 102

plt.figure(figsize=(2, 2))
plt.title('{0}'.format(labels[i]))
plt.axis('off')
plt.imshow(images[i], cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()


n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

data.shape

X = data
y = digits.target

X.shape


X_5 = X[y == 5]

X_5.shape

# Show the first five "5" digits as 8x8 grayscale images.
fig, axes = plt.subplots(1, 5, figsize=(10, 4))
for ax, x in zip(axes, X_5[:5]):
    img = x.reshape(8, 8)
    ax.imshow(img, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.axis('off')

• 让我们使用IsolationForest来查找前 5% 最异常的图像。
• 让我们绘制他们吧！
from sklearn.ensemble import IsolationForest
iforest = IsolationForest(contamination=0.05)
iforest = iforest.fit(X_5)


iforest_X = iforest.decision_function(X_5)
plt.hist(iforest_X);


# The ten most "normal" images: highest isolation-forest scores.
X_strong_inliers = X_5[np.argsort(iforest_X)[-10:]]

fig, axes = plt.subplots(2, 5, figsize=(10, 5))

for i, ax in zip(range(len(X_strong_inliers)), axes.ravel()):
    ax.imshow(X_strong_inliers[i].reshape((8, 8)),
              cmap=plt.cm.gray_r, interpolation='nearest')
    ax.axis('off')


fig, axes = plt.subplots(2, 5, figsize=(10, 5))

# Samples flagged as outliers by the isolation forest (predict == -1).
X_outliers = X_5[iforest.predict(X_5) == -1]

for i, ax in zip(range(len(X_outliers)), axes.ravel()):
    ax.imshow(X_outliers[i].reshape((8, 8)),
              cmap=plt.cm.gray_r, interpolation='nearest')
    ax.axis('off')


# %load solutions/22_C-anomaly_digits.py


# 二十三、核外学习 - 用于语义分析的大规模文本分类

## 可扩展性问题

sklearn.feature_extraction.text.CountVectorizer 和 sklearn.feature_extraction.text.TfidfVectorizer 类受到许多可伸缩性问题的困扰，这些问题都源于 vocabulary_ 属性（Python 字典）的内部使用，它用于将 unicode 字符串特征名称映射为整数特征索引。

• 文本向量化程序的内存使用情况：所有特征的字符串表示形式都加载到内存中
• 文本特征提取的并行化问题：vocabulary_是一个共享状态：复杂的同步和开销
• 不可能进行在线或核外/流式学习：vocabulary_需要从数据中学习：在遍历一次整个数据集之前无法知道其大小

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)

vectorizer.fit([
"The cat sat on the mat.",
])
vectorizer.vocabulary_


transform的时候，使用词汇表来构建出现矩阵：

X = vectorizer.transform([
"The cat sat on the mat.",
"This cat is a nice cat.",
]).toarray()

print(len(vectorizer.vocabulary_))
print(vectorizer.get_feature_names())
print(X)


vectorizer = CountVectorizer(min_df=1)

vectorizer.fit([
"The cat sat on the mat.",
"The quick brown fox jumps over the lazy dog.",
])
vectorizer.vocabulary_


vocabulary_随着训练语料库的大小而（以对数方式）增长。 请注意，我们无法在 2 个文本文档上并行构建词汇表，因为它们共享一些单词，因此需要某种共享数据结构或同步障碍，这对于设定来说很复杂，特别是如果我们想要将处理过程分发给集群的时候。

X = vectorizer.transform([
"The cat sat on the mat.",
"This cat is a nice cat.",
]).toarray()

print(len(vectorizer.vocabulary_))
print(vectorizer.get_feature_names())
print(X)


## IMDB 电影数据集

A. L. Maas, R. E. Daly, P. T. Pham, D. Huang, A. Y. Ng, and C. Potts. Learning Word Vectors for Sentiment Analysis. In the proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pages 142–150, Portland, Oregon, USA, June 2011. Association for Computational Linguistics.

import os

train_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'train')
test_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'test')


from sklearn.datasets import load_files

# BUG FIX: both load_files calls were truncated to their trailing keyword
# argument, so `train` and `test` were never defined.
train = load_files(container_path=train_path,
                   categories=['pos', 'neg'])

test = load_files(container_path=test_path,
                  categories=['pos', 'neg'])


load_files函数将数据集加载到sklearn.datasets.base.Bunch对象中，这些对象是 Python 字典：

train.keys()


import numpy as np

# Print a short summary (size, first document, class distribution)
# for both the training and the test split.
for label, data in zip(('TRAINING', 'TEST'), (train, test)):
    print('\n\n%s' % label)
    print('Number of documents:', len(data['data']))
    print('\n1st document:\n', data['data'][0])
    print('\n1st label:', data['target'][0])
    print('\nClass names:', data['target_names'])
    print('Class count:',
          np.unique(data['target']), ' -> ',
          np.bincount(data['target']))


## 哈希技巧

from sklearn.utils.murmurhash import murmurhash3_bytes_u32

# encode for python 3 compatibility (the hash takes bytes, not str)
for word in "the cat sat on the mat".encode("utf-8").split():
    print("{0} => {1}".format(
        word, murmurhash3_bytes_u32(word, 0) % 2 ** 20))


HashingVectorizer 类是 CountVectorizer（或设置了 use_idf=False 的 TfidfVectorizer 类）的替代品，它在内部使用 murmurhash 哈希函数：

from sklearn.feature_extraction.text import HashingVectorizer

h_vectorizer = HashingVectorizer(encoding='latin-1')
h_vectorizer


analyzer = h_vectorizer.build_analyzer()
analyzer('This is a test sentence.')


docs_train, y_train = train['data'], train['target']
docs_valid, y_valid = test['data'][:12500], test['target'][:12500]
docs_test, y_test = test['data'][12500:], test['target'][12500:]


h_vectorizer.transform(docs_train)


h_vec = HashingVectorizer(encoding='latin-1')
%timeit -n 1 -r 3 h_vec.fit(docs_train, y_train)

count_vec =  CountVectorizer(encoding='latin-1')
%timeit -n 1 -r 3 count_vec.fit(docs_train, y_train)


from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

h_pipeline = Pipeline([
('vec', HashingVectorizer(encoding='latin-1')),
('clf', LogisticRegression(random_state=1)),
])

h_pipeline.fit(docs_train, y_train)

print('Train accuracy', h_pipeline.score(docs_train, y_train))
print('Validation accuracy', h_pipeline.score(docs_valid, y_valid))

import gc

del count_vec
del h_pipeline

gc.collect()


## 核外学习

train_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'train')
train_pos = os.path.join(train_path, 'pos')
train_neg = os.path.join(train_path, 'neg')

fnames = [os.path.join(train_pos, f) for f in os.listdir(train_pos)] +\
[os.path.join(train_neg, f) for f in os.listdir(train_neg)]

fnames[:3]


y_train = np.zeros((len(fnames), ), dtype=int)
y_train[:12500] = 1
np.bincount(y_train)


from sklearn.base import clone

def batch_train(clf, fnames, labels, iterations=25, batchsize=1000, random_seed=1):
    """Out-of-core training loop.

    Repeatedly draws a random batch of document files, vectorizes them with a
    (stateless) HashingVectorizer and updates a clone of `clf` via partial_fit.
    Returns the trained clone; the passed-in `clf` is left untouched.
    """
    vec = HashingVectorizer(encoding='latin-1')
    idx = np.arange(labels.shape[0])
    c_clf = clone(clf)
    rng = np.random.RandomState(seed=random_seed)

    for _ in range(iterations):  # renamed: the outer index shadowed the inner `i`
        rnd_idx = rng.choice(idx, size=batchsize)
        documents = []
        for i in rnd_idx:
            with open(fnames[i], 'r', encoding='latin-1') as f:
                # BUG FIX: the file contents were never read, so the batch
                # was previously built from an empty document list.
                documents.append(f.read())
        X_batch = vec.transform(documents)
        batch_labels = labels[rnd_idx]
        c_clf.partial_fit(X=X_batch,
                          y=batch_labels,
                          classes=[0, 1])

    return c_clf


from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss='log', random_state=1, max_iter=1000)

sgd = batch_train(clf=sgd,
fnames=fnames,
labels=y_train)


vec = HashingVectorizer(encoding='latin-1')
sgd.score(vec.transform(docs_test), y_test)


## 哈希向量化器的限制

• 碰撞会在数据中引入太多噪声并降低预测质量，
• HashingVectorizer不提供“反向文档频率”重新加权（缺少use_idf=True选项）。
• 没有反转映射，和从特征索引中查找特征名称的简单方法。
• 可以通过增加n_features参数来控制冲突问题。

# %load solutions/23_batchtrain.py
`

|
1月前
|

sklearn算法
sklearn算法
26 0
|
1月前
|

30 6
|
1月前
|

22 0
|
6月前
|

sklearn
sklearn 是一个基于 Python 的机器学习库，它提供了大量的机器学习算法和工具，旨在帮助数据科学家和机器学习工程师快速、简单地实现和测试机器学习模型。
60 2
|

sklearn相关3
sklearn介绍
92 1
|

Sklearn介绍2
Sklearn介绍
60 0
|
API Python
Sklearn介绍3
Sklearn介绍
78 0
|

Sklearn介绍1
Sklearn介绍
131 0
|

87 0
|

sklearn：sklearn.GridSearchCV函数的简介、使用方法之详细攻略
sklearn：sklearn.GridSearchCV函数的简介、使用方法之详细攻略
194 0