# 5.8 决策树和随机森林

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

### 创建决策树

from sklearn.datasets import make_blobs

# 300 labelled sample points generated around 4 centers (fixed seed).
X, y = make_blobs(
    n_samples=300,
    centers=4,
    random_state=0,
    cluster_std=1.0,
)
# Scatter the first two feature columns, coloured by label.
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow');

from sklearn.tree import DecisionTreeClassifier

# Build the classifier, then fit it on the full data set.
tree = DecisionTreeClassifier()
tree.fit(X, y)

def visualize_classifier(model, X, y, ax=None, cmap='rainbow'):
    """Fit ``model`` on ``(X, y)`` and draw its decision regions.

    The training points are scattered on ``ax``, the model is fitted on
    the same data, and its predictions over a 200 x 200 grid spanning the
    resulting axes limits are drawn as translucent filled contours behind
    the points.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this function.
    X : array
        Only columns 0 and 1 are plotted; the prediction grid has exactly
        two columns.
    y : array
        Class labels. The contour levels are ``-0.5, 0.5, ...`` so this
        presumably expects integer labels ``0 .. n_classes - 1``.
    ax : Axes, optional
        Target axes; the current axes (``plt.gca()``) when None.
    cmap : str
        Colormap shared by the points and the regions.

    Returns
    -------
    None
    """
    if ax is None:
        ax = plt.gca()

    # Plot the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cmap,
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
    # Capture the limits chosen for the points so the contour plot
    # cannot move them (they are restored at the end).
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # fit the estimator
    model.fit(X, y)
    xx, yy = np.meshgrid(np.linspace(*xlim, num=200),
                         np.linspace(*ylim, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Create a color plot with the results: one band per class, with the
    # band edges placed half-way between consecutive label values.
    n_classes = len(np.unique(y))
    ax.contourf(xx, yy, Z, alpha=0.3,
                levels=np.arange(n_classes + 1) - 0.5,
                cmap=cmap, clim=(y.min(), y.max()),
                zorder=1)

    ax.set(xlim=xlim, ylim=ylim)

# Fit a new decision tree on (X, y) and draw its decision regions.
visualize_classifier(DecisionTreeClassifier(), X, y)

# helpers_05_08 is found in the online appendix
# NOTE(review): plot_tree_interactive is not defined in this file; judging by
# its name it presumably shows an interactive view of the tree -- confirm
# against the helper module.
import helpers_05_08
helpers_05_08.plot_tree_interactive(X, y);

### 决策树和过拟合

# helpers_05_08 is found in the online appendix
# NOTE(review): randomized_tree_interactive is not defined in this file;
# its behaviour has to be checked in the helper module.
import helpers_05_08
helpers_05_08.randomized_tree_interactive(X, y)

## 估计器的组合：随机森林

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Bag 100 decision trees; max_samples=0.8 is passed straight through to
# BaggingClassifier, and the seed is fixed for repeatable results.
tree = DecisionTreeClassifier()
bag = BaggingClassifier(
    tree,
    n_estimators=100,
    max_samples=0.8,
    random_state=1,
)

# Fit the ensemble, then plot its decision regions (the plotting helper
# fits the model on the same data again before predicting).
bag.fit(X, y)
visualize_classifier(bag, X, y)

from sklearn.ensemble import RandomForestClassifier

# A 100-tree random forest with a fixed seed; visualize_classifier fits it
# on (X, y) before drawing the decision regions.
model = RandomForestClassifier(n_estimators=100, random_state=0)
visualize_classifier(model, X, y);

## 随机森林回归

# Seeded generator shared by the sample positions and by the noise drawn
# inside model(), so repeated runs of the whole cell are reproducible.
rng = np.random.RandomState(42)
x = 10 * rng.rand(200)


def model(x, sigma=0.3):
    """Return a noisy sum of a fast and a slow sine wave evaluated at ``x``.

    Parameters
    ----------
    x : array
        Sample positions; ``len(x)`` noise values are drawn, one per entry.
    sigma : float
        Scale applied to the standard-normal noise.

    Returns
    -------
    array
        ``sin(0.5 * x) + sin(5 * x) + sigma * noise``.

    Notes
    -----
    The noise comes from the module-level ``rng``, so every call advances
    that generator's state.
    """
    fast_oscillation = np.sin(5 * x)
    slow_oscillation = np.sin(0.5 * x)
    noise = sigma * rng.randn(len(x))

    return slow_oscillation + fast_oscillation + noise

# Evaluate the noisy signal at the sample positions. This rebinds the
# module-level name y.
y = model(x)
# The third positional argument (0.3) is the error-bar size; it equals the
# default sigma of model().
plt.errorbar(x, y, 0.3, fmt='o');

## 示例：随机森林数字分类

from sklearn.datasets import load_digits

# Bind the loaded dataset to `digits`; the code below reads digits.images,
# digits.target and digits.data.
digits = load_digits()
digits.keys()
# dict_keys(['target', 'data', 'target_names', 'DESCR', 'images'])

# set up the figure
fig = plt.figure(figsize=(6, 6))  # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

# plot the digits: each image is 8x8 pixels
# The first 64 images fill an 8 x 8 grid of tick-less subplots.
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')

    # label the image with the target value
    ax.text(0, 7, str(digits.target[i]))

# train_test_split is provided by sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split

# Hold out part of the digits for evaluation (seeded split).
Xtrain, Xtest, ytrain, ytest = train_test_split(digits.data, digits.target,
                                                random_state=0)
# The forest itself is not seeded, so scores can vary slightly between runs.
model = RandomForestClassifier(n_estimators=1000)
model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)

from sklearn import metrics

# classification_report takes (y_true, y_pred): true labels come first.
print(metrics.classification_report(ytest, ypred))
# The table below is the output as recorded with the arguments in the
# order (ypred, ytest). With the order used above, the precision and recall
# columns trade places and `support` counts true labels per class rather
# than predicted ones.
#
#              precision    recall  f1-score   support
#
# 0       1.00      0.97      0.99        38
# 1       1.00      0.98      0.99        44
# 2       0.95      1.00      0.98        42
# 3       0.98      0.96      0.97        46
# 4       0.97      1.00      0.99        37
# 5       0.98      0.96      0.97        49
# 6       1.00      1.00      1.00        52
# 7       1.00      0.96      0.98        50
# 8       0.94      0.98      0.96        46
# 9       0.96      0.98      0.97        46
#
# avg / total       0.98      0.98      0.98       450

from sklearn.metrics import confusion_matrix
# Confusion matrix of true test labels against the model's predictions.
mat = confusion_matrix(ytest, ypred)
# The matrix is transposed before plotting so that the x axis carries the
# true label and the y axis the predicted label, as labelled below.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');

## 随机森林总结

• 训练和预测都非常快，因为底层决策树简单。此外，两个任务都可以直接并行化，因为各个树是完全独立的实体。
• 多个树提供了概率分类：估计器之间的多数表决提供了概率估计（在 Scikit-Learn 中使用 `predict_proba()` 方法来访问）。
• 非参数模型是非常灵活的，因此可以在其他估计器拟合不足的任务上表现良好。

|
9天前
|

GitHub高赞！速通Python编程基础手册，被玩出花了！

16 2
|
4天前
|

python常用算法（5）——树，二叉树与AVL树（一）
python常用算法（5）——树，二叉树与AVL树
8 1
|
6天前
|

Python中的决策树算法探索
Python中的决策树算法探索
18 2
|
7天前
|
Python
GitHub爆赞！终于有大佬把《Python学习手册》学习笔记分享出来了

20 3
|
8天前
|

Python在数据科学中的应用与前景

|
9天前
|
Python
GitHub爆赞！终于有大佬把《Python学习手册》学习笔记分享出来了

14 2
|
13天前
|
IDE 开发工具 C++
Python-turtle-樱花树
Python-turtle-樱花树
14 3
|
3天前
|

Python梯度提升决策树的方法示例

7 0
|
4天前
|

python常用算法（5）——树，二叉树与AVL树（三）
python常用算法（5）——树，二叉树与AVL树
10 0
|
4天前
|

python常用算法（5）——树，二叉树与AVL树（二）
python常用算法（5）——树，二叉树与AVL树
6 0