EX 4: Classifier Comparison

Classification / Example 4: Classifier comparison

The main goals of this example:
    Compare a variety of classifiers
    Use visualization to observe each classifier's decision boundary and decision regions

(1) Import functions and prepare the classifiers

    Import the classifiers and store them in a list
    Note that sklearn.discriminant_analysis requires scikit-learn 0.17 or later. Also, in scikit-learn 0.18 and later, train_test_split has moved from sklearn.cross_validation to sklearn.model_selection, so adjust that import for newer versions (a quick version check is sketched below)
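If you are not sure which version is installed, a minimal sketch like the following can confirm it before running the example (sklearn.__version__ is a standard attribute):

```python
import sklearn

# Print the installed scikit-learn version; discriminant_analysis needs 0.17+,
# and sklearn.model_selection (the newer home of train_test_split) needs 0.18+.
print(sklearn.__version__)
```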
```python
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Ana.",
         "Quadratic Discriminant Ana."]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]
```

(2) Prepare the test data

    Use make_classification to generate classification data; n_features=2 means there are two features in total, and n_informative=2 means both of them are informative (i.e. carry class information)
    The resulting X is a 100 x 2 matrix and y is a vector of 100 elements whose values are only 0 or 1, representing the two classes (see the shape check after the code below)
    Add a moderate amount of noise with X += 2 * rng.uniform(size=X.shape), then name the (X, y) dataset linearly_separable
    Finally, use make_moons() and make_circles() to generate moon-shaped and circular data distributions, and store all three datasets in the datasets variable
```python
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]
```
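As a quick sanity check of the shapes described above, a short sketch like this (reusing the X, y, and datasets variables just defined) can be run:

```python
# X from make_classification defaults to 100 samples with the 2 requested features
print(X.shape)        # (100, 2)
# y holds only the two class labels 0 and 1
print(np.unique(y))   # [0 1]
# datasets holds the moons, circles, and linearly separable sets
print(len(datasets))  # 3
```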

(3) Test the classifiers and plot the results

The following code contains two for loops: the outer loop iterates over the three datasets, while the inner loop iterates over all of the classifiers. For brevity, the code is abridged as follows:

1. Outer loop (dataset loop): first plot the data distribution, then pass the data on to the classifier loop
```python
for ds in datasets:
    X, y = ds
    # Rescale the feature values to a standard range
    X = StandardScaler().fit_transform(X)
    # Split the data into a training set and a test set with train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    # Build a mesh grid to test the classifier over the whole region; EX 3 explains this in detail
    xx, yy = np.meshgrid(..........)  # omitted
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # Plot the testing points; alpha=0.6 draws them a bit "lighter"
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
```
2. Inner loop (classifier loop): evaluate the classification accuracy, then draw the decision boundary and decision regions

```python
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    if hasattr(clf, "decision_function"):
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    else:
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
```
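The hasattr branch above is needed because not every estimator exposes decision_function; those that do not, such as GaussianNB, fall back to predict_proba. A minimal sketch of the difference, using two of the classifiers defined earlier:

```python
# SVC provides a decision_function (signed distance to the separating surface)
print(hasattr(SVC(gamma=2, C=1), "decision_function"))  # True
# GaussianNB has no decision_function, so predict_proba is used instead
print(hasattr(GaussianNB(), "decision_function"))       # False
```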
For display purposes, the inner loop of the original code has been changed to for name, clf in zip(names[0:4], classifiers[0:4]): so that only the first four classifiers are run.
```python
%matplotlib inline

figure = plt.figure(figsize=(30,20), dpi=300)
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), (len(classifiers) + 1)//2, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names[0:4], classifiers[0:4]):
        ax = plt.subplot(len(datasets), (len(classifiers) + 1)//2, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name, fontsize=28)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=30, horizontalalignment='right')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
```
(Output figure: input data and decision regions of the first four classifiers for each of the three datasets, with the test accuracy shown in the lower-right corner of each panel.)
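If you prefer the accuracy scores as plain text rather than reading them off the figure, a small loop like the following sketch (reusing names, classifiers, and datasets from above) prints the score of all nine classifiers on each dataset:

```python
for ds_idx, ds in enumerate(datasets):
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        # clf.score returns the mean accuracy on the held-out test split
        print("dataset %d, %-30s %.2f" % (ds_idx, name, clf.score(X_test, y_test)))
```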

(4) Source code listing

Python source code: plot_classifier_comparison.py
```python
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis",
         "Quadratic Discriminant Analysis"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
```