http://scikit-learn.org/stable/auto_examples/classification/plot_lda_qda.html
This example imports the following packages; their main roles are:

1. `scipy.linalg`: linear-algebra routines; here we mainly use `linalg.eigh` for eigenvalue problems.
2. `matplotlib.colors`: handles the color mapping used when plotting.
3. `LinearDiscriminantAnalysis`: the linear discriminant algorithm.
4. `QuadraticDiscriminantAnalysis`: the quadratic discriminant algorithm.
```python
%matplotlib inline
from scipy import linalg
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
```
Next we set up a linearly varying colormap with `LinearSegmentedColormap(name, segmentdata)`, which by default returns a mapping from 256 values to colors. It is configured through a dict `segmentdata` with three entries. Taking `'red': [(0, 1, 1), (1, 0.7, 0.7)]` as an example: as the input value runs from 0 to 1, the red channel varies linearly from 1 down to 0.7.
```python
cmap = colors.LinearSegmentedColormap(
    'red_blue_classes',
    {'red':   [(0, 1, 1), (1, 0.7, 0.7)],
     'green': [(0, 0.7, 0.7), (1, 0.7, 0.7)],
     'blue':  [(0, 0.7, 0.7), (1, 1, 1)]})
plt.cm.register_cmap(cmap=cmap)
```
We can observe this with the following code, which shows how the RGB values change when the inputs are `np.arange(0, 1.1, 0.1)`, i.e. 0, 0.1, ..., 1.0.
```python
values = np.arange(0, 1.1, 0.1)
cmap_values = mpl.cm.get_cmap('red_blue_classes')(values)

import pandas as pd
pd.set_option('display.precision', 2)
df = pd.DataFrame(np.hstack((values.reshape(11, 1), cmap_values)))
df.columns = ['Value', 'R', 'G', 'B', 'Alpha']
print(df)
```
```
    Value    R    G    B  Alpha
0     0.0  1.0  0.7  0.7      1
1     0.1  1.0  0.7  0.7      1
2     0.2  0.9  0.7  0.8      1
3     0.3  0.9  0.7  0.8      1
4     0.4  0.9  0.7  0.8      1
5     0.5  0.8  0.7  0.9      1
6     0.6  0.8  0.7  0.9      1
7     0.7  0.8  0.7  0.9      1
8     0.8  0.8  0.7  0.9      1
9     0.9  0.7  0.7  1.0      1
10    1.0  0.7  0.7  1.0      1
```
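For reference, the same ramp can be built more compactly with `LinearSegmentedColormap.from_list`, which interpolates between a list of RGB endpoints. This is a minimal sketch, not part of the original example; `red_blue_classes2` is just an illustrative name:

```python
# Minimal sketch (not in the original example): an equivalent linear ramp
# built from its two RGB endpoints; 'red_blue_classes2' is an illustrative name.
cmap2 = colors.LinearSegmentedColormap.from_list(
    'red_blue_classes2', [(1.0, 0.7, 0.7), (0.7, 0.7, 1.0)])
print(cmap2(0.0), cmap2(1.0))  # RGBA endpoints match the first and last table rows
```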
Next we generate two datasets. Each dataset has 600 samples with 2 features (`X`: 600x2) and 2 classes (`y`: 600, with the first 300 elements 0 and the rest 1):

1. `dataset_fixed_cov()`: the features of the 2 classes share the same covariance.
2. `dataset_cov()`: the features of the 2 classes have different covariances.

The difference lies in how `X` is generated: `np.dot(np.random.randn(n, dim), C)` versus `np.dot(np.random.randn(n, dim), C.T)`. `np.dot(np.random.randn(n, dim), C)` produces a 300x2 matrix whose random spread is controlled by the matrix `C`. In `dataset_fixed_cov()`, both the first and the last 300 samples are shaped by the same `C`; a quick numerical check follows the code below. In the result figure at the bottom, the top row (same covariance) shows the red points (class 0) and the blue points (class 1) with similar distribution shapes, while in the bottom row (different covariances) the shapes differ. In each panel the horizontal and vertical axes are the first and second feature, respectively. Readers may try decreasing the number 0.83, or changing `C.T` to `C`, and see how the resulting figure changes.
```python
def dataset_fixed_cov():
    '''Generate 2 Gaussians samples with the same covariance matrix'''
    n, dim = 300, 2
    np.random.seed(0)
    C = np.array([[0., -0.23], [0.83, .23]])
    X = np.r_[np.dot(np.random.randn(n, dim), C),
              np.dot(np.random.randn(n, dim), C) + np.array([1, 1])]  # + np.array([1, 1]) shifts the second class away from the first
    y = np.hstack((np.zeros(n), np.ones(n)))  # 300 zeros followed by 300 ones
    return X, y


def dataset_cov():
    '''Generate 2 Gaussians samples with different covariance matrices'''
    n, dim = 300, 2
    np.random.seed(0)
    C = np.array([[0., -1.], [2.5, .7]]) * 2.
    X = np.r_[np.dot(np.random.randn(n, dim), C),
              np.dot(np.random.randn(n, dim), C.T) + np.array([1, 4])]
    y = np.hstack((np.zeros(n), np.ones(n)))
    return X, y
```
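The check promised above: estimating each class's empirical covariance with `np.cov` should give nearly identical matrices for `dataset_fixed_cov()` and clearly different ones for `dataset_cov()`. A minimal sketch, assuming the two generators above have been defined:

```python
# Minimal sanity check: compare the empirical covariance of each class.
for make_data in (dataset_fixed_cov, dataset_cov):
    X, y = make_data()
    print(make_data.__name__)
    print(np.cov(X[y == 0].T))  # covariance of class 0 (np.cov expects features in rows)
    print(np.cov(X[y == 1].T))  # similar for fixed cov, different for varying cov
```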
The `plot_data` function below proceeds in three steps:

1. Find the correctly classified points (labeled true positive in the code, where `y == y_pred`) and the misclassified ones.
2. Draw the points classified as 0 and 1 in red and blue respectively, with misclassified points in dark red and dark blue.
3. Use `lda.predict_proba()` to draw the class probability distribution (see example three).

(The function has been slightly adjusted for display in the IPython notebook environment.)
```python
def plot_data(lda, X, y, y_pred, fig_index):
    splot = plt.subplot(2, 2, fig_index)
    if fig_index == 1:
        plt.title('Linear Discriminant Analysis', fontsize=28)
        plt.ylabel('Data with fixed covariance', fontsize=28)
    elif fig_index == 2:
        plt.title('Quadratic Discriminant Analysis', fontsize=28)
    elif fig_index == 3:
        plt.ylabel('Data with varying covariances', fontsize=28)

    # Step 1: find the correctly and incorrectly classified points
    tp = (y == y_pred)  # True Positive
    tp0, tp1 = tp[y == 0], tp[y == 1]  # tp0 marks the class-0 samples classified correctly
    X0, X1 = X[y == 0], X[y == 1]
    X0_tp, X0_fp = X0[tp0], X0[~tp0]
    X1_tp, X1_fp = X1[tp1], X1[~tp1]

    # Step 2: draw the classes in red and blue, misclassified points in dark red and dark blue
    # class 0: dots
    plt.plot(X0_tp[:, 0], X0_tp[:, 1], 'o', color='red')
    plt.plot(X0_fp[:, 0], X0_fp[:, 1], '.', color='#990000')  # dark red

    # class 1: dots
    plt.plot(X1_tp[:, 0], X1_tp[:, 1], 'o', color='blue')
    plt.plot(X1_fp[:, 0], X1_fp[:, 1], '.', color='#000099')  # dark blue

    # Step 3: draw the class probability distribution (see example three)
    # class 0 and 1 : areas
    nx, ny = 200, 100
    x_min, x_max = plt.xlim()
    y_min, y_max = plt.ylim()
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                         np.linspace(y_min, y_max, ny))
    Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)
    plt.pcolormesh(xx, yy, Z, cmap='red_blue_classes',
                   norm=colors.Normalize(0., 1.))
    plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='k')

    # means
    plt.plot(lda.means_[0][0], lda.means_[0][1],
             'o', color='black', markersize=10)
    plt.plot(lda.means_[1][0], lda.means_[1][1],
             'o', color='black', markersize=10)
    return splot
```
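Step 3 relies on the fact that, for a binary LDA model, `predict_proba` is the logistic of the linear decision function, so the 0.5 contour drawn above is exactly the decision boundary. A minimal sketch of that relation, using a model fitted the same way as in the main loop below:

```python
# Minimal sketch: for binary LDA, P(class 1 | x) = sigmoid(w . x + b),
# so predict_proba = 0.5 exactly on the line w . x + b = 0.
X, y = dataset_fixed_cov()
lda_check = LinearDiscriminantAnalysis(solver="svd").fit(X, y)
w, b = lda_check.coef_[0], lda_check.intercept_[0]
x0 = X[0]
p1 = lda_check.predict_proba(x0.reshape(1, -1))[0, 1]
print(p1, 1.0 / (1.0 + np.exp(-(np.dot(w, x0) + b))))  # the two values agree
```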
```python
def plot_ellipse(splot, mean, cov, color):
    v, w = linalg.eigh(cov)
    u = w[0] / linalg.norm(w[0])
    angle = np.arctan(u[1] / u[0])
    angle = 180 * angle / np.pi  # convert to degrees
    # filled Gaussian at 2 standard deviation
    ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
                              180 + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(0.5)
    splot.add_artist(ell)
    splot.set_xticks(())
    splot.set_yticks(())
```
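The ellipse geometry comes from the eigendecomposition of the covariance matrix: `linalg.eigh` returns the variances `v` along the principal axes (so `2 * v ** 0.5` spans two standard deviations) together with the eigenvectors `w` that orient the ellipse. A minimal sketch with a hand-picked symmetric matrix:

```python
# Minimal sketch: eigh factors a symmetric covariance as cov = w @ diag(v) @ w.T;
# sqrt(v) gives the standard deviation along each principal axis.
cov = np.array([[2.0, 0.6],
                [0.6, 1.0]])
v, w = linalg.eigh(cov)
print(v ** 0.5)                                 # axis standard deviations
print(np.allclose(w @ np.diag(v) @ w.T, cov))   # True: the factorization holds
```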
```python
def plot_lda_cov(lda, splot):
    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')


def plot_qda_cov(qda, splot):
    plot_ellipse(splot, qda.means_[0], qda.covariances_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariances_[1], 'blue')

###############################################################################
figure = plt.figure(figsize=(30, 20), dpi=300)
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    # (store_covariances is the old spelling; later scikit-learn versions use store_covariance)
    qda = QuadraticDiscriminantAnalysis(store_covariances=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis',
             fontsize=28)
plt.show()
```
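As a numerical companion to the figure, it can help to compare the training accuracy of the two models on both datasets; QDA should only clearly outperform LDA on the varying-covariance data. A minimal sketch (the exact numbers depend on the fixed random seed):

```python
# Minimal sketch: training accuracy of LDA vs QDA on both datasets.
for name, (X, y) in [('fixed cov', dataset_fixed_cov()),
                     ('varying cov', dataset_cov())]:
    lda_acc = LinearDiscriminantAnalysis().fit(X, y).score(X, y)
    qda_acc = QuadraticDiscriminantAnalysis().fit(X, y).score(X, y)
    print('%-12s LDA: %.3f  QDA: %.3f' % (name, lda_acc, qda_acc))
```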
Python source code: plot_lda_qda.py
http://scikit-learn.org/stable/_downloads/plot_lda_qda.py
```python
print(__doc__)

from scipy import linalg
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

###############################################################################
# colormap
cmap = colors.LinearSegmentedColormap(
    'red_blue_classes',
    {'red': [(0, 1, 1), (1, 0.7, 0.7)],
     'green': [(0, 0.7, 0.7), (1, 0.7, 0.7)],
     'blue': [(0, 0.7, 0.7), (1, 1, 1)]})
plt.cm.register_cmap(cmap=cmap)

###############################################################################
# generate datasets
def dataset_fixed_cov():
    '''Generate 2 Gaussians samples with the same covariance matrix'''
    n, dim = 300, 2
    np.random.seed(0)
    C = np.array([[0., -0.23], [0.83, .23]])
    X = np.r_[np.dot(np.random.randn(n, dim), C),
              np.dot(np.random.randn(n, dim), C) + np.array([1, 1])]
    y = np.hstack((np.zeros(n), np.ones(n)))
    return X, y


def dataset_cov():
    '''Generate 2 Gaussians samples with different covariance matrices'''
    n, dim = 300, 2
    np.random.seed(0)
    C = np.array([[0., -1.], [2.5, .7]]) * 2.
    X = np.r_[np.dot(np.random.randn(n, dim), C),
              np.dot(np.random.randn(n, dim), C.T) + np.array([1, 4])]
    y = np.hstack((np.zeros(n), np.ones(n)))
    return X, y

###############################################################################
# plot functions
def plot_data(lda, X, y, y_pred, fig_index):
    splot = plt.subplot(2, 2, fig_index)
    if fig_index == 1:
        plt.title('Linear Discriminant Analysis')
        plt.ylabel('Data with fixed covariance')
    elif fig_index == 2:
        plt.title('Quadratic Discriminant Analysis')
    elif fig_index == 3:
        plt.ylabel('Data with varying covariances')

    tp = (y == y_pred)  # True Positive
    tp0, tp1 = tp[y == 0], tp[y == 1]
    X0, X1 = X[y == 0], X[y == 1]
    X0_tp, X0_fp = X0[tp0], X0[~tp0]
    X1_tp, X1_fp = X1[tp1], X1[~tp1]

    # class 0: dots
    plt.plot(X0_tp[:, 0], X0_tp[:, 1], 'o', color='red')
    plt.plot(X0_fp[:, 0], X0_fp[:, 1], '.', color='#990000')  # dark red

    # class 1: dots
    plt.plot(X1_tp[:, 0], X1_tp[:, 1], 'o', color='blue')
    plt.plot(X1_fp[:, 0], X1_fp[:, 1], '.', color='#000099')  # dark blue

    # class 0 and 1 : areas
    nx, ny = 200, 100
    x_min, x_max = plt.xlim()
    y_min, y_max = plt.ylim()
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                         np.linspace(y_min, y_max, ny))
    Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)
    plt.pcolormesh(xx, yy, Z, cmap='red_blue_classes',
                   norm=colors.Normalize(0., 1.))
    plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='k')

    # means
    plt.plot(lda.means_[0][0], lda.means_[0][1],
             'o', color='black', markersize=10)
    plt.plot(lda.means_[1][0], lda.means_[1][1],
             'o', color='black', markersize=10)
    return splot


def plot_ellipse(splot, mean, cov, color):
    v, w = linalg.eigh(cov)
    u = w[0] / linalg.norm(w[0])
    angle = np.arctan(u[1] / u[0])
    angle = 180 * angle / np.pi  # convert to degrees
    # filled Gaussian at 2 standard deviation
    ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
                              180 + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(0.5)
    splot.add_artist(ell)
    splot.set_xticks(())
    splot.set_yticks(())


def plot_lda_cov(lda, splot):
    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')


def plot_qda_cov(qda, splot):
    plot_ellipse(splot, qda.means_[0], qda.covariances_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariances_[1], 'blue')

###############################################################################
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis(store_covariances=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
plt.show()
```