Cross Decomposition

Cross decomposition / Example 1: Compare cross decomposition methods

The purpose of this example is to compare several cross decomposition methods. Cross decomposition uses latent variable modeling to find the principal correlated components between two matrices. In contrast to manifest variables, i.e., ordinary observed variables, a latent variable is an unobserved factor that may influence what is observed in an experiment.
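As a minimal sketch of the distinction (the variable names here are illustrative, not part of the example): each manifest variable is a direct measurement, and the latent variable is the hidden factor shared by several measurements.

import numpy as np

rng = np.random.RandomState(0)                 # fixed seed for reproducibility
latent = rng.normal(size=1000)                 # hidden factor, never observed directly
manifest_a = latent + rng.normal(size=1000)    # first observable measurement
manifest_b = latent + rng.normal(size=1000)    # second measurement, same hidden driver
# The shared latent factor surfaces as correlation between the measurements
# (theoretically 1 / (1 + 1) = 0.5, since signal and noise both have unit variance).
print(np.corrcoef(manifest_a, manifest_b)[0, 1])   # approximately 0.5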

(1) Import the libraries and generate the dataset

The imported libraries and classes are as follows:

1. numpy: numerical arrays and random number generation
2. matplotlib.pyplot: used to plot the figures
3. sklearn.cross_decomposition: the cross decomposition estimators
   - PLSCanonical: canonical Partial Least Squares
   - PLSRegression: Partial Least Squares regression
   - CCA: Canonical Correlation Analysis
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA

# First generate 500 samples of normally distributed data
n = 500
# There are two latent variables:
l1 = np.random.normal(size=n)
l2 = np.random.normal(size=n)

# np.array([l1, l1, l2, l2]).shape = (4, 500)
# latents is a 500 x 4 matrix
latents = np.array([l1, l1, l2, l2]).T

# Next, add random noise to form the X and Y matrices
X = latents + np.random.normal(size=4 * n).reshape((n, 4))
Y = latents + np.random.normal(size=4 * n).reshape((n, 4))

X_train = X[:n // 2]
Y_train = Y[:n // 2]
X_test = X[n // 2:]
Y_test = Y[n // 2:]

# numpy.corrcoef(x, y=None) computes Pearson product-moment correlation coefficients
print("Corr(X)")
print(np.round(np.corrcoef(X.T), 2))
print("Corr(Y)")
print(np.round(np.corrcoef(Y.T), 2))
Corr(X)
[[ 1.    0.48  0.02  0.  ]
 [ 0.48  1.    0.02 -0.02]
 [ 0.02  0.02  1.    0.51]
 [ 0.   -0.02  0.51  1.  ]]
Corr(Y)
[[ 1.    0.49 -0.01  0.05]
 [ 0.49  1.   -0.06  0.06]
 [-0.01 -0.06  1.    0.53]
 [ 0.05  0.06  0.53  1.  ]]
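The block structure of these matrices reflects how the data was generated: the first two columns of X share the latent variable l1 and the last two share l2, so within-block correlations come out near the theoretical value Var(l) / (Var(l) + Var(noise)) = 0.5, while across-block correlations are near zero. A quick check (a sketch assuming X from the code above is still in scope):

# Within-block pair shares l1; across-block pair has no common latent factor
within_block = np.corrcoef(X[:, 0], X[:, 1])[0, 1]   # expected near 0.5
across_block = np.corrcoef(X[:, 0], X[:, 2])[0, 1]   # expected near 0
print("within-block corr: %.2f, across-block corr: %.2f"
      % (within_block, across_block))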
(2) Canonical (symmetric) PLS

# Transform data
# ~~~~~~~~~~~~~~
plsca = PLSCanonical(n_components=2)
plsca.fit(X_train, Y_train)
X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
X_test_r, Y_test_r = plsca.transform(X_test, Y_test)

# Scatter plot of scores
# ~~~~~~~~~~~~~~~~~~~~~~
# 1) On the diagonal: plot X vs Y scores for each component
plt.figure(figsize=(12, 8), dpi=600)
plt.subplot(221)
plt.plot(X_train_r[:, 0], Y_train_r[:, 0], "ob", label="train")
plt.plot(X_test_r[:, 0], Y_test_r[:, 0], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 1: X vs Y (test corr = %.2f)' %
          np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc="best")

plt.subplot(224)
plt.plot(X_train_r[:, 1], Y_train_r[:, 1], "ob", label="train")
plt.plot(X_test_r[:, 1], Y_test_r[:, 1], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 2: X vs Y (test corr = %.2f)' %
          np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc="best")

# 2) Off the diagonal: plot components 1 vs 2 for X and Y
plt.subplot(222)
plt.plot(X_train_r[:, 0], X_train_r[:, 1], "*b", label="train")
plt.plot(X_test_r[:, 0], X_test_r[:, 1], "*r", label="test")
plt.xlabel("X comp. 1")
plt.ylabel("X comp. 2")
plt.title('X comp. 1 vs X comp. 2 (test corr = %.2f)'
          % np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1])
plt.legend(loc="best")
plt.xticks(())
plt.yticks(())

plt.subplot(223)
plt.plot(Y_train_r[:, 0], Y_train_r[:, 1], "*b", label="train")
plt.plot(Y_test_r[:, 0], Y_test_r[:, 1], "*r", label="test")
plt.xlabel("Y comp. 1")
plt.ylabel("Y comp. 2")
plt.title('Y comp. 1 vs Y comp. 2 (test corr = %.2f)'
          % np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
plt.legend(loc="best")
plt.xticks(())
plt.yticks(())
plt.show()
(Figure: four scatter plots of the PLSCanonical scores. Diagonal panels: X vs Y scores for components 1 and 2; off-diagonal panels: component 1 vs component 2 within X and within Y, train and test points shown separately.)
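The numbers in the panel titles can also be printed directly rather than read off the figure; a small sketch reusing the transformed scores from the plsca block above:

# Correlation between X scores and Y scores, per component, train vs. test;
# PLSCanonical extracts directions along which the two views stay correlated.
for k in range(2):
    train_corr = np.corrcoef(X_train_r[:, k], Y_train_r[:, k])[0, 1]
    test_corr = np.corrcoef(X_test_r[:, k], Y_test_r[:, k])[0, 1]
    print("component %d: train corr = %.2f, test corr = %.2f"
          % (k + 1, train_corr, test_corr))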
(3) PLS regression and CCA

###############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

###############################################################################
# CCA (PLS mode B with symmetric deflation)

cca = CCA(n_components=2)
cca.fit(X_train, Y_train)
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)
True B (such that: Y = XB + Err)
[[1 1 1]
 [2 2 2]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
Estimated B
[[ 1.   1.   1. ]
 [ 2.   1.9  2. ]
 [ 0.   0.   0. ]
 [ 0.   0.   0. ]
 [ 0.   0.   0. ]
 [ 0.   0.  -0.1]
 [ 0.   0.   0. ]
 [ 0.   0.   0.1]
 [ 0.   0.   0. ]
 [ 0.   0.   0. ]]
Estimated betas
[[ 1. ]
 [ 2. ]
 [ 0. ]
 [ 0. ]
 [ 0. ]
 [ 0. ]
 [ 0. ]
 [-0.1]
 [ 0. ]
 [ 0. ]]
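The estimate recovers the two informative rows of B (coefficients 1 and 2) and shrinks the remaining rows toward zero. As an additional sanity check one can look at the prediction error; a minimal sketch, assuming it is run right after pls2.fit(X, Y) in the PLS2 block, before X is reassigned for the PLS1 part:

# Training MSE of the fitted PLS2 model; since the additive noise has unit
# variance, a well-fit linear model should give an MSE close to 1 per response.
Y_pred = pls2.predict(X)
print("training MSE: %.2f" % np.mean((Y - Y_pred) ** 2))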

(4) Complete code

Python source code: plot_compare_cross_decomposition.py
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA

###############################################################################
# Dataset based latent variables model

n = 500
# 2 latent variables:
l1 = np.random.normal(size=n)
l2 = np.random.normal(size=n)

latents = np.array([l1, l1, l2, l2]).T
X = latents + np.random.normal(size=4 * n).reshape((n, 4))
Y = latents + np.random.normal(size=4 * n).reshape((n, 4))

X_train = X[:n // 2]
Y_train = Y[:n // 2]
X_test = X[n // 2:]
Y_test = Y[n // 2:]

print("Corr(X)")
print(np.round(np.corrcoef(X.T), 2))
print("Corr(Y)")
print(np.round(np.corrcoef(Y.T), 2))

###############################################################################
# Canonical (symmetric) PLS

# Transform data
# ~~~~~~~~~~~~~~
plsca = PLSCanonical(n_components=2)
plsca.fit(X_train, Y_train)
X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
X_test_r, Y_test_r = plsca.transform(X_test, Y_test)

# Scatter plot of scores
# ~~~~~~~~~~~~~~~~~~~~~~
# 1) On the diagonal: plot X vs Y scores for each component
plt.figure(figsize=(12, 8))
plt.subplot(221)
plt.plot(X_train_r[:, 0], Y_train_r[:, 0], "ob", label="train")
plt.plot(X_test_r[:, 0], Y_test_r[:, 0], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 1: X vs Y (test corr = %.2f)' %
          np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc="best")

plt.subplot(224)
plt.plot(X_train_r[:, 1], Y_train_r[:, 1], "ob", label="train")
plt.plot(X_test_r[:, 1], Y_test_r[:, 1], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 2: X vs Y (test corr = %.2f)' %
          np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc="best")

# 2) Off the diagonal: plot components 1 vs 2 for X and Y
plt.subplot(222)
plt.plot(X_train_r[:, 0], X_train_r[:, 1], "*b", label="train")
plt.plot(X_test_r[:, 0], X_test_r[:, 1], "*r", label="test")
plt.xlabel("X comp. 1")
plt.ylabel("X comp. 2")
plt.title('X comp. 1 vs X comp. 2 (test corr = %.2f)'
          % np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1])
plt.legend(loc="best")
plt.xticks(())
plt.yticks(())

plt.subplot(223)
plt.plot(Y_train_r[:, 0], Y_train_r[:, 1], "*b", label="train")
plt.plot(Y_test_r[:, 0], Y_test_r[:, 1], "*r", label="test")
plt.xlabel("Y comp. 1")
plt.ylabel("Y comp. 2")
plt.title('Y comp. 1 vs Y comp. 2 (test corr = %.2f)'
          % np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
plt.legend(loc="best")
plt.xticks(())
plt.yticks(())
plt.show()

###############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

###############################################################################
# CCA (PLS mode B with symmetric deflation)

cca = CCA(n_components=2)
cca.fit(X_train, Y_train)
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)