Ex 4: Imputing missing values before building an estimator

# (一)引入函式庫及內建測試資料庫

1. `sklearn.ensemble.RandomForestRegressor`: 隨機森林回歸
2. `sklearn.pipeline.Pipeline`: 串聯估計器
3. `sklearn.preprocessing.Imputer`: 缺失值填充
4. `sklearn.cross_validation.cross_val_score`: 交叉驗證

# (二)引入內建測試資料庫(boston房產資料)

# Load the Boston housing data set and record its dimensions.
from sklearn.datasets import load_boston

dataset = load_boston()
X_full, y_full = dataset.data, dataset.target
# shape is a (rows, columns) tuple; keep the two counts as separate integers.
n_samples = X_full.shape[0]
n_features = X_full.shape[1]
Copied!

('data', (506, 13))

('feature_names', (13,))

('target', (506,))

DESCR

# (三)利用整個數據集來預測

Score with the entire dataset = 0.56
# Baseline: cross-validated score of a random forest on the complete data.
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_full, y_full).mean()
print("Score with the entire dataset = %.2f" % score)
Copied!

# (四)模擬資料損失時之預測情形

Score without the samples containing missing values = 0.49
# Simulate data loss: flag 75% of the rows as containing a missing value.
missing_rate = 0.75
# np.floor returns a float, but array lengths and randint sizes must be ints.
n_missing_samples = int(np.floor(n_samples * missing_rate))
# Boolean mask with exactly n_missing_samples True entries, then shuffled.
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
rng.shuffle(missing_samples)
# One randomly chosen feature index per flagged row.
missing_features = rng.randint(0, n_features, n_missing_samples)

# Score the forest using only the rows that were not flagged.
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_filtered, y_filtered).mean()
print("Score without the samples containing missing values = %.2f" % score)
Copied!

# (五)填充missing values，估計填充後的得分

1
class sklearn.preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
Copied!

# Overwrite the selected (row, feature) entries with 0 to mark them missing,
# then score a pipeline that imputes those entries before fitting the forest.
X_missing = X_full.copy()
# np.where returns a tuple; [0] extracts the row-index array so each flagged
# row is paired with its own feature index.
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
# NOTE(review): 0 is the missing-value marker here, so genuine zeros in the
# data are presumably imputed as well -- confirm against the Imputer docs.
estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="mean",
                                          axis=0)),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100))])
score = cross_val_score(estimator, X_missing, y_missing).mean()
print("Score after imputation of the missing values = %.2f" % score)
Copied!

Score after imputation of the missing values = 0.57

# (六)完整程式碼

# Compare three ways of handling missing values with a random forest:
# the full data, the data with incomplete rows dropped, and imputed data.
#
# NOTE(review): this script targets the legacy scikit-learn API described in
# this document. Newer releases moved cross_val_score to
# sklearn.model_selection and replaced Imputer with
# sklearn.impute.SimpleImputer -- confirm against the installed version.
import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score

# Fixed seed so the simulated missing entries are reproducible.
rng = np.random.RandomState(0)

dataset = load_boston()
X_full, y_full = dataset.data, dataset.target
# shape is a (rows, columns) tuple; keep the two counts as separate integers.
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

# Estimate the score on the entire dataset, with no missing values
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_full, y_full).mean()
print("Score with the entire dataset = %.2f" % score)

# Add missing values in 75% of the lines
missing_rate = 0.75
# np.floor returns a float, but array lengths and randint sizes must be ints.
n_missing_samples = int(np.floor(n_samples * missing_rate))
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
rng.shuffle(missing_samples)
# One randomly chosen feature index per flagged row.
missing_features = rng.randint(0, n_features, n_missing_samples)

# Estimate the score without the lines containing missing values
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_filtered, y_filtered).mean()
print("Score without the samples containing missing values = %.2f" % score)

# Estimate the score after imputation of the missing values
X_missing = X_full.copy()
# np.where returns a tuple; [0] extracts the row-index array so each flagged
# row is paired with its own feature index.
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="mean",
                                          axis=0)),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100))])
score = cross_val_score(estimator, X_missing, y_missing).mean()
print("Score after imputation of the missing values = %.2f" % score)
Copied!
results:
Score with the entire dataset = 0.56
Score without the samples containing missing values = 0.48
Score after imputation of the missing values = 0.55
Copied!