## ApacheCN_飞龙 / sklearn-cb

Explore and code with more than 2 million developers，Free private repositories ！：）
5.md 36.47 KB
ApacheCN_飞龙 authored 2017-06-22 15:46 . 6.22

# 第五章 模型后处理

## 5.1 K-fold 交叉验证

### 工作原理

>>> N = 1000
>>> holdout = 200
>>> from sklearn.datasets import make_regression
>>> X, y = make_regression(1000, shuffle=True) 

>>> X_h, y_h = X[:holdout], y[:holdout]
>>> X_t, y_t = X[holdout:], y[holdout:]
>>> from sklearn.cross_validation import KFold 

K-fold 给了我们一些选项，来选择我们想要多少个折叠，是否让值为下标或者布尔值，是否打算打乱数据集，最后是随机状态（主要出于再现性）。下标参数实际上会在之后的版本中弃用（deprecated），假设它为`True`。

>>> kfold = KFold(len(y_t), n_folds=4)

>>> output_string = "Fold: {}, N_train: {}, N_test: {}"

>>> for i, (train, test) in enumerate(kfold):
print output_string.format(i, len(y_t[train]), len(y_t[test]))

Fold: 0, N_train: 600, N_test: 200
Fold: 1, N_train: 600, N_test: 200
Fold: 2, N_train: 600, N_test: 200
Fold: 3, N_train: 600, N_test: 200

### 工作原理

>>> import numpy as np
>>> import pandas as pd

>>> patients = np.repeat(np.arange(0, 100, dtype=np.int8), 8)

>>> measurements = pd.DataFrame({'patient_id': patients,
'ys': np.random.normal(0, 1, 800)}) 

>>> custids = np.unique(measurements.patient_id)
>>> customer_kfold = KFold(custids.size, n_folds=4)

>>> output_string = "Fold: {}, N_train: {}, N_test: {}"

>>> for i, (train, test) in enumerate(customer_kfold):
train_cust_ids = custids[train]
training = measurements[measurements.patient_id.isin(
train_cust_ids)]
testing = measurements[~measurements.patient_id.isin(
train_cust_ids)]

print output_string.format(i, len(training), len(testing))

Fold: 0, N_train: 600, N_test: 200
Fold: 1, N_train: 600, N_test: 200
Fold: 2, N_train: 600, N_test: 200
Fold: 3, N_train: 600, N_test: 200 

## 5.2 自动化交叉验证

### 工作原理

>>> from sklearn import ensemble
>>> rf = ensemble.RandomForestRegressor(max_features='auto')

>>> from sklearn import datasets
>>> X, y = datasets.make_regression(10000, 10)

>>> from sklearn import cross_validation

>>> scores = cross_validation.cross_val_score(rf, X, y)

>>> print scores
[ 0.86823874  0.86763225  0.86986129]

## 工作原理

>>> scores = cross_validation.cross_val_score(rf, X, y, verbose=3,
cv=4)

[CV] no parameters to be set
[CV] no parameters to be set, score=0.872866 -   0.7s
[CV] no parameters to be set
[CV] no parameters to be set, score=0.873679 -   0.6s
[CV] no parameters to be set
[CV] no parameters to be set, score=0.878018 -   0.7s
[CV] no parameters to be set
[CV] no parameters to be set, score=0.871598 -   0.6s

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.6s finished 

## 5.3 使用 ShuffleSplit 交叉验证

ShuffleSplit是最简单的交叉验证技巧之一。这个交叉验证技巧只是将数据的样本用于指定的迭代数量。

### 准备

ShuffleSplit是另一个简单的交叉验证技巧。我们会指定数据集中的总元素，并且它会考虑剩余部分。我们会浏览一个例子，估计单变量数据集的均值。这有点类似于重采样，但是它说明了一个原因，为什么我们在展示交叉验证的时候使用交叉验证。

### 操作步骤

>>> import numpy as np

>>> true_loc = 1000
>>> true_scale = 10
>>> N = 1000

>>> dataset = np.random.normal(true_loc, true_scale, N)

>>> import matplotlib.pyplot as plt

>>> f, ax = plt.subplots(figsize=(7, 5))

>>> ax.hist(dataset, color='k', alpha=.65, histtype='stepfilled');
>>> ax.set_title("Histogram of dataset");

>>> f.savefig("978-1-78398-948-5_06_06.png") 

NumPy 输出如下：

>>> from sklearn import cross_validation

>>> holdout_set = dataset[:500]
>>> fitting_set = dataset[500:]

>>> estimate = fitting_set[:N/2].mean()

>>> import matplotlib.pyplot as plt

>>> f, ax = plt.subplots(figsize=(7, 5))

>>> ax.set_title("True Mean vs Regular Estimate")

>>> ax.vlines(true_loc, 0, 1, color='r', linestyles='-', lw=5,
alpha=.65, label='true mean')
>>> ax.vlines(estimate, 0, 1, color='g', linestyles='-', lw=5,
alpha=.65, label='regular estimate')

>>> ax.set_xlim(999, 1001)

>>> ax.legend()

>>> f.savefig("978-1-78398-948-5_06_07.png") 


>>> from sklearn.cross_validation import ShuffleSplit

>>> shuffle_split = ShuffleSplit(len(fitting_set))

>>> mean_p = []

>>> for train, _ in shuffle_split:
mean_p.append(fitting_set[train].mean())
shuf_estimate = np.mean(mean_p)

>>> import matplotlib.pyplot as plt

>>> f, ax = plt.subplots(figsize=(7, 5))

>>> ax.vlines(true_loc, 0, 1, color='r', linestyles='-', lw=5,
alpha=.65, label='true mean')
>>> ax.vlines(estimate, 0, 1, color='g', linestyles='-', lw=5,
alpha=.65, label='regular estimate')
>>> ax.vlines(shuf_estimate, 0, 1, color='b', linestyles='-', lw=5,
alpha=.65, label='shufflesplit estimate')

>>> ax.set_title("All Estimates")
>>> ax.set_xlim(999, 1001)

>>> ax.legend(loc=3)

## 5.4 分层的 k-fold

### 准备

>>> from sklearn import datasets
>>> X, y = datasets.make_classification(n_samples=int(1e3),
weights=[1./11])


>>> y.mean()
0.90300000000000002

90.3% 的样本都是 1，其余为 0。

### 操作步骤

>>> from sklearn import cross_validation

>>> n_folds = 50

>>> strat_kfold = cross_validation.StratifiedKFold(y,
n_folds=n_folds)
>>> shuff_split = cross_validation.ShuffleSplit(n=len(y),
n_iter=n_folds)

>>> kfold_y_props = []
>>> shuff_y_props = []

>>> for (k_train, k_test), (s_train, s_test) in zip(strat_kfold,
shuff_split):
kfold_y_props.append(y[k_train].mean())
shuff_y_props.append(y[s_train].mean()) 


>>> import matplotlib.pyplot as plt

>>> f, ax = plt.subplots(figsize=(7, 5))

>>> ax.plot(range(n_folds), shuff_y_props, label="ShuffleSplit",
color='k')
>>> ax.plot(range(n_folds), kfold_y_props, label="Stratified",
color='k', ls='--')
>>> ax.set_title("Comparing class proportions.")

>>> ax.legend(loc='best')


### 工作原理


>>> import numpy as np

>>> three_classes = np.random.choice([1,2,3], p=[.1, .4, .5],
size=1000)

>>> import itertools as it

>>> for train, test in cross_validation.StratifiedKFold(three_classes, 5):
print np.bincount(three_classes[train])

[  0  90 314 395]
[  0  90 314 395]
[  0  90 314 395]
[  0  91 315 395]
[  0  91 315 396]

## 5.5 菜鸟的网格搜索

### 准备

• 在参数空间中设计基本的搜索网格。

• 迭代网格并检查数据集的参数空间中的每个点的损失或评分函数。

• 选取参数空间中的点，它使评分函数最大或者最小。

criteria = {gini, entropy}
max_features = {auto, log2, None}

>>> from sklearn import datasets
>>> X, y = datasets.make_classification(n_samples=2000, n_features=10)

### 操作步骤

import numpy as np
train_set = np.random.choice([True, False], size=len(y))
from sklearn.tree import DecisionTreeClassifier
accuracies = {}
for criterion, max_feature in parameter_space:
dt = DecisionTreeClassifier(criterion=criterion,
max_features=max_feature)
dt.fit(X[train_set], y[train_set])
accuracies[(criterion, max_feature)] = (dt.predict(X[~train_set])
== y[~train_set]).mean()
>>> accuracies
{('entropy', None): 0.974609375, ('entropy', 'auto'): 0.9736328125, ('entropy', 'log2'): 0.962890625, ('gini', None): 0.9677734375, ('gini', 'auto'): 0.9638671875, ('gini', 'log2'): 0.96875}

>>> from matplotlib import pyplot as plt
>>> from matplotlib import cm
>>> cmap = cm.RdBu_r
>>> f, ax = plt.subplots(figsize=(7, 4))
>>> ax.set_xticklabels([''] + list(criteria))
>>> ax.set_yticklabels([''] + list(max_features))
>>> plot_array = []
>>> for max_feature in max_features:
m = []
>>> for criterion in criteria:
m.append(accuracies[(criterion, max_feature)])
plot_array.append(m)
>>> colors = ax.matshow(plot_array, vmin=np.min(accuracies.values()) -
0.001, vmax=np.max(accuracies.values()) + 0.001, cmap=cmap)
>>> f.colorbar(colors) 

### 工作原理

1. 选取一系列参数
2. 迭代它们并求得每一步的准确率
3. 通过可视化来寻找最佳的表现

## 5.6 爆破网格搜索

### 准备

1. 创建一些数据集

2. 之后创建LogisticRegression 对象，训练我们的模型

3. 之后，我们创建搜索对象，GridSearchCV 和 RandomizedSearchCV

### 工作原理

>>> from sklearn.datasets import make_classification
>>> X, y = make_classification(1000, n_features=5)

>>> from sklearn.linear_model import LogisticRegression
>>> lr = LogisticRegression(class_weight='auto') 

>>> lr.fit(X, y)

LogisticRegression(C=1.0, class_weight={0: 0.25, 1: 0.75}, dual=False,
fit_intercept=True, intercept_scaling=1,
penalty='l2', random_state=None, tol=0.0001)

>>> grid_search_params = {'penalty': ['l1', 'l2'],
'C': [1, 2, 3, 4]}

>>> import scipy.stats as st
>>> import numpy as np
>>> random_search_params = {'penalty': ['l1', 'l2'],
'C': st.randint(1, 4)}


## 工作原理

>>> from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
>>> gs = GridSearchCV(lr, grid_search_params)

GridSearchCV 实现了和其他方法相同的 API：

>>> gs.fit(X, y)

GridSearchCV(cv=None, estimator=LogisticRegression(C=1.0,
class_weight='auto', dual=False, fit_intercept=True,
intercept_scaling=1, penalty='l2', random_state=None,
tol=0.0001), fit_params={}, iid=True, loss_func=None,
n_jobs=1, param_grid={'penalty': ['l1', 'l2'],
'C': [1, 2, 3, 4]}, pre_dispatch='2*n_jobs', refit=True,
score_func=None, scoring=None, verbose=0) 

>>> gs.grid_scores_
[mean: 0.90300, std: 0.01192, params: {'penalty': 'l1', 'C': 1},
mean: 0.90100, std: 0.01258, params: {'penalty': 'l2', 'C': 1},
mean: 0.90200, std: 0.01117, params: {'penalty': 'l1', 'C': 2},
mean: 0.90100, std: 0.01258, params: {'penalty': 'l2', 'C': 2},
mean: 0.90200, std: 0.01117, params: {'penalty': 'l1', 'C': 3},
mean: 0.90100, std: 0.01258, params: {'penalty': 'l2', 'C': 3},
mean: 0.90100, std: 0.01258, params: {'penalty': 'l1', 'C': 4},
mean: 0.90100, std: 0.01258, params: {'penalty': 'l2', 'C': 4}] 

>>> gs.grid_scores_[1][1]
0.90100000000000002

>>> max(gs.grid_scores_, key=lambda x: x[1])
mean: 0.90300, std: 0.01192, params: {'penalty': 'l1', 'C': 1}

## 5.7 使用伪造的估计器来比较结果

1. 创建一些随机数据

2. 训练多种伪造的估计器

### 操作步骤

>>> X, y = make_regression()

>>> from sklearn import dummy

>>> dumdum = dummy.DummyRegressor()

>>> dumdum.fit(X, y)

DummyRegressor(constant=None, strategy='mean')

>>> dumdum.predict(X)[:5]

array([ 2.23297907,  2.23297907,  2.23297907,  2.23297907,         2.23297907])

>>> predictors = [("mean", None),
("median", None),
("constant", 10)]

>>> for strategy, constant in predictors:
dumdum = dummy.DummyRegressor(strategy=strategy,
constant=constant)

>>> dumdum.fit(X, y)

>>> print "strategy: {}".format(strategy), ",".join(map(str,
dumdum.predict(X)[:5]))

strategy: mean 2.23297906733,2.23297906733,2.23297906733,2.23297906733,2.23297906733
strategy: median 20.38535248,20.38535248,20.38535248,20.38535248,20.38535248
strategy: constant 10.0,10.0,10.0,10.0,10.0 

>>> predictors = [("constant", 0),
("stratified", None),
("uniform", None),
("most_frequent", None)] 

>>> X, y = make_classification()
>>> for strategy, constant in predictors:
dumdum = dummy.DummyClassifier(strategy=strategy,
constant=constant)
dumdum.fit(X, y)
print "strategy: {}".format(strategy), ",".join(map(str,
dumdum.predict(X)[:5]))
strategy: constant 0,0,0,0,0
strategy: stratified 1,0,0,1,0
strategy: uniform 0,0,0,1,1
strategy: most_frequent 1,1,1,1,1

### 工作原理

>>> X, y = make_classification(20000, weights=[.95, .05])

>>> dumdum = dummy.DummyClassifier(strategy='most_frequent')

>>> dumdum.fit(X, y)
DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

>>> from sklearn.metrics import accuracy_score

>>> print accuracy_score(y, dumdum.predict(X))

0.94575

## 5.8 回归模型评估

### 准备

m = 2
b = 1
y = lambda x: m * x + b

>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> from sklearn import metrics

### 操作步骤

1. 使用y来生成y_actual

2. 使用y_actual加上一些err生成y_prediction'

3. 绘制差异

4. 遍历不同的度量并绘制它们

def data(x, m=2, b=1, e=None, s=10):
    """Generate y values from the linear process y = m * x + b, plus an
    optional error term.

    Args:
        x: The x value(s); a scalar or a NumPy array.
        m: Slope.
        b: Intercept.
        e: Error term. ``None`` adds no error, ``True`` adds random
           normal error with standard deviation ``s``, and any other
           value is added as-is.
        s: Standard deviation of the random error when ``e is True``.

    Returns:
        ``x * m + b + error``, with the same shape as ``x``.
    """
    if e is None:
        e_i = 0
    elif e is True:
        # Bug fix: the original sized the noise with len(xs), a global
        # from the surrounding session, which fails (or is wrong) for
        # any other input. np.size(x) matches the actual input and also
        # handles scalar x, where len() would raise.
        e_i = np.random.normal(0, s, np.size(x))
    else:
        e_i = e

    return x * m + b + e_i

>>> from functools import partial

>>> N = 100
>>> xs = np.sort(np.random.rand(N)*100)

>>> y_pred_gen = partial(data, x=xs, e=True)
>>> y_true_gen = partial(data, x=xs)

>>> y_pred = y_pred_gen()
>>> y_true = y_true_gen()

>>> f, ax = plt.subplots(figsize=(7, 5))

>>> ax.set_title("Plotting the fit vs the underlying process.")
>>> ax.scatter(xs, y_pred, label=r'$\hat{y}$')

>>> ax.plot(xs, y_true, label=r'$y$')

>>> ax.legend(loc='best') 

>>> e_hat = y_pred - y_true

>>> f, ax = plt.subplots(figsize=(7, 5))

>>> ax.set_title("Residuals")
>>> ax.hist(e_hat, color='r', alpha=.5, histtype='stepfilled')

### 工作原理

MSE(y_true, y_pred) = E((y_true - y_pred)^2)
mse = ((y_true - y_pred) ** 2).mean()

>>> metrics.mean_squared_error(y_true, y_pred)

93.342352628475368 

MAD(y_true, y_pred) = E(|y_true - y_pred|)
mad = np.abs(y_true - y_pred).mean()

rsq = 1 - ((y_true - y_pred) ** 2).sum() / ((y_true - y_true.mean()) ** 2).sum()
>>> metrics.r2_score(y_true, y_pred)

0.9729312117010761

R 平方是描述性的，它不提供模型准确性的清晰感觉。

## 5.9 特征选取

### 准备

>>> from sklearn import datasets
>>> X, y = datasets.make_regression(1000, 10000)

### 操作步骤

>>> from sklearn import feature_selection
>>> f, p = feature_selection.f_regression(X, y) 

>>> f[:5]
array([  1.06271357e-03, 2.91136869e+00, 1.01886922e+00,
2.22483130e+00, 4.67624756e-01])
>>> p[:5]
array([ 0.97400066, 0.08826831, 0.31303204, 0.1361235, 0.49424067]) 

>>> import numpy as np
>>> idx = np.arange(0, X.shape[1])
>>> features_to_keep = idx[p < .05]
>>> len(features_to_keep)
501

>>> var_threshold = feature_selection.VarianceThreshold(
np.median(np.var(X, axis=1)))

>>> var_threshold.fit_transform(X).shape

(1000, 4835)

### 工作原理

>>> X, y = datasets.make_regression(10000, 20)

>>> f, p = feature_selection.f_regression(X, y)

>>> from matplotlib import pyplot as plt

>>> f, ax = plt.subplots(figsize=(7, 5))

>>> ax.bar(np.arange(20), p, color='k')
>>> ax.set_title("Feature p values")

## 5.10 L1 范数上的特征选取

### 准备

1. 加载数据集

2. 训练基本的线性回归模型

3. 使用特征选取来移除不提供信息的特征

4. 重新训练线性回归，并与特征完整的模型相比，它拟合得多好。

### 操作步骤

>>> import sklearn.datasets as ds
>>> diabetes = ds.load_diabetes()

>>> from sklearn import metrics
>>> from sklearn import cross_validation

>>> shuff = cross_validation.ShuffleSplit(diabetes.target.size)

>>> mses = []
>>> for train, test in shuff:
train_X = diabetes.data[train]
train_y = diabetes.target[train]

test_X = diabetes.data[~train]
test_y = diabetes.target[~train]

lr.fit(train_X, train_y)
mses.append(metrics.mean_squared_error(test_y,
lr.predict(test_X)))

>>> np.mean(mses)
2856.366626198198

>>> from sklearn import feature_selection
>>> from sklearn import cross_validation

>>> cv = linear_model.LassoCV()
>>> cv.fit(diabetes.data, diabetes.target)
>>> cv.coef_

array([ -0. , -226.2375274 ,  526.85738059,  314.44026013,
-196.92164002, 1.48742026, -151.78054083, 106.52846989,
530.58541123, 64.50588257])

>>> import numpy as np
>>> columns = np.arange(diabetes.data.shape[1])[cv.coef_ != 0]
>>> columns
array([1, 2, 3, 4, 5, 6, 7, 8, 9])

>>> l1mses = []
>>> for train, test in shuff:
train_X = diabetes.data[train][:, columns]
train_y = diabetes.target[train]

test_X = diabetes.data[~train][:, columns]
test_y = diabetes.target[~train]

lr.fit(train_X, train_y)
l1mses.append(metrics.mean_squared_error(test_y,
lr.predict(test_X)))

>>> np.mean(l1mses)
2861.0763924492171
>>> np.mean(l1mses) - np.mean(mses)
4.7097662510191185

### 工作原理

>>> X, y = ds.make_regression(noise=5)

>>> mses = []

>>> shuff = cross_validation.ShuffleSplit(y.size)

>>> for train, test in shuff:
train_X = X[train]
train_y = y[train]

test_X = X[~train]
test_y = y[~train]

lr.fit(train_X, train_y)
mses.append(metrics.mean_squared_error(test_y,
lr.predict(test_X)))

>>> np.mean(mses)
879.75447864034209 

>>> cv.fit(X, y)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001,
fit_intercept=True, max_iter=1000, n_alphas=100,
n_jobs=1, normalize=False, positive=False, precompute='auto',
tol=0.0001, verbose=False) 

>>> import numpy as np
>>> columns = np.arange(X.shape[1])[cv.coef_ != 0]
>>> columns[:5]
array([11, 15, 17, 20, 21])
>>> mses = []

>>> shuff = cross_validation.ShuffleSplit(y.size)
>>> for train, test in shuff:
train_X = X[train][:, columns]
train_y = y[train]

test_X = X[~train][:, columns]
test_y = y[~train]

lr.fit(train_X, train_y)
mses.append(metrics.mean_squared_error(test_y,
lr.predict(test_X)))

>>> np.mean(mses)
15.755403220117708 

## 5.11 使用 joblib 保存模型

### 准备

1. 训练我们要保存的模型

2. 导入 joblib 并保存模型

### 操作步骤

>>> from sklearn import datasets, tree
>>> X, y = datasets.make_classification()

>>> dt = tree.DecisionTreeClassifier()
>>> dt.fit(X, y)

DecisionTreeClassifier(compute_importances=None, criterion='gini',
max_depth=None, max_features=None,
max_leaf_nodes=None, min_density=None,
min_samples_leaf=1, min_samples_split=2,
random_state=None, splitter='best')

>>> from sklearn.externals import joblib
>>> joblib.dump(dt, "dtree.clf")
['dtree.clf',
'dtree.clf_01.npy',
'dtree.clf_02.npy',
'dtree.clf_03.npy',
'dtree.clf_04.npy']

### 更多

>>> from sklearn import ensemble

>>> rf = ensemble.RandomForestClassifier()
>>> rf.fit(X, y)

RandomForestClassifier(bootstrap=True, compute_importances=None,
criterion='gini', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_density=None, min_samples_leaf=1,
min_samples_split=2, n_estimators=10,
n_jobs=1, oob_score=False,
random_state=None, verbose=0)

>>> joblib.dump(rf, "rf.clf")
['rf.clf',
'rf.clf_01.npy',
'rf.clf_02.npy',
'rf.clf_03.npy',
'rf.clf_04.npy',
'rf.clf_05.npy',
'rf.clf_06.npy',
...             ]