import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
tree_classif = DecisionTreeClassifier()
wine = load_wine() # a dict-like Bunch object
feats = wine["data"]
feats_name = wine["feature_names"]
feats_df = pd.DataFrame(feats, columns=feats_name)
targets = wine["target"].reshape((-1,1))
#feats_df.shape, targets.shape
train_X, test_X, train_y, test_y = train_test_split(feats_df, targets, 
													test_size=0.2, random_state=42)
train_X.shape, test_X.shape, train_y.shape, test_y.shape
# ((142, 13), (36, 13), (142, 1), (36, 1))

1. Cross-Validation

https://scikit-learn.org/stable/modules/cross_validation.html

https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

1.1 Automatic cross-validation

  • cross_val_score : returns a 1-D array holding one score (for a single metric) per fold
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_classif, train_X, train_y, 
						 scoring="accuracy", cv=10)
# array([0.93333333, 1.        , 0.85714286, 0.85714286, 0.71428571,
#        0.92857143, 0.78571429, 0.92857143, 1.        , 0.85714286])
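The per-fold scores are usually summarized by their mean and standard deviation:

print(f"accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")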
  • cross_validate : returns a dict containing both timing information and metric results; multiple metrics can be computed at once
from sklearn.model_selection import cross_validate
scores = cross_validate(tree_classif, train_X, train_y, 
						scoring=["accuracy","f1_micro"], cv=10)
scores.keys()
# dict_keys(['fit_time', 'score_time', 'test_accuracy', 'test_f1_micro'])
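Each metric entry in the returned dict is again a per-fold array, so the same summary applies:

for name in ["test_accuracy", "test_f1_micro"]:
    print(f"{name}: {scores[name].mean():.3f} +/- {scores[name].std():.3f}")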
  • About the cv parameter

    (1) If not specified, it defaults to 5-fold cross-validation. For classification tasks StratifiedKFold is used; for regression tasks, KFold.

    (2) An integer specifies K-fold cross-validation with that many folds.

    (3) It can be a scikit-learn cross-validation splitter; see the KFold example in 1.2 below.

    (4) It can be a custom iterator of cross-validation indices; see approach 2 in the KFold example in 1.2 below, and the minimal sketch right after this list.
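As a minimal sketch of option (4): cv accepts any iterable that yields (train_indices, validation_indices) pairs. The hand-built single 80/20 split below is purely illustrative.

import numpy as np

n = train_X.shape[0]
rng = np.random.default_rng(42)
perm = rng.permutation(n)
cut = int(n * 0.8)
custom_cv = [(perm[:cut], perm[cut:])]   # one hand-made (train, validation) split
cross_val_score(tree_classif, train_X, train_y,
                scoring="accuracy", cv=custom_cv)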

1.2 Generating train/validation set indices

  • KFold : generates k mutually exclusive train/validation index splits
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)
for train_idx, test_idx in kf.split(train_X):
	print(f"train={len(train_idx)}, valid={len(test_idx)}")
# train=94, valid=48
# train=95, valid=47
# train=95, valid=47

Combined with the automatic cross-validation functions above, this amounts to a fully customized cross-validation scheme.

The same pattern also applies to the hyperparameter optimization in section 2 (see the sketch after the GridSearchCV example).

## Approach 1: pass the splitter object
cross_val_score(tree_classif, train_X, train_y,
                scoring="accuracy", cv=kf)
## Approach 2: pass the index generator returned by split()
cross_val_score(tree_classif, train_X, train_y,
                scoring="accuracy", cv=kf.split(train_X))
  • RepeatedKFold : repeats the k-fold splitting n_repeats times, each time with a different randomization
from sklearn.model_selection import RepeatedKFold
rkf = RepeatedKFold(n_splits=3, n_repeats=2, random_state=42)
for train_idx, test_idx in rkf.split(train_X):
	print(f"train={len(train_idx)}, valid={len(test_idx)}")
# train=94, valid=48
# train=95, valid=47
# train=95, valid=47
# train=94, valid=48
# train=95, valid=47
# train=95, valid=47
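Passed as cv, the repeated splitter yields n_splits × n_repeats scores (6 here), which smooths out the noise from any single fold assignment:

scores = cross_val_score(tree_classif, train_X, train_y,
                         scoring="accuracy", cv=rkf)
len(scores)  # 6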
  • StratifiedKFold : generates k mutually exclusive train/validation index splits using stratified sampling, so each fold preserves the class proportions
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
for train_idx, test_idx in skf.split(train_X, train_y):  # extra argument: the labels used for stratification
	print(f"train={len(train_idx)}, valid={len(test_idx)}")
  • GroupKFold : when the samples were designed in groups, guarantees that all samples from the same group land on the same side of each train/validation split.
import numpy as np
from sklearn.model_selection import GroupKFold
## group labels: here each group is assumed to contain exactly two samples
group = np.tile(np.arange(train_X.shape[0] // 2), 2)
gkf = GroupKFold(n_splits=5)
for train_idx, test_idx in gkf.split(train_X, groups=group):  # extra argument: the group labels
	print(f"train={len(train_idx)}, valid={len(test_idx)}")

2. Hyperparameter Optimization

https://scikit-learn.org/stable/modules/grid_search.html

2.1 Grid search

  • GridSearchCV performs a grid (brute-force) search over every combination of the candidate parameters
    • the candidate hyperparameter values are given as a dict
    • cv sets the cross-validation scheme; scoring sets the evaluation metric(s), and several may be given
    • refit controls whether the model is refit with the best hyperparameters; it defaults to True, and with multiple metrics one of them must be named as the criterion for comparing parameter combinations
from sklearn.model_selection import GridSearchCV
param_grid = {"max_depth": [3, 5, 10],
              "min_samples_split": [2, 4],
              "min_samples_leaf": [1, 3]}
# single metric
grid_search = GridSearchCV(tree_classif, param_grid, cv=3,
                           scoring="accuracy", refit=True, n_jobs=1)
# multiple metrics (this overwrites the single-metric search above)
grid_search = GridSearchCV(tree_classif, param_grid, cv=3,
                           scoring=["accuracy", "f1_micro"],  # compute several metrics
                           refit="accuracy",  # rank hyperparameter combinations by `accuracy`
                           n_jobs=-1)         # use all available cores

# fit: evaluate every hyperparameter combination
grid_search.fit(train_X, train_y)
# results for all hyperparameter combinations
pd.DataFrame(grid_search.cv_results_)

# best hyperparameter values and the corresponding score
grid_search.best_params_
grid_search.best_score_
# model refit with the best hyperparameters
best_tree_classif = grid_search.best_estimator_
best_tree_classif.score(test_X, test_y)

grid_search_cv_df = pd.DataFrame(grid_search.cv_results_)
# keep only the mean/rank/params summary columns
grid_search_cv_df.loc[:, [x.startswith(("mean", "rank", "params")) for x in grid_search_cv_df.columns]]
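As noted in section 1.2, the cv argument here also accepts a splitter object; a minimal sketch reusing the StratifiedKFold instance skf from above (the name grid_search_skf is just for illustration):

grid_search_skf = GridSearchCV(tree_classif, param_grid, cv=skf,
                               scoring="accuracy", n_jobs=-1)
grid_search_skf.fit(train_X, train_y.ravel())
grid_search_skf.best_params_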

2.2 Randomized search

  • RandomizedSearchCV : identical to the above except for n_iter, which sets the number of randomly sampled parameter combinations
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(tree_classif, param_grid,
                                   n_iter=10, cv=3,
                                   scoring=["accuracy", "f1_micro"],
                                   refit="accuracy", n_jobs=-1)