1
2
library(mlr3verse)
library(tidyverse)

1、Task训练数据与目的

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
## 分类任务
task_classif = as_task_classif(data, target = "col_target")
#根据预测结果又可分为:twoclass二分类, multiclass多分类

## 回归任务
task_regr = as_task_regr(data, target = "col_target")

task$ncol
task$nrow
task$feature_names
task$feature_types
task$target_names
task$task_type
task$data()
task$col_roles

2、Learner 机器学习算法

  • mlr3learners包提供了基本的机器学习算法(如下图) https://github.com/mlr-org/mlr3learners

  • mlr3extralearners包在上述基本算法之上提供更加全面的机器学习算法https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html

1
2
3
4
mlr_learners$keys()
ALL_mlr_lrn = as.data.table(mlr_learners)
dim(ALL_mlr_lrn)
# [1] 135   7

2.1 分类算法

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
## mlr3extralearners包所支持的全部分类机器学习算法
mlr_learners$keys() %>% grep("classif", . , value=TRUE)
#  [1] "classif.AdaBoostM1"   "classif.bart"         "classif.C50"          "classif.catboost"    
#  [5] "classif.cforest"      "classif.ctree"        "classif.cv_glmnet"    "classif.debug"       
#  [9] "classif.earth"        "classif.featureless"  "classif.fnn"          "classif.gam"         
# [13] "classif.gamboost"     "classif.gausspr"      "classif.gbm"          "classif.glmboost"    
# [17] "classif.glmnet"       "classif.IBk"          "classif.J48"          "classif.JRip"        
# [21] "classif.kknn"         "classif.ksvm"         "classif.lda"          "classif.liblinear"   
# [25] "classif.lightgbm"     "classif.LMT"          "classif.log_reg"      "classif.lssvm"       
# [29] "classif.mob"          "classif.multinom"     "classif.naive_bayes"  "classif.nnet"        
# [33] "classif.OneR"         "classif.PART"         "classif.qda"          "classif.randomForest"
# [37] "classif.ranger"       "classif.rfsrc"        "classif.rpart"        "classif.svm"         
# [41] "classif.xgboost"
image-20220626084002138

2.2 回归算法

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
## mlr3extralearners包所支持的全部回归机器学习算法
mlr_learners$keys() %>% grep("regr", . , value=TRUE)
# [1] "regr.bart"         "regr.catboost"     "regr.cforest"      "regr.ctree"       
# [5] "regr.cubist"       "regr.cv_glmnet"    "regr.debug"        "regr.earth"       
# [9] "regr.featureless"  "regr.fnn"          "regr.gam"          "regr.gamboost"    
# [13] "regr.gausspr"      "regr.gbm"          "regr.glm"          "regr.glmboost"    
# [17] "regr.glmnet"       "regr.IBk"          "regr.kknn"         "regr.km"          
# [21] "regr.ksvm"         "regr.liblinear"    "regr.lightgbm"     "regr.lm"          
# [25] "regr.lmer"         "regr.M5Rules"      "regr.mars"         "regr.mob"         
# [29] "regr.randomForest" "regr.ranger"       "regr.rfsrc"        "regr.rpart"       
# [33] "regr.rvm"          "regr.svm"          "regr.xgboost" 
image-20220626084329508

2.3 算法超参数

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
learner = lrn("classif.kknn")
learner
# <LearnerClassifKKNN:classif.kknn>
#   * Model: -
#   * Parameters: k=7
# * Packages: mlr3, mlr3learners, kknn
# * Predict Type: response
# * Feature types: logical, integer, numeric, factor, ordered
# * Properties: multiclass, twoclass

## 对于分类算法的预测结果默认的Predict Type为response
## 设置为prob,则会返回属于每一类别的概率
#  learner = lrn("classif.kknn", predict_type = "prob")


#算法的超参数
learner$param_set
# <ParamSet>
#             id    class lower upper nlevels default value
# 1:           k ParamInt     1   Inf     Inf       7     7
# 2:    distance ParamDbl     0   Inf     Inf       2      
# 3:      kernel ParamFct    NA    NA      10 optimal      
# 4:       scale ParamLgl    NA    NA       2    TRUE      
# 5:     ykernel ParamUty    NA    NA     Inf              
# 6: store_model ParamLgl    NA    NA       2   FALSE    

#修改设置超参数
learner$param_set$values
# $k
# [1] 7

learner$param_set$values$k = 3

3、交叉验证评价模型

1
2
3
4
ALL_mlr_resample = as.data.table(mlr_resamplings)
mlr_resamplings$keys()
# [1] "bootstrap"   "custom"      "custom_cv"   "cv"          "holdout"     "insample"   
# [7] "loo"         "repeated_cv" "subsampling"

3.1 评价指标

mlr3包提供了评价分类、回归模型的多种评价指标。

其中对于分类模型,根据是多分类还是二分类、预测结果形式是response还是prop有不同的评价指标,注意区分。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
mlr_measures$keys()
ALL_mlr_msr = as.data.table(mlr_measures)
head(ALL_mlr_msr)
#               key                          label task_type          packages predict_type task_properties
# 1:            aic   Akaika Information Criterion      <NA>              mlr3     response                
# 2:            bic Bayesian Information Criterion      <NA>              mlr3     response                
# 3:    classif.acc        Classification Accuracy   classif mlr3,mlr3measures     response                
# 4:    classif.auc       Area Under the ROC Curve   classif mlr3,mlr3measures         prob        twoclass
# 5:   classif.bacc              Balanced Accuracy   classif mlr3,mlr3measures     response                
# 6: classif.bbrier             Binary Brier Score   classif mlr3,mlr3measures         prob        twoclass

ALL_mlr_msr %>% 
  dplyr::filter(task_type=="classif",
                predict_type=="response",
                task_properties!="twoclass") %>% .[,c(1,2)]
#              key                         label
# 1:   classif.acc       Classification Accuracy
# 2:  classif.bacc             Balanced Accuracy
# 3:    classif.ce          Classification Error
# 4: classif.costs Cost-sensitive Classification

3.2 留出法交叉验证

将数据分为训练集与测试集。首先在训练集训练模型参数,然后在测试集测试模型性能。

(1)手动拆分数据集

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
task = as_task_classif(diabetes, target = "class")
split = partition(task, ratio = 0.6, stratify = T)
str(split)
# List of 2
# $ train: int [1:88] 2 3 5 6 7 9 10 11 12 13 ...
# $ test : int [1:57] 1 4 8 17 19 20 21 22 23 24 ...

learner = lrn("classif.kknn")
#先用训练集训练模型
learner$train(task, row_ids = split$train)
#再在测试集评价模型
prediction = learner$predict(task, row_ids = split$test)
as.data.table(prediction) %>% head
#    row_ids  truth response
# 1:       2 Normal   Normal
# 2:       5 Normal   Normal
# 3:       7 Normal   Normal

#指标评价
prediction$score(msr("classif.acc"))

(2)使用holdout验证方法(推荐)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
task$col_roles$stratum = "class"
#对于交叉验证,分训练集与测试集时根据 class列进行分层抽样
resampling = rsmp("holdout")
print(resampling)
resampling$param_set$values
# $ratio
# [1] 0.6666667
# 默认2/3 训练集 1/3测试集

# split = resampling$instantiate(task)
# str(split$instance)
# # List of 2
# #  $ train: int [1:97] 1 3 4 5 6 7 12 14 15 17 ...
# #  $ test : int [1:48] 2 8 9 10 11 13 16 21 23 28 ...

rr = resample(task, learner, resampling)
str(rr)
# Classes 'ResampleResult', 'R6' <ResampleResult> 
rr$prediction()
# <PredictionClassif> for 48 observations:
## 对于测试集的预测结果
rr$prediction() %>% as.data.table() %>% head
#    row_ids  truth response
# 1:       2 Normal   Normal
# 2:       8 Normal   Normal
# 3:       9 Normal   Normal

rr$score(msr("classif.acc"))[,c(-1,-3,-5,-8)]
#     task_id   learner_id resampling_id iteration classif.acc
# 1: diabetes classif.kknn       holdout         1   0.8541667

rr$score(msrs(c("classif.acc","classif.bacc","classif.ce")))[,c(-1,-3,-5,-8)]
#     task_id   learner_id resampling_id iteration classif.acc classif.bacc classif.ce
# 1: diabetes classif.kknn       holdout         1   0.8541667    0.8397436  0.1458333

3.3 K折交叉验证

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
resampling = rsmp("cv")
print(resampling)
resampling$param_set$values
# $folds
# [1] 10
# 默认10折交叉验证

rr = resample(task, learner, resampling)
str(rr)
# Classes 'ResampleResult', 'R6' <ResampleResult> 
rr$prediction()
# <PredictionClassif> for 145 observations:
## 对于测试集的预测结果
rr$prediction() %>% as.data.table() %>% head

##性能评价
rr$score(msr("classif.acc"))[,c(-1,-3,-5,-8)]
#     task_id   learner_id resampling_id iteration classif.acc
#  1: diabetes classif.kknn            cv         1   1.0000000
#  2: diabetes classif.kknn            cv         2   0.9333333
#  3: diabetes classif.kknn            cv         3   0.8000000
#  4: diabetes classif.kknn            cv         4   0.8000000
#  5: diabetes classif.kknn            cv         5   0.8666667
#  6: diabetes classif.kknn            cv         6   0.9285714
#  7: diabetes classif.kknn            cv         7   0.9285714
#  8: diabetes classif.kknn            cv         8   0.9285714
#  9: diabetes classif.kknn            cv         9   0.9285714
# 10: diabetes classif.kknn            cv        10   0.9285714

rr$aggregate(msr("classif.acc"))
# classif.acc 
#   0.9042857
rr$aggregate(msrs(c("classif.acc","classif.bacc","classif.ce")))
# classif.acc classif.bacc   classif.ce 
#   0.90428571   0.88711520   0.09571429

4、超参数优化

每种机器学习算法都有特定含义的超参数。

超参数优化是指寻找一个超参数使得模型的性能相对来说由于其它超参数。

首先需要选择候选超参数,其次需要定义模型性能的评价方式与指标。

4.1 简单流程

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
task = as_task_classif(diabetes, target = "class")
learner = lrn("classif.kknn")

##定义候选超参数范围--3、4、5、6、7、8、9、10
search_space <- ps(
  k = p_int(lower = 3, upper = 10)
)
##定义模型评价方式--10折交叉验证
resampling = rsmp("cv")
##定义模型评价指标--分类错误率
measure = msr("classif.ce")
##根据可用的预算提前终止训练
terminator = trm("none")

instance = TuningInstanceSingleCrit$new(
  task = task,
  learner = learner,
  resampling = resampling,
  measure = measure,
  terminator = trm("none"),
  search_space = search_space
)

tuner = tnr("grid_search")       #默认resolution=10
tuner$optimize(instance)
instance$archive %>% as.data.table() %>% .[,c(1,2)]
#     k classif.ce
# 1:  4 0.10333333
# 2:  6 0.09619048
# 3:  9 0.08952381
# 4:  3 0.08952381
# 5: 10 0.10285714
# 6:  5 0.10333333
# 7:  8 0.08952381
# 8:  7 0.09619048

##最优超参数
instance$result_learner_param_vals
# $k
# [1] 8
##最优超参数的模型指标
instance$result_y
# classif.ce 
# 0.07619048 

4.2 设置超参数空间

  • 支持4种基本超参数类型
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
# 整型
p_int(lower=1,upper=10)
# 小数
p_dbl(lower=0.01,upper=1)
# 字符类型
p_fct(c("a", "b", "c"))
# 逻辑值
p_lgl()

search_space <- ps(
  hp1 = p_int(lower=1,upper=10),
  hp2 = p_dbl(lower=0.01,upper=1),
  hp3 = p_fct(c("a", "b", "c"))
)
# <ParamSet>
#     id    class lower upper nlevels        default value
# 1: hp1 ParamInt  1.00    10      10 <NoDefault[3]>      
# 2: hp2 ParamDbl  0.01     1     Inf <NoDefault[3]>      
# 3: hp3 ParamFct    NA    NA       3 <NoDefault[3]>

如上,当设置小数类型的超参数时,默认有无限种可能。因此需要resolution参数限定。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# (1)resolution 默认以上下边界值的n个等距值
# 3*3*3 = 27
generate_design_grid(search_space, resolution = 3)
# <Design> with 27 rows:
#     hp1   hp2 hp3
#  1:   1 0.010   a
#  2:   1 0.010   b
#  3:   1 0.010   c

# (2)param_resolutions为不同的超参数设置不同的resolution
generate_design_grid(search_space, 
					 param_resolutions = c(hp1 = 5, hp2 = 3))
# <Design> with 45 rows:
#     hp1   hp2 hp3
#  1:   1 0.010   a
#  2:   1 0.010   b
#  3:   1 0.010   c

# (3) trafo定义不等间距的超参数范围
search_space = ps(
  hp1 = p_int(-1,2, trafo = function(x) 10^x),
  hp2 = p_int(1,5, trafo = function(x) 2^x)
)
data.table::rbindlist(generate_design_grid(search_space, 
					 param_resolutions = c(hp1 = 4, hp2 = 5))$transpose())
 #      hp1 hp2
 # 1:   0.1   2
 # 2:   0.1   4
 # 3:   0.1   8
 # 4:   0.1  16
 # 5:   0.1  32

4.3 设置遍历方式

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
as.data.table(mlr_tuners)[,c(1,2)]
#              key                                           label
# 1:         cmaes Covariance Matrix Adaptation Evolution Strategy
# 2: design_points                                   Design Points
# 3:         gensa                 Generalized Simulated Annealing
# 4:   grid_search                                     Grid Search
# 5:         irace                                 Iterated Racing
# 6:        nloptr                         Non-linear Optimization
# 7: random_search                                   Random Search

## grid_search  会将每一种超参数组合计算一遍
## design_points仅遍历自定义的超参数组合范围(推荐)

##(1) 上面提到的resolution方式
search_space = ps(
  k = p_int(lower = 3, upper = 50)
)
design = generate_design_grid(search_space, resolution = 5)
design$data
#     k
# 1:  3
# 2: 14
# 3: 26
# 4: 38
# 5: 50
tuner = tnr("design_points", design = design$data)
tuner$optimize(instance)
# 需要注意design的候选超参数必须包括在instance的search_space范围内

##(2) 完全自定义的获选超参数
design = data.frame(k=c(3,5,7,9)) %>% as.data.table()
tuner = tnr("design_points", design = design)  
tuner$optimize(instance)

4.4 关于terminator

当提供的候选超参数组合过多,占据太多计算资源时,设置TuningInstanceSingleCrit$new()terminator参数提前种终止遍历。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
as.data.table(mlr_terminators)
#                 key                label             properties        unit
# 1:       clock_time           Clock Time single-crit,multi-crit     seconds
# 2:            combo          Combination single-crit,multi-crit     percent
# 3:            evals Number of Evaluation single-crit,multi-crit evaluations
# 4:             none                 None single-crit,multi-crit     percent
# 5:     perf_reached    Performance Level            single-crit     percent
# 6:         run_time             Run Time single-crit,multi-crit     seconds
# 7:       stagnation           Stagnation            single-crit     percent
# 8: stagnation_batch     Stagnation Batch            single-crit     percent

# clock_time--在给定时间后终止
# evals--在给定的迭代量之后终止
# perf_reached--在达到特定性能后终止
# stagnation--当优化没有改善时终止
# none--不设置终止条件

5、训练最终模型与预测新数据

根据超参数优化结果,基于全部数据,选用最佳的超参数训练得到最终的机器学习模型

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
task = as_task_classif(diabetes, target = "class")
learner = lrn("classif.kknn")
learner$param_set$values$k = 8
learner$train(task)
#查看模型
learner$model

#使用模型预测新数据
new_data = data.frame(glucose=c(200,300),
					  insulin=c(500,1000),
					  sspg=c(100,50))
new_data
#   glucose insulin sspg
# 1     200     500  100
# 2     300    1000   50
learner$predict_newdata(new_data)
# <PredictionClassif> for 2 observations:
#  row_ids truth response
#        1  <NA>    Overt
#        2  <NA>    Overt

# 保存模型
saveRDS(learner, 'diabetes_knn.rds')
model = readRDS('diabetes_knn.rds')
model$predict_newdata(new_data)