机器学习基于R(0)--mlr3基本流程 V2

https://mlr3book.mlr-org.com/

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


library(mlr3verse)
library(tidyverse)

# Each sugar helper, called without a key, returns the corresponding
# dictionary of objects shipped with the mlr3 ecosystem.
tsks() # predefined tasks (datasets)

lrns() # learning algorithms

msrs() # performance measures

# as.data.table() converts such a dictionary into a browsable table.
# A bare as.data.table() call fails because its argument is missing,
# so a concrete dictionary is passed here.
as.data.table(tsks())

1. Task 任务

https://mlr3book.mlr-org.com/chapters/chapter2/data_and_basic_modeling.html

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16


# tsk() without a key returns the dictionary of predefined tasks.
tsk()

# Predefined tasks
as.data.table(tsk())
tsk("mtcars")

# Custom task:
# `target` names the label column; `id` (optional) sets the task name.
tsk_mtcars <- as_task_regr(mtcars, target = "mpg", id = "cars")
# as_task_classif() is the classification counterpart. It also needs a
# data argument, so it is only mentioned here instead of being called
# bare, e.g. as_task_classif(data, target = "label").

# Task objects support viewing and modifying their data; see the link
# above for the full list. Two points deserve emphasis:

# row_ids are not plain row positions. They are fixed once the task is
# defined (think of them as row names), which makes later data splitting
# convenient.
tsk_mtcars$row_ids

# Use clone() whenever an independent copy of a task is needed.
tsk_mtcars_another <- tsk_mtcars$clone()

对于分类任务基本类似。值得注意的是在二分类问题时，需要进一步指定阳性标签
1
2
3
4
5
# Load the Sonar data set shipped with the mlbench package.
data(Sonar, package = "mlbench")
# Binary classification: state the positive class explicitly.
tsk_classif <- as_task_classif(Sonar, target = "Class", positive = "R")
tsk_classif$positive
# The positive class can be changed afterwards.
tsk_classif$positive <- "M"

2. Learner 算法

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


# mlr3learners provides roughly ten common learning algorithms; try these
# first when modelling.
# mlr3extralearners adds many more (>100) and must be installed separately.

# 0) Overview of all learners (or follow the link above)
learners_dt <- as.data.table(lrns())
learners_dt
learners_dt %>%
  dplyr::filter(grepl("mlr3learners", packages)) %>%
  dplyr::filter(task_type == "classif")

# 1) Define a learner
lrn_rpart <- lrn("regr.rpart")
# For classification, predict_type = "prob" / "response" additionally
# selects predicted probabilities / predicted class labels.

# 2) Simple split of the data into training and test sets
tsk_mtcars <- as_task_regr(mtcars, target = "mpg", id = "cars")
splits <- partition(tsk_mtcars)
str(splits)

# 3) Train and inspect the model
lrn_rpart$train(tsk_mtcars, row_ids = splits$train)
lrn_rpart$model

# 4) Predict
# Test set:
prediction <- lrn_rpart$predict(tsk_mtcars, row_ids = splits$test)
# New data: `mtcars_new` was never defined in the original snippet, so a
# small example data frame with the task's feature columns is built here.
mtcars_new <- data.frame(
  cyl = c(5, 6), disp = c(100, 120), hp = c(100, 150),
  drat = c(4, 3.9), wt = c(3.8, 4.1), qsec = c(18, 19.5),
  vs = c(1, 0), am = c(1, 1), gear = c(6, 4), carb = c(3, 5)
)
prediction2 <- lrn_rpart$predict_newdata(mtcars_new)

关于模型的超参数：

上述训练示例，使用了算法的默认超参数；
也可以在训练前自定义设置一组超参数

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


lrn_rpart <- lrn("regr.rpart")
# Hyperparameters supported by the learner
lrn_rpart$param_set
# Hyperparameter values currently set
lrn_rpart$param_set$values

# Three ways to set hyperparameters:
# (a) at construction time
lrn_rpart <- lrn("regr.rpart", maxdepth = 1)
# (b) by assigning a single value
lrn_rpart$param_set$values$maxdepth <- 2
# (c) by setting several values at once
lrn_rpart$param_set$set_values(xval = 2, cp = 0.5)

3. Measure 指标

https://mlr3book.mlr-org.com/chapters/chapter2/data_and_basic_modeling.html#sec-eval

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


# mlr3measures provides common measures for classification and regression.

# 0) List all measures
measures_dt <- as.data.table(msrs())
measures_dt %>%
  dplyr::filter(grepl("mlr3measures", packages)) %>%
  dplyr::filter(task_type == "classif")


# 1) Recap: train a learner and predict on the test split
lrn_rpart <- lrn("regr.rpart")
tsk_mtcars <- tsk("mtcars")
splits <- partition(tsk_mtcars)
lrn_rpart$train(tsk_mtcars, splits$train)
prediction <- lrn_rpart$predict(tsk_mtcars, splits$test)

# 2) Pick a measure
measure <- msr("regr.mae")
measure

# 3) Score the prediction
prediction$score(measure)

# 4) Several measures at once: use msrs() here, not msr()
measures <- msrs(c("regr.mse", "regr.mae"))
prediction$score(measures)

Tip：对于分类任务
相关指标，有的是适用于"prob"（概率值），有的是适用于"response"（标签结果）。
在评价前，可以修改默认的分类阈值，并查看相关的混淆矩阵
1
2
# NOTE(review): this tip targets classification predictions made with
# predict_type = "prob"; the `prediction` object built in the preceding
# regression example is presumably not suitable here -- confirm and
# substitute a classification prediction before running.
prediction$set_threshold(0.7)
# Confusion matrix of the (re-thresholded) predictions.
prediction$confusion

4. Resampling 交叉验证

https://mlr3book.mlr-org.com/chapters/chapter3/evaluation_and_benchmarking.html#sec-resampling

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22


# List the supported resampling strategies (details in the link above).
resampling_dt <- as.data.table(rsmps())

# 1) Define resampling strategies
# 3-fold CV
cv3 <- rsmp("cv", folds = 3)
# 5-fold CV repeated twice
rcv25 <- rsmp("repeated_cv", repeats = 2, folds = 5)

# 2) Define task and learner
tsk_mtcars <- tsk("mtcars")
lrn_rpart <- lrn("regr.rpart")

# 3) Run the resampling
rr <- resample(tsk_mtcars, lrn_rpart, cv3)
rr

# Per-fold scores
rr$score(msr("regr.mse"))
# Aggregated score
rr$aggregate(msr("regr.mse"))

# Predictions on the held-out sets, one object per fold
rr$predictions()
# Predictions on the held-out sets, combined into one object
rr$prediction()

Tips:
在执行交叉验证前，可提前查看每折的样本分割方式
1
2
3
4
# Fix the train/test splits of cv3 for this task before resampling.
cv3$instantiate(tsk_mtcars)
# Row ids of the training and test set of the first split
cv3$train_set(1)
cv3$test_set(1)
在resample()分析结果默认不保存每折的训练模型，可通过设置参数进行保存
1
2
# Keep the fitted model of every fold (not stored by default).
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
rr <- resample(tsk_mtcars, lrn_rpart, cv3, store_models = TRUE)
# Fitted model of the first fold
rr$learners[[1]]$model

特殊CV：由于数据样本的特殊性，在样本分割时可进行额外的设置

Grouped Resampling：保证同一组的数据不会既在训练集，也在测试集中；可参考如下图进行理解

https://mlr3book.mlr-org.com/chapters/chapter3/Figures/mlr3book_figures-7.svg

1
2
3
4
5
6
7


# Use the `year` column of the penguins task as the grouping variable.
tsk_grp <- tsk("penguins")
tsk_grp$set_col_roles("year", "group")

# Leave-one-out: each iteration holds out one group as the test set.
rsmp_loo <- rsmp("loo")
rsmp_loo$instantiate(tsk_grp)
table(tsk_grp$data(rows = rsmp_loo$train_set(1), cols = "year"))

Stratified Sampling：保证每次样本拆分时，某一特征的分布保持一致；可参考下图进行理解

https://mlr3book.mlr-org.com/chapters/chapter3/Figures/mlr3book_figures-8.svg

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


tsk_str <- tsk("penguins")
# Give `species` both the "target" and the "stratum" column role.
tsk_str$set_col_roles("species", c("target", "stratum"))
cv3$instantiate(tsk_str)

# Class proportions in the test sets of the first two folds
fold1 <- prop.table(table(
  tsk_str$data(rows = cv3$test_set(1), cols = "species")
))
fold2 <- prop.table(table(
  tsk_str$data(rows = cv3$test_set(2), cols = "species")
))
rbind("Fold 1" = fold1, "Fold 2" = fold2)

5. Benchmark 基准

https://mlr3book.mlr-org.com/chapters/chapter3/evaluation_and_benchmarking.html#sec-benchmarking

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19


# 1) Design the benchmark experiment
# Different data sets
tasks <- list(tsk("penguins"), tsk("sonar"))
# Different learners
learners <- list(lrn("classif.featureless"), lrn("classif.rpart"))
# Different resampling strategies
resamplings <- list(rsmp("cv"), rsmp("subsampling"))

# Seed set so that the resampling instantiation is reproducible
set.seed(123)
design <- benchmark_grid(tasks, learners, resamplings)
design

# 2) Run it
bmr <- benchmark(design)
bmr

bmr$score(msr("classif.acc"))
bmr$aggregate(msr("classif.acc"))

6. HPO超参数优化

https://mlr3book.mlr-org.com/chapters/chapter4/hyperparameter_optimization.html

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19


# 1) Define the hyperparameter search space of the learner
# Hyperparameters supported by the learner
lrn("classif.svm")$param_set
# Mark candidate ranges with to_tune(); TRUE is spelled out because `T`
# is an ordinary variable that can be reassigned.
learner <- lrn("classif.svm",
  type   = "C-classification",
  # character vector: candidate levels
  kernel = to_tune(c("radial", "linear")),
  # range spanning orders of magnitude: sample on the log scale
  cost   = to_tune(1e-1, 1e5, logscale = TRUE),
  # numeric parameters can also be given explicit candidate values
  gamma  = to_tune(c(0.1, 0.2, 0.5, 1))
)
learner


# 2) Termination criteria, see https://mlr-org.com/terminators.html
as.data.table(trm())
# Examples
trm("none")                  # no limit
trm("evals", n_evals = 5)    # stop after 5 evaluations
trm("run_time", secs = 1800) # stop after a run-time budget

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19


# 3) Create a tuning instance
tsk_sonar <- tsk("sonar")

# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
learner <- lrn("classif.svm",
  cost   = to_tune(1e-1, 1e5, logscale = TRUE),
  gamma  = to_tune(1e-1, 1),
  kernel = "radial",
  type   = "C-classification"
)

instance <- ti(
  task = tsk_sonar,                    # data task
  learner = learner,                   # learner (carries the search space)
  resampling = rsmp("cv", folds = 3),  # resampling strategy
  measures = msr("classif.ce"),        # performance measure
  terminator = trm("none")             # termination criterion
)

instance

Tips:

也可以ps()设置超参数空间，提供给ti()的search_space参数，详细参看https://mlr3book.mlr-org.com/chapters/chapter4/hyperparameter_optimization.html#sec-tune-ps

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13


# Search space defined explicitly with ps().
# `cost` is sampled on the log scale and mapped back with exp(), so its
# bounds must be given on the log scale too. With the raw bounds
# 1e-1..1e5 the transformed upper end would be exp(1e5) = Inf. The result
# covers the same range as to_tune(1e-1, 1e5, logscale = TRUE).
search_space <- ps(
  cost = p_dbl(
    lower = log(1e-1), upper = log(1e5),
    trafo = function(x) exp(x)
  ),
  gamma = p_dbl(lower = 0.1, upper = 1)
)

instance <- ti(
  task = tsk_sonar,
  learner = lrn("classif.svm", kernel = "radial", type = "C-classification"),
  resampling = rsmp("cv", folds = 3),
  measures = msr("classif.ce"),
  terminator = trm("none"),
  search_space = search_space
)

mlr3团队基于以往研究，收集了常见机器学习模型的常用超参数设置，可供用户直接使用

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


# Predefined search spaces collected by the mlr3 team
library(mlr3tuningspaces)
as.data.table(mlr_tuning_spaces)

# Default tuning space for the SVM classifier
lts_svm <- lts("classif.svm.default")
lts_svm

instance <- ti(
  task = tsk_sonar,
  learner = lrn("classif.svm", type = "C-classification"),
  resampling = rsmp("cv", folds = 3),
  measures = msr("classif.ce"),
  terminator = trm("none"),
  search_space = lts_svm
)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


# 4) Choose a tuning algorithm, see https://mlr-org.com/tuners.html
tnr()
as.data.table(tnr())
# Examples
tnr("random_search") # random search
tnr("grid_search")   # grid (Cartesian product)
tnr("mbo")           # Bayesian optimisation
# User-supplied design points.
# NOTE(review): `design` is not defined in this snippet; presumably a table
# of hyperparameter configurations is intended -- confirm.
tnr("design_points", design = design)
# Pick one tuner for the example above. The second assignment replaces
# the first; they are shown as alternatives.
# `resolution`: number of grid values per continuous hyperparameter.
tuner <- tnr("grid_search", resolution = 5, batch_size = 10)
# `param_resolutions`: per-parameter grid sizes (5 for cost, 3 for gamma).
tuner <- tnr(
  "grid_search",
  param_resolutions = c(cost = 5, gamma = 3),
  batch_size = 10
)
# `batch_size`: number of configurations evaluated per batch (loosely, how
# many can be processed together).
tuner
tuner$optimize(instance)

# Hyperparameters of the best configuration
instance$result$learner_param_vals
# All evaluated configurations
instance$archive


# 5) Fit the final model with the best hyperparameters
lrn_svm_tuned <- lrn("classif.svm")
lrn_svm_tuned$param_set$values <- instance$result_learner_param_vals
# Train on all data and show the fitted model
lrn_svm_tuned$train(tsk_sonar)$model

嵌套交叉验证（Nested-CV）：Nested resampling is a method to compare models and to estimate the generalization performance of a tuned model, however, this is the performance based on multiple different configurations (one from each outer fold) and not performance based on a single configuration.

其直观、清晰的步骤解释可参看这张图的介绍：https://mlr3book.mlr-org.com/chapters/chapter4/Figures/mlr3book_figures-11.svg

首先，需要介绍auto_tuner()。它将HPO过程包装为一个类似learner的对象

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15


# Wrap the whole tuning procedure into a learner-like object.
at <- auto_tuner(
  tuner = tnr("grid_search", resolution = 3, batch_size = 5),
  learner = lrn("classif.svm", type = "C-classification"),
  resampling = rsmp("cv", folds = 3),
  measure = msr("classif.ce"),
  search_space = lts("classif.svm.default")
)
at

split <- partition(tsk_sonar)
# Training first finds the best hyperparameters by cross-validation on the
# training set, then refits on the whole training set with them.
at$train(tsk_sonar, row_ids = split$train)

# Evaluate on the test set
at$predict(tsk_sonar, row_ids = split$test)$score()

然后，Nested CV本质可以理解为对上述示例，再次进行外部的交叉验证

1
2
3
4
5
6
7
8
9


# Outer 2-fold CV around the auto-tuner; models are stored so that the
# inner tuning results can be extracted afterwards.
rr <- resample(tsk_sonar, at, rsmp("cv", folds = 2), store_models = TRUE)

# Outer resampling results
rr$score(msr("classif.ce"))
rr$aggregate(msr("classif.ce"))

# Inner resampling (tuning) results
extract_inner_tuning_results(rr)
extract_inner_tuning_archives(rr)

7. FS 特征选择

https://mlr3book.mlr-org.com/chapters/chapter6/feature_selection.html

思路1：基于mlr3filters包，对每个特征进行单独打分，再设置阈值筛选

所有支持的打分方法有：https://mlr3filters.mlr-org.com/

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


# 1) Define the task
tsk_pen <- tsk("penguins")
# 2) Choose a filter score
flt_gain <- flt("information_gain")
# flt_cor <- flt("correlation", method = "spearman")
# 3) Compute the scores
flt_gain$calculate(tsk_pen)
as.data.table(flt_gain)

# Finally choose a cut-off (a score threshold or a rank threshold) and keep
# the features that pass it.

此外，有两个依赖于特定算法的打分指标

“importance”: If only a single filter method is to be used, the authors recommend to use a feature importance filter using random forest permutation importance

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


# "importance" is a recommended filter score; it needs a learner that
# supports it (random forests are a good choice).
# vapply() with logical(1) guarantees a logical index, whereas sapply()'s
# return type depends on its input.
as.data.table(mlr_learners)[
  vapply(properties, function(x) "importance" %in% x, logical(1))
]
lrn("classif.ranger")$param_set$levels$importance
# [1] "none"               "impurity"           "impurity_corrected" "permutation"

lrn_ranger <- lrn("classif.ranger", importance = "permutation")
# The ranger learner does not list "missing" among its properties, i.e. it
# cannot handle missing values, so incomplete rows are removed by hand.
tsk_pen <- tsk("penguins")
tsk_pen$filter(tsk_pen$row_ids[complete.cases(tsk_pen$data())])

flt_importance <- flt("importance", learner = lrn_ranger)
flt_importance$calculate(tsk_pen)
as.data.table(flt_importance)

"selected_features"：对于部分ML模型（例如决策树），其在建模过程中，仅会选择部分特征，可以作为特征选择的依据。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


# Learners that support the "selected_features" property.
# vapply() with logical(1) guarantees a logical index, whereas sapply()'s
# return type depends on its input.
as.data.table(mlr_learners)[
  vapply(properties, function(x) "selected_features" %in% x, logical(1))
]

# Example
tsk_pen <- tsk("penguins")
lrn_rpart <- lrn("classif.rpart")
flt_selected <- flt("selected_features", learner = lrn_rpart)
flt_selected$calculate(tsk_pen)
as.data.table(flt_selected)

思路2：基于mlr3fselect包，直接遍历、搜索最优的特征集合（类似于HPO过程）

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18


# 1) Define the feature-selection instance
instance <- fsi(
  task = tsk("penguins"),
  learner = lrn("classif.rpart"),
  resampling = rsmp("cv", folds = 3),
  measure = msr("classif.acc"),
  terminator = trm("evals", n_evals = 20) # termination criterion
)
# 2) Choose the search algorithm
# as.data.table(fs())
fselector <- fs("random_search")
# fselector <- fs("genetic_search")

# 3) Run the search
fselector$optimize(instance)

# Best feature set found
instance$result_feature_set
# Search log
as.data.table(instance$archive)

auto_fselector() 支持基于嵌套交叉验证的特征集合选择，详见 https://mlr3book.mlr-org.com/chapters/chapter6/feature_selection.html#sec-autofselect

8. Pipeline流程

https://mlr3book.mlr-org.com/chapters/chapter7/sequential_pipelines.html
首先：PipeOp提供了很多模块化操作，对输入数据进行特定的处理，再输出。（https://mlr-org.com/pipeops.html）

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18


# Instantiate a PipeOp
po_pca <- po("pca", center = TRUE)
po_pca

# Input data: a task wrapped in a list
tsk_small <- tsk("penguins_simple")$select(c("bill_depth", "bill_length"))
poin <- list(tsk_small$clone()$filter(1:5))

# "Train" the PipeOp
poout <- po_pca$train(poin)
# The result is a list with a single element named 'output'
poout
poout$output$head()

# "Predict" with the PipeOp
tsk_onepenguin <- tsk_small$clone()$filter(42)
poin <- list(tsk_onepenguin)
poout <- po_pca$predict(poin)
poout[[1]]$data()

Tips：模块的训练/预测数据格式都需要为list对象

然后：Graph可以将多个模块连接为一个“图”，进行整体的操作

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19


# 1) Define two independent PipeOps
po_mutate <- po(
  "mutate",
  mutation = list(bill_ratio = ~ bill_length / bill_depth)
)
po_scale <- po("scale")

# 2) Connect them into a graph with %>>%
graph <- po_mutate %>>% po_scale
graph
graph$plot(horizontal = TRUE)
graph$pipeops

# 3) Train / predict with the graph as a whole
res_train <- graph$train(tsk_small)
res_train

tsk_onepenguin <- tsk_small$clone()$filter(42)
res_predict <- graph$predict(tsk_onepenguin)
res_predict[[1]]$head()

最常用的是将数据预处理与建模过程进行结合，再转为GraphLearner对象

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16


# 1) Connect PipeOps (including a learner) into a graph
lrn_logreg <- lrn("classif.log_reg")
graph <- po("imputesample") %>>% lrn_logreg
# graph <- po("imputesample") %>>% po("learner", lrn_logreg)
graph$plot(horizontal = TRUE)

# 2) Convert the Graph into a GraphLearner
glrn_sample <- as_learner(graph)
glrn_mode <- as_learner(po("imputemode") %>>% lrn_logreg)

# 3) Use it like any learner, e.g. for resampling or benchmarking
design <- benchmark_grid(
  tsk("pima"),
  list(glrn_sample, glrn_mode),
  rsmp("cv", folds = 3)
)
bmr <- benchmark(design)
aggr <- bmr$aggregate()[, .(learner_id, classif.ce)]
aggr

Tips: https://mlr3book.mlr-org.com/chapters/chapter8/non-sequential_pipelines_and_tuning.html

（1）上述介绍的graph，多为sequential类型（一条直线）。此外也可以通过gunion()，创建non-sequential类型的Pipeline。

（2）此外，也内置了许多预定义好的Pipeline（ppl()），供用户方便地使用。例如：

ppl("bagging", graph): 可以方便地对一种算法进行bagging集成；

ppl("robustify"): 整合了多种数据的预处理方法，从而提供适合算法的输入数据。

1. Task 任务#

2. Learner 算法#

3. Measure 指标#

4. Resampling 交叉验证#

5. Benchmark 基准#

6. HPO超参数优化#

7. FS 特征选择#

8. Pipeline流程#