1
 2
 3
 4
 5
 6
 7
 8
 9
10
library(mlr3verse)
library(tidyverse)

# Browse the dictionaries of ready-made objects shipped with mlr3.
tsks() # predefined tasks (data sets)

lrns() # machine-learning algorithms (learners)

msrs() # performance measures

# Each dictionary can be converted to a table for easier browsing and
# filtering. as.data.table() needs an object to convert; calling it without
# one is an error.
as.data.table(tsks())

1. Task 任务

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# Calling tsk() without a key returns the dictionary of predefined tasks.
tsk()

# Predefined tasks
as.data.table(tsk())
tsk("mtcars")

# Custom tasks
tsk_mtcars = as_task_regr(mtcars, target = "mpg", id = "cars")
# `target` names the label column; `id` (optional) sets the task name.
# as_task_classif() is the classification counterpart and needs the same
# arguments (it errors when called with none), e.g. on the built-in iris data:
as_task_classif(iris, target = "Species", id = "iris")

# Task objects support many operations for inspecting and modifying the data;
# they are not all listed here.
# Two points deserve special attention:
# 1. row_ids are not the same as plain row positions. They are fixed once the
#    task is defined (think of them as row names), which makes later data
#    splitting convenient.
tsk_mtcars$row_ids

# 2. Use clone() whenever an independent copy of a task is needed.
tsk_mtcars_another = tsk_mtcars$clone()

对于分类任务基本类似。值得注意的是在二分类问题时,需要进一步指定阳性标签

1
2
3
4
5
# Load the Sonar data set from the mlbench package.
data(Sonar, package = "mlbench")
# Specify the positive class when creating the binary classification task:
tsk_classif = as_task_classif(Sonar, target = "Class", positive = "R")
tsk_classif$positive # inspect the current positive class
tsk_classif$positive = "M" # change the positive class

2. Learner 算法

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# The mlr3learners package supports ~10 common machine-learning algorithms;
# try these first when modelling.
# The mlr3extralearners package supports many more (>100) algorithms and has
# to be installed separately.

# 0) Overview of all available learners
learners_dt = as.data.table(lrns())
learners_dt
learners_dt %>%
  dplyr::filter(grepl("mlr3learners", packages)) %>%
  dplyr::filter(task_type == "classif")

# 1) Define a learner
lrn_rpart = lrn("regr.rpart")
# For classification, predict_type = "prob" / "response" can additionally be
# set to predict probabilities / class labels.

# 2) Simple split of the data into a training and a test set
tsk_mtcars = as_task_regr(mtcars, target = "mpg", id = "cars")
splits = partition(tsk_mtcars)
str(splits)

# 3) Train the learner and inspect the fitted model
lrn_rpart$train(tsk_mtcars, row_ids = splits$train)
lrn_rpart$model

# 4) Predict
prediction = lrn_rpart$predict(tsk_mtcars, row_ids = splits$test) # test set

# New data must provide every feature column of the task. As a stand-in for
# unseen observations, take the first three cars with the target removed.
mtcars_new = head(mtcars[, setdiff(names(mtcars), "mpg")], 3)
prediction2 = lrn_rpart$predict_newdata(mtcars_new) # new data

关于模型的超参数:

  • 上述训练示例,使用了算法的默认超参数;
  • 也可以在训练前自定义设置一组超参数
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Inspect and set the hyperparameters of a learner.
lrn_rpart = lrn("regr.rpart")
lrn_rpart$param_set # hyperparameters supported by the learner
lrn_rpart$param_set$values # hyperparameter values currently set

# Three ways of setting hyperparameters:
# (a) at construction time
lrn_rpart = lrn("regr.rpart", maxdepth = 1)
# (b) by assigning a single value on the existing learner
lrn_rpart$param_set$values$maxdepth = 2
# (c) by setting several values at once
lrn_rpart$param_set$set_values(xval = 2, cp = 0.5)

3. Measure 指标

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# The mlr3measures package provides the common evaluation measures for
# classification and regression tasks.

# 0) Overview of all measures, narrowed down to classification measures
#    that come from mlr3measures
measures_dt = as.data.table(msrs())
measures_dt %>%
  dplyr::filter(
    grepl("mlr3measures", packages),
    task_type == "classif"
  )


# 1) Recap: task, learner, split, training and test-set prediction
tsk_mtcars = tsk("mtcars")
lrn_rpart = lrn("regr.rpart")
splits = partition(tsk_mtcars)
lrn_rpart$train(tsk_mtcars, row_ids = splits$train)
prediction = lrn_rpart$predict(tsk_mtcars, row_ids = splits$test)

# 2) Pick a measure
measure = msr("regr.mae")
measure

# 3) Score the prediction with it
prediction$score(measure)

# 4) Several measures can be scored at once. Note that this needs msrs()
#    (plural) rather than msr().
measures = msrs(c("regr.mse", "regr.mae"))
prediction$score(measures)

Tip:对于分类任务

  1. 相关指标,有的是适用于"prob"(概率值),有的是适用于"response"(标签结果)。

  2. 在评价前,可以修改默认的分类阈值,并查看相关的混淆矩阵

    1
    2
    
    # Change the classification threshold, then inspect the confusion matrix.
    # NOTE(review): presumably `prediction` has to be a classification
    # prediction made with predict_type = "prob" for this to work - confirm.
    prediction$set_threshold(0.7)
    prediction$confusion
    

4. Resampling 交叉验证

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
# List the supported resampling strategies.
resampling_dt = as.data.table(rsmps())

# 1) Define the resampling strategy
# three-fold CV
cv3 = rsmp("cv", folds = 3)
# 2-repeats 5-fold CV
rcv25 = rsmp("repeated_cv", repeats = 2, folds = 5)

# 2) Define the task and the learner
tsk_mtcars = tsk("mtcars")
lrn_rpart = lrn("regr.rpart")

# 3) Run the resampling
rr = resample(tsk_mtcars, lrn_rpart, cv3)
rr

rr$score(msr("regr.mse"))     # measure value for each fold
rr$aggregate(msr("regr.mse")) # aggregated measure value

rr$predictions()  # predictions on the validation set, one per fold
rr$prediction()   # all validation-set predictions combined

Tips:

  1. 在执行交叉验证前,可提前查看每折的样本分割方式

    1
    2
    3
    4
    
    # Fix the train/test splits for this task before running resample().
    cv3$instantiate(tsk_mtcars)
    # Row ids of the training and the test set of the first split
    cv3$train_set(1)
    cv3$test_set(1)
    
  2. 在resample()分析结果默认不保存每折的训练模型,可通过设置参数进行保存

    1
    2
    
    # Keep the model fitted in every fold (spelled TRUE rather than T, because
    # T is an ordinary variable that can be reassigned).
    rr = resample(tsk_mtcars, lrn_rpart, cv3, store_models = TRUE)
    # Model fitted in the first fold
    rr$learners[[1]]$model
    

特殊CV:由于数据样本的特殊性,在样本分割时可进行额外的设置

1
2
3
4
5
6
7
# Use the `year` column of the penguins task as the grouping variable.
tsk_grp = tsk("penguins")
tsk_grp$set_col_roles("year", "group")

# Leave-one-out: per the original author, with a group role set, each
# iteration leaves out the samples of one group as the test set.
rsmp_loo = rsmp("loo")
rsmp_loo$instantiate(tsk_grp)
# Distribution of `year` in the training set of the first iteration
table(tsk_grp$data(rows = rsmp_loo$train_set(1), cols = "year"))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Stratified resampling: give `species` both the 'target' and the 'stratum'
# column role, then instantiate the three-fold CV on that task.
tsk_str = tsk("penguins")
tsk_str$set_col_roles("species", c("target", "stratum"))
cv3$instantiate(tsk_str)

# Relative frequency of each species in the test set of the given fold.
species_share = function(fold) {
  test_rows = cv3$test_set(fold)
  prop.table(table(tsk_str$data(rows = test_rows, cols = "species")))
}

fold1 = species_share(1)
fold2 = species_share(2)
rbind("Fold 1" = fold1, "Fold 2" = fold2)

5. Benchmark 基准

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
# 1) Set up the benchmark experiment: every combination of
#    data set x algorithm x resampling strategy
tasks = list(tsk("penguins"), tsk("sonar"))
learners = list(lrn("classif.featureless"), lrn("classif.rpart"))
resamplings = list(rsmp("cv"), rsmp("subsampling"))

# Seed the RNG so that the resampling instantiation is reproducible
set.seed(123)
design = benchmark_grid(
  tasks = tasks,
  learners = learners,
  resamplings = resamplings
)
design

# 2) Run the benchmark and evaluate it with classification accuracy
bmr = benchmark(design)
bmr

bmr$score(msr("classif.acc"))
bmr$aggregate(msr("classif.acc"))

6. HPO超参数优化

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
# 1) Define the hyperparameter search space of the learner
# Hyperparameters supported by the learner
lrn("classif.svm")$param_set
# Use to_tune() to mark a hyperparameter for tuning and give its candidates
learner = lrn("classif.svm",
              type  = "C-classification",
              # a character vector lists the candidate values
              kernel = to_tune(c("radial", "linear")),
              # a range spanning several orders of magnitude is sampled on the
              # log scale (TRUE rather than T, which can be reassigned)
              cost  = to_tune(1e-1, 1e5, logscale = TRUE),
              # numeric hyperparameters can also be given explicit candidates
              gamma = to_tune(c(0.1, 0.2, 0.5, 1))
)
learner


# 2) Termination criteria, see https://mlr-org.com/terminators.html
as.data.table(trm())
# For example
trm("none")                  # no limit
trm("evals", n_evals = 5)    # stop after 5 evaluations
trm("run_time", secs = 1800) # stop after a given run time (seconds)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
# 3) Create a tuning instance
tsk_sonar = tsk("sonar")

# cost and gamma are tuned; kernel and type are fixed.
# logscale is spelled TRUE rather than T, because T can be reassigned.
learner = lrn("classif.svm",
  cost  = to_tune(1e-1, 1e5, logscale = TRUE),
  gamma = to_tune(1e-1, 1),
  kernel = "radial",
  type = "C-classification"
)

instance = ti(
  task = tsk_sonar,                        # data task
  learner = learner,                       # learner (carries the search space)
  resampling = rsmp("cv", folds = 3),      # resampling strategy
  measures = msr("classif.ce"),            # performance measure
  terminator = trm("none")                 # termination criterion
)

instance

Tips:

  1. 也可以ps()设置超参数空间,提供给ti()的search_space参数,详细参看https://mlr3book.mlr-org.com/chapters/chapter4/hyperparameter_optimization.html#sec-tune-ps
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# Explicit search space, equivalent to
#   cost  = to_tune(1e-1, 1e5, logscale = TRUE)
#   gamma = to_tune(1e-1, 1)
# The tuner samples cost between `lower` and `upper` and only then applies
# `trafo`, so the bounds have to be given on the log scale. With the raw
# bounds, exp() would be applied to values up to 1e5 and overflow to Inf.
search_space = ps(
  cost  = p_dbl(lower = log(1e-1), upper = log(1e5),
                trafo = function(x) exp(x)),
  gamma = p_dbl(lower = 0.1, upper = 1)
)

# The learner only carries the fixed hyperparameters; the tunable ones come
# from `search_space`.
instance = ti(
  task = tsk_sonar,
  learner = lrn("classif.svm", kernel = "radial", type = "C-classification"),
  resampling = rsmp("cv", folds = 3),
  measures = msr("classif.ce"),
  terminator = trm("none"),
  search_space = search_space
)
  2. mlr3团队基于以往研究,收集了常见机器学习模型的常用超参数设置,可供用户直接使用
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
# mlr3tuningspaces ships predefined search spaces for common learners.
library(mlr3tuningspaces)
# Overview of all predefined tuning spaces
as.data.table(mlr_tuning_spaces)

# Retrieve the default tuning space for the SVM classifier
lts_svm = lts("classif.svm.default")
lts_svm

# Pass the predefined tuning space as the search space of the instance
instance = ti(
  task = tsk_sonar,                        
  learner = lrn("classif.svm",type = "C-classification"),                      
  resampling = rsmp("cv", folds = 3),     
  measures = msr("classif.ce"),            
  terminator = trm("none"),
  search_space = lts_svm
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# 4) Choose the search strategy (tuner), see https://mlr-org.com/tuners.html
tnr()
as.data.table(tnr())
# For example
tnr("random_search")  # random search
tnr("grid_search")    # Cartesian product of candidate values
tnr("mbo")            # Bayesian optimisation (presumably needs mlr3mbo)
# User-defined design points: a table with one column per tuned
# hyperparameter and one row per configuration to try. Values are given on
# the search-space scale (cost is tuned on the log scale above).
design_points = data.table::data.table(
  cost  = log(c(0.1, 10, 1000)),
  gamma = c(0.1, 0.5, 1)
)
tnr("design_points", design = design_points)

# Pick one strategy to tune the cost/gamma example above.
# `resolution` is the number of grid values taken from each continuous
# hyperparameter range; `param_resolutions` sets it per hyperparameter.
# `batch_size` is the number of configurations evaluated per batch. It is not
# a thread count by itself; parallel execution is configured separately.
tuner = tnr("grid_search", resolution = 5, batch_size = 10) # 5 x 5 grid
# Alternative: 5 values for cost x 3 values for gamma
tuner = tnr("grid_search", param_resolutions = c(cost = 5, gamma = 3),
            batch_size = 10)
tuner
tuner$optimize(instance)

instance$result_learner_param_vals # hyperparameters of the best configuration
instance$archive                   # all evaluated configurations


# 5) Fit the final model with the best hyperparameters
lrn_svm_tuned = lrn("classif.svm")
lrn_svm_tuned$param_set$values = instance$result_learner_param_vals
lrn_svm_tuned$train(tsk_sonar)$model   # trained on all data

嵌套交叉验证(Nested-CV):Nested resampling is a method to compare models and to estimate the generalization performance of a tuned model, however, this is the performance based on multiple different configurations (one from each outer fold) and not performance based on a single configuration.

其直观、清晰的步骤解释可参看这张图的介绍:https://mlr3book.mlr-org.com/chapters/chapter4/Figures/mlr3book_figures-11.svg

  • 首先,需要介绍auto_tuner()。它将HPO过程包装为一个类似learner的对象
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
# Wrap tuner, learner, inner resampling, measure and search space into one
# object that is trained and used for prediction like a learner.
at = auto_tuner(
  tuner = tnr("grid_search", resolution = 3, batch_size = 5),
  learner = lrn("classif.svm",type = "C-classification"),
  resampling = rsmp("cv", folds = 3),
  measure = msr("classif.ce"),
  search_space = lts("classif.svm.default")
)
at

split = partition(tsk_sonar)
at$train(tsk_sonar, row_ids = split$train)
# Per the original author: training first finds the best hyperparameters by
# cross-validation on the training set,
# then fits a model on the whole training set using them.

at$predict(tsk_sonar, row_ids = split$test)$score() # evaluate on the test set
  • 然后,Nested CV本质可以理解为对上述示例,再次进行外部的交叉验证
1
2
3
4
5
6
7
8
9
# Outer resampling (2-fold CV) around the auto-tuner `at`; models are stored
# so that the inner tuning results can be extracted afterwards.
rr = resample(tsk_sonar, at, rsmp("cv", folds = 2), store_models = TRUE)

# Results of the outer resampling
rr$score(msr("classif.ce"))
rr$aggregate(msr("classif.ce"))

# Results of the inner resampling (tuning)
extract_inner_tuning_results(rr)
extract_inner_tuning_archives(rr)

7. FS 特征选择

思路1:基于mlr3filters包,对每个特征进行单独打分,再设置阈值筛选

  • 所有支持的打分方法有:https://mlr3filters.mlr-org.com/
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# 1) Define the task
tsk_pen = tsk("penguins")
# 2) Choose the scoring method (filter)
flt_gain = flt("information_gain")
# flt_cor = flt("correlation", method = "spearman")
# 3) Compute the scores
flt_gain$calculate(tsk_pen)
as.data.table(flt_gain)    

# Finally choose a threshold (on the score itself or on the score rank) and
# keep the features that pass it.

此外,有两个依赖于特定算法的打分指标

"importance": If only a single filter method is to be used, the authors recommend to use a feature importance filter using random forest permutation importance

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
# "importance" is the recommended filter; it has to be combined with a
# learner that supports it (random forest is the usual recommendation).
# vapply() pins the result to one logical per learner, so the row filter
# cannot silently change type.
as.data.table(mlr_learners)[
  vapply(properties, function(x) "importance" %in% x, logical(1))]
lrn("classif.ranger")$param_set$levels$importance
# [1] "none"               "impurity"           "impurity_corrected" "permutation"

lrn_ranger = lrn("classif.ranger", importance = "permutation")
# The ranger learner does not have the "missing" property, i.e. it cannot
# handle missing values, so incomplete rows are removed by hand first.
tsk_pen = tsk("penguins")
tsk_pen$filter(tsk_pen$row_ids[complete.cases(tsk_pen$data())])

flt_importance = flt("importance", learner = lrn_ranger)
flt_importance$calculate(tsk_pen)
as.data.table(flt_importance)

"selected_features":对于部分ML模型(例如决策树),其在建模过程中仅会使用部分特征,可以作为特征选择的依据。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Learners that support the "selected_features" property.
# vapply() pins the result to one logical per learner.
as.data.table(mlr_learners)[
  vapply(properties, function(x) "selected_features" %in% x, logical(1))]

# Example: score features by whether a decision tree selects them
tsk_pen = tsk("penguins")
lrn_rpart = lrn("classif.rpart")
flt_selected = flt("selected_features", learner = lrn_rpart)
flt_selected$calculate(tsk_pen)
as.data.table(flt_selected)

思路2:基于mlr3fselect包,直接遍历、搜索最优的特征集合(类似于HPO过程)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
# 1) Define the feature-selection instance
# The argument is `measures`; spelling it `measure` only worked through R's
# partial argument matching.
instance = fsi(
  task = tsk("penguins"),
  learner = lrn("classif.rpart"),
  resampling = rsmp("cv", folds = 3),
  measures = msr("classif.acc"),
  terminator = trm("evals", n_evals = 20) # termination criterion
)
# 2) Choose the search algorithm
# as.data.table(fs())
fselector = fs("random_search")
# fselector = fs("genetic_search")

# 3) Run the search
fselector$optimize(instance)

instance$result_feature_set     # best feature set found
as.data.table(instance$archive) # search history

auto_fselector() 支持基于嵌套交叉验证的特征集合选择,详见 https://mlr3book.mlr-org.com/chapters/chapter6/feature_selection.html#sec-autofselect

8. Pipeline流程

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
# Instantiate a PipeOp
po_pca = po("pca", center = TRUE)
po_pca

# Input data: two features of the task, first five rows, wrapped in a list
tsk_small = tsk("penguins_simple")$select(c("bill_depth", "bill_length"))
poin = list(tsk_small$clone()$filter(1:5))

# Train the PipeOp
poout = po_pca$train(poin) # poin: Task in a list
poout # list with a single element 'output'
poout$output$head()

# Predict with the trained PipeOp on a single row
tsk_onepenguin = tsk_small$clone()$filter(42)
poin = list(tsk_onepenguin) # list
poout = po_pca$predict(poin)
poout[[1]]$data()

Tips:模块的训练/预测数据格式都需要为list对象

  • 然后:Graph可以将多个模块连接为一个"图",进行整体的操作
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
# 1) Define two independent PipeOps
po_mutate = po("mutate",
               mutation = list(bill_ratio = ~bill_length / bill_depth)
)
po_scale = po("scale")

# 2) Connect them into a Graph with %>>%
graph = po_mutate %>>% po_scale
graph
graph$plot(horizontal = TRUE)
graph$pipeops

# 3) Train / predict with the Graph as a whole
res_train = graph$train(tsk_small)
res_train

tsk_onepenguin = tsk_small$clone()$filter(42)
res_predict = graph$predict(tsk_onepenguin)
res_predict[[1]]$head()
  • 最常用的是将数据预处理与建模过程进行结合,再转为GraphLearner对象
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# 1) Connect PipeOps (including a learner) into a Graph
lrn_logreg = lrn("classif.log_reg")
graph = po("imputesample") %>>% lrn_logreg
# graph = po("imputesample") %>>% po("learner", lrn_logreg)
graph$plot(horizontal = TRUE)

# 2) Convert the Graph into a GraphLearner
glrn_sample = as_learner(graph)
glrn_mode = as_learner(po("imputemode") %>>% lrn_logreg)

# 3) Use it like any other learner, e.g. for resampling or benchmarking
design = benchmark_grid(tsk("pima"), list(glrn_sample, glrn_mode),
                        rsmp("cv", folds = 3))
bmr = benchmark(design)
aggr = bmr$aggregate()[, .(learner_id, classif.ce)]
aggr

Tips: https://mlr3book.mlr-org.com/chapters/chapter8/non-sequential_pipelines_and_tuning.html

(1) 上述介绍的graph,多为sequential类型(一条直线)。此外也可以通过gunion()创建non-sequential类型的Pipeline。

(2) 此外,也内置了许多预定义好的Pipeline(ppl()),以方便使用。例如:

  • ppl("bagging", graph): 可以方便地对一种算法进行bagging集成;
  • ppl("robustify"): 整合了多种数据的预处理方法,从而提供适合算法的输入数据。