1
2
3
4
5
6
7
8
9
10
|
library(mlr3verse)
library(tidyverse)
# Quick overview of what ships with the mlr3 ecosystem.
tsks() # predefined tasks (datasets)
lrns() # machine-learning algorithms
msrs() # performance measures
# as.data.table() needs an object to convert; calling it with no argument
# fails. Pass one of the listings above to get a searchable table.
as.data.table(tsks())
|
1. Task 任务#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
# Predefined tasks
tsk()
as.data.table(tsk())
tsk("mtcars")

# Custom tasks
# `target` names the label column; `id` (optional) sets the task name.
tsk_mtcars <- as_task_regr(mtcars, target = "mpg", id = "cars")
# The classification counterpart takes the same kind of arguments; it cannot
# be called empty, so a concrete example on the built-in iris data is shown.
as_task_classif(iris, target = "Species", id = "iris_custom")

# Tasks support inspecting and modifying their data; not every operation is
# listed here. Two points deserve special attention:
# (1) row_ids are not plain row positions. Once the task is defined they stay
#     fixed (think of them as row names), which makes later splitting easier.
tsk_mtcars$row_ids
# (2) Use clone() when an independent copy of a task is needed.
tsk_mtcars_another <- tsk_mtcars$clone()
|
对于分类任务基本类似。值得注意的是在二分类问题时,需要进一步指定阳性标签
1
2
3
4
5
|
# Load the Sonar data set shipped with the mlbench package.
data(Sonar, package = "mlbench")

# For a binary problem, state which label is the positive class.
tsk_classif <- as_task_classif(
  Sonar,
  target = "Class",
  positive = "R"
)

# Inspect the positive class ...
tsk_classif$positive
# ... and change it afterwards.
tsk_classif$positive <- "M"
|
2. Learner 算法#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
# mlr3learners provides roughly ten common learners; try these first.
# mlr3extralearners adds many more (>100) and must be installed separately.

# 0) Overview of all registered learners
learners_dt <- as.data.table(lrns())
learners_dt
learners_dt %>%
  dplyr::filter(grepl("mlr3learners", packages)) %>%
  dplyr::filter(task_type == "classif")

# 1) Construct a learner
lrn_rpart <- lrn("regr.rpart")
# For classification, predict_type = "prob" / "response" additionally selects
# predicted probabilities / predicted labels.

# 2) Simple split into training and test sets
tsk_mtcars <- as_task_regr(mtcars, target = "mpg", id = "cars")
splits <- partition(tsk_mtcars)
str(splits)

# 3) Train and inspect the model
lrn_rpart$train(tsk_mtcars, row_ids = splits$train)
lrn_rpart$model

# 4) Predict
prediction <- lrn_rpart$predict(tsk_mtcars, row_ids = splits$test) # test set
# New observations must be supplied as a data frame holding the task's feature
# columns (every mtcars column except the target mpg). The object was never
# defined in the original snippet, so it is created here.
mtcars_new <- data.frame(
  cyl = c(5, 6), disp = c(100, 120), hp = c(100, 150),
  drat = c(4, 3.9), wt = c(3.8, 4.1), qsec = c(18, 19.5),
  vs = c(1, 0), am = c(1, 1), gear = c(6, 4), carb = c(3, 5)
)
prediction2 <- lrn_rpart$predict_newdata(mtcars_new) # new data
|
关于模型的超参数:
- 上述训练示例,使用了算法的默认超参数;
- 也可以在训练前自定义设置一组超参数
1
2
3
4
5
6
7
8
9
10
|
lrn_rpart <- lrn("regr.rpart")

# Which hyperparameters does the learner support?
lrn_rpart$param_set
# Which values are currently set?
lrn_rpart$param_set$values

# Three ways to set hyperparameters:
# (a) at construction time
lrn_rpart <- lrn("regr.rpart", maxdepth = 1)
# (b) by assigning a single value
lrn_rpart$param_set$values$maxdepth <- 2
# (c) by setting several values in one call
lrn_rpart$param_set$set_values(xval = 2, cp = 0.5)
|
3. Measure 指标#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
# mlr3measures supplies the common measures for classification and regression.

# 0) List all measures
measures_dt <- as.data.table(msrs())
measures_dt %>%
  dplyr::filter(grepl("mlr3measures", packages)) %>%
  dplyr::filter(task_type == "classif")

# 1) Recap: train a learner and predict on the held-out rows
lrn_rpart <- lrn("regr.rpart")
tsk_mtcars <- tsk("mtcars")
splits <- partition(tsk_mtcars)
lrn_rpart$train(tsk_mtcars, splits$train)
prediction <- lrn_rpart$predict(tsk_mtcars, splits$test)

# 2) Pick a measure
measure <- msr("regr.mae")
measure

# 3) Score the prediction
prediction$score(measure)

# 4) Several measures at once: note that this needs msrs(), not msr()
measures <- msrs(c("regr.mse", "regr.mae"))
prediction$score(measures)
|
Tip:对于分类任务
-
相关指标,有的是适用于"prob"(概率值),有的是适用于"response"(标签结果)。
-
在评价前,可以修改默认的分类阈值,并查看相关的混淆矩阵
1
2
|
# Thresholds and confusion matrices only exist for classification predictions
# with predicted probabilities, so build such a prediction first (the
# `prediction` object from the regression example cannot be used here).
tsk_sonar_thr <- tsk("sonar")
lrn_prob <- lrn("classif.rpart", predict_type = "prob")
splits_thr <- partition(tsk_sonar_thr)
lrn_prob$train(tsk_sonar_thr, row_ids = splits_thr$train)
prediction_classif <- lrn_prob$predict(tsk_sonar_thr, row_ids = splits_thr$test)

# Change the classification threshold, then inspect the confusion matrix.
prediction_classif$set_threshold(0.7)
prediction_classif$confusion
|
4. Resampling 交叉验证#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
|
# List the supported resampling strategies.
resampling_dt <- as.data.table(rsmps())

# 1) Define resampling strategies
cv3 <- rsmp("cv", folds = 3) # three-fold CV
rcv25 <- rsmp("repeated_cv", repeats = 2, folds = 5) # 2-repeats 5-fold CV

# 2) Define task and learner
tsk_mtcars <- tsk("mtcars")
lrn_rpart <- lrn("regr.rpart")

# 3) Run the resampling
rr <- resample(tsk_mtcars, lrn_rpart, cv3)
rr
# Score of each fold
rr$score(msr("regr.mse"))
# Aggregated score
rr$aggregate(msr("regr.mse"))
# Predictions on the validation set, one object per fold
rr$predictions()
# Predictions on the validation sets, combined into one object
rr$prediction()
|
Tips:
-
在执行交叉验证前,可提前查看每折的样本分割方式
1
2
3
4
|
# Fix the fold assignment for this task so the splits can be inspected.
cv3$instantiate(tsk_mtcars)

# Row ids of the training and test set of the first split
cv3$train_set(1)
cv3$test_set(1)
|
-
在resample()分析结果默认不保存每折的训练模型,可通过设置参数进行保存
1
2
|
# Keep the model fitted in every fold (spelled TRUE rather than the
# reassignable shorthand T).
rr <- resample(tsk_mtcars, lrn_rpart, cv3, store_models = TRUE)
# Model trained in the first fold
rr$learners[[1]]$model
|
特殊CV:由于数据样本的特殊性,在样本分割时可进行额外的设置
1
2
3
4
5
6
7
|
# Use the `year` column of the penguins task as the grouping variable.
tsk_grp <- tsk("penguins")
tsk_grp$set_col_roles("year", "group")

# Leave-one-out: with a group role set, one whole group is held out as the
# test set each time.
rsmp_loo <- rsmp("loo")
rsmp_loo$instantiate(tsk_grp)

# Years present in the training set of the first split
train_years <- tsk_grp$data(rows = rsmp_loo$train_set(1), cols = "year")
table(train_years)
|
1
2
3
4
5
6
7
8
9
10
|
tsk_str <- tsk("penguins")
# Give `species` both the 'target' and the 'stratum' column role.
tsk_str$set_col_roles("species", c("target", "stratum"))
cv3$instantiate(tsk_str)

# Class proportions in the test sets of the first two folds
species_fold1 <- tsk_str$data(rows = cv3$test_set(1), cols = "species")
species_fold2 <- tsk_str$data(rows = cv3$test_set(2), cols = "species")
fold1 <- prop.table(table(species_fold1))
fold2 <- prop.table(table(species_fold2))
rbind("Fold 1" = fold1, "Fold 2" = fold2)
|
5. Benchmark 基准#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
# 1) Design the benchmark experiment
# Different data sets
tasks <- list(tsk("penguins"), tsk("sonar"))
# Different learners
learners <- list(lrn("classif.featureless"), lrn("classif.rpart"))
# Different resampling strategies
resamplings <- list(rsmp("cv"), rsmp("subsampling"))

# Set a seed to ensure reproducibility of the resampling instantiation
set.seed(123)
design <- benchmark_grid(tasks, learners, resamplings)
design

# 2) Run it
bmr <- benchmark(design)
bmr
bmr$score(msr("classif.acc"))
bmr$aggregate(msr("classif.acc"))
|
6. HPO超参数优化#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
# 1) Define the hyperparameter search space of a learner
# Hyperparameters the learner supports
lrn("classif.svm")$param_set
# Mark candidates with to_tune()
learner <- lrn("classif.svm",
  type = "C-classification",
  # a character vector lists the candidate levels
  kernel = to_tune(c("radial", "linear")),
  # range spanning several orders of magnitude: sample on the log scale
  # (TRUE spelled out; the shorthand T can be reassigned)
  cost = to_tune(1e-1, 1e5, logscale = TRUE),
  # numeric parameters can also be given as an explicit set of candidates
  gamma = to_tune(c(0.1, 0.2, 0.5, 1))
)
learner

# 2) Stopping criteria; details at https://mlr-org.com/terminators.html
as.data.table(trm())
# Examples
trm("none") # no limit
trm("evals", n_evals = 5) # stop after 5 evaluations
trm("run_time", secs = 1800) # stop after a run-time budget (seconds)
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
# 3) Create a tuning instance
tsk_sonar <- tsk("sonar")
learner <- lrn("classif.svm",
  # TRUE spelled out; the shorthand T can be reassigned
  cost = to_tune(1e-1, 1e5, logscale = TRUE),
  gamma = to_tune(1e-1, 1),
  kernel = "radial",
  type = "C-classification"
)
instance <- ti(
  task = tsk_sonar, # data task
  learner = learner, # learner carrying the search space
  resampling = rsmp("cv", folds = 3), # resampling strategy
  measures = msr("classif.ce"), # performance measure
  terminator = trm("none") # stopping criterion
)
instance
|
Tips:
- 也可以
ps()
设置超参数空间,提供给ti()
的search_space参数,详细参看https://mlr3book.mlr-org.com/chapters/chapter4/hyperparameter_optimization.html#sec-tune-ps
1
2
3
4
5
6
7
8
9
10
11
12
13
|
# Explicit search space built with ps().
# The trafo is applied AFTER sampling, so the bounds must be given on the log
# scale: sampling in [log(0.1), log(1e5)] and exponentiating yields cost values
# in [0.1, 1e5]. With the raw bounds the transformed cost would reach
# exp(1e5), which overflows to Inf.
search_space <- ps(
  cost = p_dbl(lower = log(1e-1), upper = log(1e5), trafo = function(x) exp(x)),
  gamma = p_dbl(lower = 0.1, upper = 1)
)
instance <- ti(
  task = tsk_sonar,
  learner = lrn("classif.svm", kernel = "radial", type = "C-classification"),
  resampling = rsmp("cv", folds = 3),
  measures = msr("classif.ce"),
  terminator = trm("none"),
  search_space = search_space
)
|
- mlr3团队基于以往研究,收集了常见机器学习模型的常用超参数设置,可供用户直接使用
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
library(mlr3tuningspaces)

# Predefined tuning spaces collected by the mlr3 team
as.data.table(mlr_tuning_spaces)

# Default tuning space for the SVM classifier
lts_svm <- lts("classif.svm.default")
lts_svm

# Hand the predefined space to the tuning instance.
svm_base <- lrn("classif.svm", type = "C-classification")
instance <- ti(
  task = tsk_sonar,
  learner = svm_base,
  resampling = rsmp("cv", folds = 3),
  measures = msr("classif.ce"),
  terminator = trm("none"),
  search_space = lts_svm
)
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
# 4) Choose a search strategy; see https://mlr-org.com/tuners.html
tnr()
as.data.table(tnr())
# Examples
tnr("random_search") # random search
tnr("grid_search") # Cartesian grid
tnr("mbo") # Bayesian optimization
# User-defined candidates: `design` must be a table with one configuration per
# row and one column per tuned hyperparameter, on the search-space scale
# (cost is tuned on the log scale in the instance defined with to_tune()).
# The benchmark grid named `design` from the benchmark section is not valid.
design_points <- as.data.table(
  data.frame(cost = log(c(0.1, 10)), gamma = c(0.1, 0.5))
)
tnr("design_points", design = design_points)

# Pick one strategy and tune the example above.
# `resolution` is the number of values taken from each continuous range.
# NOTE(review): the resolutions below name only cost and gamma, so `instance`
# is presumably the one whose search space holds exactly these two numeric
# parameters - confirm before running after the tuning-spaces example.
tuner <- tnr("grid_search", resolution = 5, batch_size = 10) # 5 x 5 grid
tuner <- tnr("grid_search",
  param_resolutions = c(cost = 5, gamma = 3),
  batch_size = 10
) # 5 (cost) x 3 (gamma) grid
# batch_size: number of configurations evaluated together in one batch
tuner
tuner$optimize(instance)
instance$result$learner_param_vals # hyperparameters of the best configuration
instance$archive # every evaluated configuration

# 5) Fit the final model with the best hyperparameters
lrn_svm_tuned <- lrn("classif.svm")
lrn_svm_tuned$param_set$values <- instance$result_learner_param_vals
lrn_svm_tuned$train(tsk_sonar)$model # trained on all data
|
嵌套交叉验证(Nested-CV):Nested resampling is a method to compare models and to estimate the generalization performance of a tuned model, however, this is the performance based on multiple different configurations (one from each outer fold) and not performance based on a single configuration.
其直观、清晰的步骤解释可参看这张图的介绍:https://mlr3book.mlr-org.com/chapters/chapter4/Figures/mlr3book_figures-11.svg
- 首先,需要介绍
auto_tuner()
。它将HPO过程包装为一个类似learner的对象
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
# Wrap the whole tuning procedure into a learner-like object.
at <- auto_tuner(
  tuner = tnr("grid_search", resolution = 3, batch_size = 5),
  learner = lrn("classif.svm", type = "C-classification"),
  resampling = rsmp("cv", folds = 3),
  measure = msr("classif.ce"),
  search_space = lts("classif.svm.default")
)
at

split <- partition(tsk_sonar)
# Training first searches the best hyperparameters by cross-validation on the
# training rows, then refits on all training rows with that configuration.
at$train(tsk_sonar, row_ids = split$train)

# Evaluate on the test rows.
test_prediction <- at$predict(tsk_sonar, row_ids = split$test)
test_prediction$score()
|
- 然后,Nested CV本质可以理解为对上述示例,再次进行外部的交叉验证
1
2
3
4
5
6
7
8
9
|
# Outer resampling around the auto-tuner gives nested cross-validation.
outer_cv <- rsmp("cv", folds = 2)
rr <- resample(tsk_sonar, at, outer_cv, store_models = TRUE)

# Results of the outer resampling
rr$score(msr("classif.ce"))
rr$aggregate(msr("classif.ce"))

# Results of the inner (tuning) resampling
extract_inner_tuning_results(rr)
extract_inner_tuning_archives(rr)
|
7. FS 特征选择#
思路1:基于mlr3filters包,对每个特征进行单独打分,再设置阈值筛选
- 所有支持的打分方法有:https://mlr3filters.mlr-org.com/
1
2
3
4
5
6
7
8
9
10
|
# 1) Define the task
tsk_pen <- tsk("penguins")

# 2) Choose a scoring filter
flt_gain <- flt("information_gain")
# flt_cor <- flt("correlation", method = "spearman")

# 3) Compute the scores
flt_gain$calculate(tsk_pen)
as.data.table(flt_gain)

# Finally pick a threshold (on the score itself or on the score rank) and keep
# the features that pass it.
|
此外,有两个依赖于特定算法的打分指标
“importance”: If only a single filter method is to be used, the authors recommend to use a feature importance filter using random forest permutation importance
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# "importance" is a recommended score; it needs a learner that supports it
# (random forest is the usual choice).
# vapply() guarantees a logical row mask regardless of the input.
as.data.table(mlr_learners)[
  vapply(properties, function(x) "importance" %in% x, logical(1))]
lrn("classif.ranger")$param_set$levels$importance
# [1] "none" "impurity" "impurity_corrected" "permutation"
lrn_ranger <- lrn("classif.ranger", importance = "permutation")

# ranger does not list "missing" among its properties, i.e. it cannot handle
# missing values, so incomplete rows are dropped by hand first.
tsk_pen <- tsk("penguins")
tsk_pen$filter(tsk_pen$row_ids[complete.cases(tsk_pen$data())])

flt_importance <- flt("importance", learner = lrn_ranger)
flt_importance$calculate(tsk_pen)
as.data.table(flt_importance)
|
“selected_features”:对于部分ML模型(例如决策树),其在建模过程中仅会选用部分特征,这些被选用的特征可以作为特征选择的依据。
1
2
3
4
5
6
7
8
9
10
|
# Learners that support the "selected_features" property
# (vapply() guarantees a logical row mask regardless of the input).
as.data.table(mlr_learners)[
  vapply(properties, function(x) "selected_features" %in% x, logical(1))]

# Example
tsk_pen <- tsk("penguins")
lrn_rpart <- lrn("classif.rpart")
flt_selected <- flt("selected_features", learner = lrn_rpart)
flt_selected$calculate(tsk_pen)
as.data.table(flt_selected)
|
思路2:基于mlr3fselect包,直接遍历、搜索最优的特征集合(类似于HPO过程)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
# 1) Define the feature-selection instance
# The argument is spelled `measures` in full (as in the ti() calls) instead of
# relying on partial matching of `measure`.
instance <- fsi(
  task = tsk("penguins"),
  learner = lrn("classif.rpart"),
  resampling = rsmp("cv", folds = 3),
  measures = msr("classif.acc"),
  terminator = trm("evals", n_evals = 20) # stopping criterion
)

# 2) Choose a search algorithm
# as.data.table(fs())
fselector <- fs("random_search")
# fselector <- fs("genetic_search")

# 3) Run the search
fselector$optimize(instance)
instance$result_feature_set # best feature set found
as.data.table(instance$archive) # search history
|
auto_fselector()
支持基于嵌套交叉验证的特征集合选择,详见 https://mlr3book.mlr-org.com/chapters/chapter6/feature_selection.html#sec-autofselect
8. Pipeline流程#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
# Instantiate a PipeOp
po_pca <- po("pca", center = TRUE)
po_pca

# Input data
tsk_small <- tsk("penguins_simple")$select(c("bill_depth", "bill_length"))
tsk_first5 <- tsk_small$clone()$filter(1:5)
poin <- list(tsk_first5)

# "Train" the PipeOp
poout <- po_pca$train(poin) # poin: Task in a list
poout # list with a single element 'output'
poout$output$head()

# "Predict" with the PipeOp
tsk_onepenguin <- tsk_small$clone()$filter(42)
poin <- list(tsk_onepenguin) # list
poout <- po_pca$predict(poin)
poout[[1]]$data()
|
Tips:模块的训练/预测数据格式都需要为list对象
- 然后:Graph可以将多个模块连接为一个“图”,进行整体的操作
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
# 1) Define two independent PipeOps
po_mutate <- po(
  "mutate",
  mutation = list(bill_ratio = ~bill_length / bill_depth)
)
po_scale <- po("scale")

# 2) Connect them into a graph with %>>%
graph <- po_mutate %>>% po_scale
graph
graph$plot(horizontal = TRUE)
graph$pipeops

# 3) Train / predict with the graph as a whole
res_train <- graph$train(tsk_small)
res_train
tsk_onepenguin <- tsk_small$clone()$filter(42)
res_predict <- graph$predict(tsk_onepenguin)
res_predict[[1]]$head()
|
- 最常用的是将数据预处理与建模过程进行结合,再转为GraphLearner对象
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
# 1) Connect PipeOps (including a learner) into a graph
lrn_logreg <- lrn("classif.log_reg")
graph <- po("imputesample") %>>% lrn_logreg
# graph <- po("imputesample") %>>% po("learner", lrn_logreg)
graph$plot(horizontal = TRUE)

# 2) Turn the Graph into a GraphLearner
glrn_sample <- as_learner(graph)
graph_mode <- po("imputemode") %>>% lrn_logreg
glrn_mode <- as_learner(graph_mode)

# 3) Use it like any learner, e.g. for resampling or benchmarking
design <- benchmark_grid(
  tsk("pima"),
  list(glrn_sample, glrn_mode),
  rsmp("cv", folds = 3)
)
bmr <- benchmark(design)
aggr <- bmr$aggregate()[, .(learner_id, classif.ce)]
aggr
|
Tips: https://mlr3book.mlr-org.com/chapters/chapter8/non-sequential_pipelines_and_tuning.html
(1) 上述介绍的graph,多为sequential类型(一条直线)。此外,也可以通过gunion()创建non-sequential类型的Pipeline。
(2)此外,也内置了许多预定义好的Pipeline(ppl()
),可供方便地使用。例如:
ppl("bagging", graph)
: 可以方便地对一种算法进行bagging集成;
ppl("robustify")
: 整合了多种数据的预处理方法,从而提供适合算法的输入数据。