library(mlr3verse)
library(tidyverse)

0. Example data

data(Iowa, package = "lasso2")
head(Iowa)
#   Year Rain0 Temp1 Rain1 Temp2 Rain2 Temp3 Rain3 Temp4 Yield
# 1 1930 17.75  60.2  5.83  69.0  1.49  77.9  2.42  74.4  34.0
# 2 1931 14.76  57.5  3.83  75.0  2.72  77.2  3.30  72.6  32.9
# 3 1932 27.99  62.3  5.17  72.0  3.12  75.8  7.10  72.2  43.0
# 4 1933 16.76  60.5  1.64  77.8  3.45  76.4  3.01  70.5  40.0
# 5 1934 11.36  69.5  3.49  77.2  3.85  79.7  2.84  73.4  23.0
# 6 1935 22.71  55.0  7.00  65.9  3.35  79.4  2.42  73.6  38.4

## column 10 (Yield) is the wheat yield;
## the remaining columns describe the weather at different stages of the growing season
task_regr = as_task_regr(Iowa, target = "Yield")
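# optional sanity check (not in the original notes): inspect the task object
task_regr                # prints the number of rows and features
task_regr$feature_names  # the nine predictors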

1. KNN

  • The workflow for solving classification tasks with KNN was covered in an earlier note, which can be referred to for the details;
  • Using KNN for regression is almost identical; only the last step changes: the prediction is the mean of the target values of the k nearest samples (rather than the majority class used in classification). A minimal sketch of this idea follows below.
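
A minimal base-R sketch of the idea (purely illustrative, not part of the mlr3 workflow; it treats the first row of Iowa as the "new" observation and uses k = 7):

# standardise the predictors, find the k nearest rows, and average their Yield
k     <- 7
x     <- scale(Iowa[, -10])                   # standardised predictors
x_new <- x[1, ]                               # pretend row 1 is a new sample
d     <- sqrt(rowSums(sweep(x, 2, x_new)^2))  # Euclidean distance to every row
idx   <- order(d)[seq_len(k)]                 # indices of the k closest rows
mean(Iowa$Yield[idx])                         # regression: mean, not mode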

1.1 Define the learning method

##(1) Define the learner
learner = lrn("regr.kknn")
learner$param_set
# the main hyperparameter is k
# scale defaults to TRUE, i.e. the predictors are standardised before distances are computed

##(2) Hyperparameters
search_space = ps(
  k = p_int(lower = 3, upper = 15)
)
design = data.frame(k = 3:15) %>% as.data.table()

##(3) Cross-validation and evaluation measure
resampling = rsmp("cv")
measure = msr("regr.mse")

1.2 Select the best hyperparameters

instance = TuningInstanceSingleCrit$new(
  task = task_regr,
  learner = learner,
  resampling = resampling,
  measure = measure,
  terminator = trm("none"),
  search_space = search_space
)
tuner = tnr("design_points", design = design)  
future::plan("multisession")
tuner$optimize(instance)

# tuning archive (one row per evaluated k)
as.data.table(instance$archive)[,1:2] %>% head()
#    k regr.mse
# 1: 3 98.39484
# 2: 4 95.16022
# 3: 5 94.21570
# 4: 6 93.94832
# 5: 7 93.65615
# 6: 8 93.95614

# best hyperparameter
instance$result_learner_param_vals 
#$k
#[1] 7

instance$result_y
# regr.mse 
# 93.65615
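
Optionally, the archive can be plotted to see how the cross-validated MSE changes with k (a small ggplot2 sketch; ggplot2 is already loaded via tidyverse, and the column names follow the archive output above):

as.data.table(instance$archive)[, 1:2] %>%
  ggplot(aes(x = k, y = regr.mse)) +
  geom_line() +
  geom_point()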

1.3 Train the final model

learner$param_set$values$k = instance$result_learner_param_vals$k
learner$train(task_regr)
learner$model
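
As a quick (and optimistic) sanity check, the trained learner can be scored on the training data itself; note this is an in-sample error, not the cross-validated estimate obtained above:

prediction = learner$predict(task_regr)
prediction$score(msr("regr.mse"))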

2. Random forest

  • A random forest, being an ensemble of many decision trees, can likewise be used for regression;

  • When decision trees are used for classification, the predictor chosen at each split node is the one that yields the largest Gini gain;

  • For regression, the split is instead chosen to minimise the residual sum of squares: for each candidate split, compute the RSS of the left and the right child, add them, and keep the split with the smallest total (see the formula and the small sketch below);

  • The remaining steps and hyperparameters are essentially the same as in the earlier random forest notes.

$$ S_{\text{split}} = \sum_{i\in \text{left}}(y_i-\hat y_{\text{left}})^2 + \sum_{i\in \text{right}}(y_i-\hat y_{\text{right}})^2 $$
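
A minimal sketch of this criterion (illustrative only; real tree implementations scan all predictors and cut points):

# summed residual sum of squares of the two children for one candidate split
split_rss <- function(x, y, cut) {
  left  <- y[x <= cut]
  right <- y[x >  cut]
  sum((left - mean(left))^2) + sum((right - mean(right))^2)
}
# e.g. splitting Yield on Rain0 at its median
split_rss(Iowa$Rain0, Iowa$Yield, median(Iowa$Rain0))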

2.1 Define the learning method

##(1) Define the learner
learner = lrn("regr.ranger")
learner$param_set

##(2) Hyperparameters
search_space = ps(
  num.trees = p_int(lower = 1, upper = 1000),
  mtry = p_int(lower = 1, upper = 9),
  min.node.size = p_int(lower = 1, upper = 5),
  max.depth = p_int(lower = 1, upper = 20)
)

design = expand.grid(num.trees=c(300, 500, 1000),
                     mtry=c(3, 6, 9),
                     min.node.size=c(1,2,3,4,5),
                     max.depth = c(3, 6, 9)) %>% as.data.table()


##(3) Cross-validation and evaluation measure
resampling = rsmp("cv")
measure = msr("regr.mse")

2.2 Select the best hyperparameters

## create the tuning instance
instance = TuningInstanceSingleCrit$new(
  task = task_regr,
  learner = learner,
  resampling = resampling,
  measure = measure,
  terminator = trm("none"),
  search_space = search_space
)
tuner = tnr("design_points", design = design)  

## optimise the hyperparameters
future::plan("multisession")
tuner$optimize(instance)
as.data.table(instance$archive)[,1:5]
#      num.trees mtry min.node.size max.depth regr.mse
#   1:       300    3             1         3 92.69975
#   2:       500    3             1         3 91.72947
#   3:      1000    3             1         3 92.48906
#   4:       300    6             1         3 74.97336
#   5:       500    6             1         3 76.96392
#  ---                                                
# 210:       500    6             5         9 69.62499
# 211:      1000    6             5         9 71.07244
# 212:       300    9             5         9 63.80304
# 213:       500    9             5         9 63.28335
# 214:      1000    9             5         9 65.71797

instance$result_learner_param_vals
# $num.threads
# [1] 1

# $num.trees
# [1] 500

# $mtry
# [1] 9

# $min.node.size
# [1] 2

# $max.depth
# [1] 6

instance$result_y
#regr.mse 
#60.97301

2.3 Train the final model

learner$param_set$values = instance$result_learner_param_vals
learner$train(task_regr)
learner$model
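
Besides the cross-validated MSE above, the fitted ranger object also stores an out-of-bag error estimate (for regression this is an OOB MSE), which offers a cheap additional check:

learner$model$prediction.error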

3. XGBoost

  • Since the principles and workflow of XGBoost have not been covered yet (not even for classification), this section mainly records the code.

3.1 Define the learning method

##(1) Define the learner
learner = lrn("regr.xgboost")
learner$param_set

##(2) Hyperparameters
search_space = ps(
  eta = p_dbl(lower=0,upper=1),
  gamma = p_dbl(lower=1,upper=5),
  max_depth = p_int(lower = 1, upper = 10),
  min_child_weight = p_dbl(lower = 1, upper = 10),
  subsample = p_dbl(lower = 0.5, upper = 1),
  colsample_bytree = p_dbl(lower = 0.5, upper = 1),
  nrounds = p_int(lower = 1, upper = 50)
)
design = expand.grid(eta = c(0.01, 0.1, 0.5),
                     gamma = c(1, 3),
                     max_depth=c(4, 8),
                     min_child_weight = c(3, 6),
                     subsample = c(0.6, 0.9),
                     colsample_bytree = c(0.6, 0.9),
                     nrounds = c(10, 20),
                     stringsAsFactors = F) %>% as.data.table()
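# expand.grid crosses every candidate value, so this design
# evaluates 3*2*2*2*2*2*2 = 192 configurations
nrow(design)
# [1] 192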

##(3) Cross-validation and evaluation measure
resampling = rsmp("cv")
measure = msr("regr.mse")

3.2 Select the best hyperparameters

instance = TuningInstanceSingleCrit$new(
  task = task_regr,
  learner = learner,
  resampling = resampling,
  measure = measure,
  terminator = trm("none"),
  search_space = search_space
)
tuner = tnr("design_points", design = design) 

future::plan("multisession")
tuner$optimize(instance)

## best hyperparameters
instance$result_learner_param_vals
# $nrounds
# [1] 20

# $nthread
# [1] 1

# $verbose
# [1] 0

# $eta
# [1] 0.5

# $gamma
# [1] 1

# $max_depth
# [1] 8

# $min_child_weight
# [1] 3

# $subsample
# [1] 0.9

# $colsample_bytree
# [1] 0.9
               
instance$result_y
# regr.mse 
# 54.71813 

3.3 Train the final model

learner$param_set$values = instance$result_learner_param_vals
learner$train(task_regr)
learner$model
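
As a hedged follow-up (not in the original notes), the fitted booster stored in learner$model can also be inspected for feature importance via the xgboost package, which regr.xgboost already depends on:

xgboost::xgb.importance(model = learner$model)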