purrr包是tidyverse系列中用于函数编程的工具,很多用法类似R base的apply系列函数。现参考相关资料,学习其基本用法。
purrr包的核心是一系列map函数;如下所示,其参数主要有3部分
1
2
|
library(tidyverse)
map(.x, .f, ...)
|
`.x` 即迭代对象,支持3类:列表、向量、表格(每一列);
`.f` 即处理函数,有不同的表达形式;
`...` 表示设置 `.f` 的固定参数。
1. 迭代输入输出#
输入是list
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
# Build a named numeric vector of 12 random draws, then split it into a list
# of three vectors; each vector ends up with one element named A, B, C and D.
test = rnorm(12)
names(test) = rep(LETTERS[1:4],each=3)
test = split(test, rep(1:3,4))
# $`1`
# A B C D
# -1.0632186 -0.6765097 1.0856251 0.3742525
# $`2`
# A B C D
# -0.9695107 -2.4663956 -0.8396377 -1.1041372
# $`3`
# A B C D
# 0.05977038 1.54819774 1.66301988 -0.25377065
## For every vector in the list, select the element named "A"
map(test, function(x) x["A"])
# $`1`
# A
# -1.063219
# $`2`
# A
# -0.9695107
# $`3`
# A
# 0.05977038
## map() also accepts a name or a position as a shortcut extractor.
## NOTE: the shortcuts extract like `[[`, so they pick the same values as
## above but each result is the bare number without the inner "A" name.
map(test, 'A')
map(test, 1)
|
输入是表格
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
# A data frame with four columns (a-d) of 10 random normal draws each.
df = data.frame(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
# map() iterates over the columns of a data frame; na.rm=TRUE is a fixed
# extra argument forwarded to mean() on every call.
map(df, mean, na.rm=TRUE)
# $a
# [1] -0.6680202
# $b
# [1] 0.1260203
# $c
# [1] -0.1479432
# $d
# [1] -0.3777024
# The same column-wise iteration written with the pipe, here applying sd().
df %>% map(sd)
|
特定类型输出
如上,`map()` 默认输出为list。
- 对于输出结果是长度为1的情况,`map_lgl()` 用于输出逻辑型向量;`map_dbl()` 用于输出双精度型向量;`map_chr()` 用于输出字符型向量。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
# map_dbl() returns a named double vector instead of a list.
map_dbl(df, mean)
# a b c d
# -0.6680202 0.1260203 -0.1479432 -0.3777024
# map_chr() on numeric results still returns strings here, but triggers the
# deprecation warning reproduced below.
map_chr(df, mean)
# a b c d
# "-0.668020" "0.126020" "-0.147943" "-0.377702"
# Warning message:
# Automatic coercion from double to character was deprecated in purrr 1.0.0.
# ℹ Please use an explicit call to `as.character()` within `map_chr()` instead.
# map_lgl() fails outright: a number cannot be coerced to a logical vector.
map_lgl(df, mean)
# Error in `map_lgl()`:
# ℹ In index: 1.
# ℹ With name: a.
# Caused by error:
# ! Can't coerce from a number to a logical vector.
|
- 对于输出结果是等长的情况,`map_dfr()` 可按行合并为表格;`map_dfc()` 可按列合并为表格。(注:这两个函数在 purrr 1.0.0 中已被标记为 superseded,官方推荐改用 `map()` 配合 `list_rbind()` / `list_cbind()`。)
1
2
3
4
5
6
7
8
9
10
|
# quantile() returns five named values per column; map_dfr() stacks those
# results as rows, giving one row per column of df.
map_dfr(df, quantile)
# # A tibble: 4 × 5
# `0%` `25%` `50%` `75%` `100%`
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -1.72 -1.30 -0.534 -0.251 0.911
# 2 -1.07 -0.514 0.149 0.926 1.04
# 3 -1.28 -0.908 -0.104 0.503 1.01
# 4 -1.81 -0.962 -0.609 0.140 1.41
# map_dfc() binds the same per-column results side by side as columns.
map_dfc(df, quantile)
|
2. 函数表达方式#
单参数变量情况
1
2
3
4
5
6
7
8
9
10
11
|
# Two character vectors to collapse into single strings.
test = list(L1=c('A','B','C'),L2=c('X','Y','Z'))
## Form 1: define a named one-argument function and pass it to map()
my_fun <- function(x) paste(x, collapse = " | ")
map(test, my_fun)
## Form 2: pass the function itself; extra arguments are forwarded through ...
map(test, paste, collapse = " | ")
## Form 3: formula shorthand, where `.` stands for the current element
map(test, ~ paste(., collapse = " | "))
|
多参数变量情况
1
2
3
4
5
6
7
8
9
10
11
|
# Paired parameters: the i-th mean goes with the i-th standard deviation.
mu = list(5, 10, -3)
sigma = list(1, 5, 10)
## Form 1: iterate over the indices and look both parameters up by position
map(1:3, ~ rnorm(5, mu[[.]], sigma[[.]]))
## Form 2: map2() walks both lists in parallel; because n is given by name,
## the two iterated values fill rnorm()'s remaining arguments (mean, sd)
map2(mu, sigma, rnorm, n = 5)
## Form 3: formula shorthand, with .x and .y standing for the two inputs
map2(mu, sigma, ~ rnorm(mean=.x, sd=.y, n = 5))
|
1
2
3
4
5
6
|
## Equivalent to the map2() calls above: the list names (n, mean, sd) are
## matched to rnorm()'s arguments. Uses the mu and sigma lists defined earlier.
pmap(list(n=c(5,5,5), mean=mu, sd=sigma), rnorm)
## pmap also accepts a data frame as the batch of arguments: each column
## supplies one argument and each row is one call
pmap_dbl(mtcars[1:4,1:2], function(mpg, cyl){mpg+cyl})
# [1] 27.0 27.0 26.8 27.4
|
3. 其它用法#
3.1 报错不中断#
1
2
3
4
5
|
# Without a wrapper, the failing element ("a", index 2) aborts the whole
# map() call and no results are returned.
map(list(1, "a", 10), log)
# Error in `map()`:
# ℹ In index: 2.
# Caused by error:
# ! non-numeric argument to mathematical function
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
# safely(log) captures the error instead of raising it: every element yields
# a list with `result` and `error`, and exactly one of the two is NULL.
map(list(1, "a", 10), safely(log))
# [[1]]
# [[1]]$result
# [1] 0
# [[1]]$error
# NULL
# [[2]]
# [[2]]$result
# NULL
# [[2]]$error
# <simpleError in .Primitive("log")(x, base): non-numeric argument to mathematical function>
# [[3]]
# [[3]]$result
# [1] 2.302585
# [[3]]$error
# NULL
|
1
2
|
# possibly(log, NA) substitutes NA for the failing element, so the typed
# map can still finish and return a plain numeric vector.
map_dbl(list(1, "a", 10),possibly(log, NA))
# [1] 0.000000 NA 2.302585
|
3.2 walk重在过程#
- map系列以及apply家族的最终目的是将输入迭代函数处理,得到结果值。
- walk函数更适用于不需要返回值的场景,例如屏幕输出,保存文件,保存图片等场景。
- walk2可类比map2,pwalk可类比pmap
1
2
|
# walk() is called for its side effect: here, printing each letter.
walk(LETTERS[1:3], print)
# walk2() iterates over two inputs in parallel, printing "A 1", "B 2", "C 3".
walk2(LETTERS[1:3], 1:3, ~print(paste(.x, .y)))
|
other examples:https://www.tidyverse.org/blog/2023/05/purrr-walk-this-way/
3.3 furrr包并行计算(多进程)#
1
2
3
4
5
6
7
8
9
|
# furrr is not part of library(tidyverse); loading it also attaches the
# future package, which provides plan().
library(furrr)
# Run subsequent future_* calls on three background R sessions.
plan(multisession, workers = 3)
# Each of 1:10 is used as the mean of 10 normal draws (n is fixed by name);
# furrr_options(seed = 123) makes the parallel random numbers reproducible.
1:10 %>%
  future_map(rnorm, n = 10, .options = furrr_options(seed = 123)) %>%
  future_map_dbl(mean)
# Three 2-second sleeps, one per worker, run concurrently.
future_walk(c(2, 2, 2), ~Sys.sleep(.x))
# Switch back to sequential evaluation, which shuts the workers down.
# (Calling plan(multisession) here would start a fresh pool instead.)
plan(sequential)
|
4. purrr其它用法#
list转换
1
2
3
4
5
6
7
8
|
# Two parallel character vectors of length three.
test = list(L1=c('A','B','C'),L2=c('X','Y','Z'))
# transpose() turns the list inside out so that elements are grouped by
# position; [1] keeps the first group, pairing the first items of L1 and L2.
transpose(test)[1]
# [[1]]
# [[1]]$L1
# [1] "A"
# [[1]]$L2
# [1] "X"
|
reduce结合
1
2
3
4
5
6
7
8
9
|
# Two character vectors that share the value "C".
test = list(L1=c('A','B','C'),L2=c('X','C','Z'))
# reduce() folds the list elements together with a two-argument function.
# paste() combines the two vectors element by element.
reduce(test, paste)
# [1] "A X" "B C" "C Z"
# intersect() keeps only the values present in both vectors.
reduce(test, intersect)
# [1] "C"
# Folding 1:4 with `*` computes 1 * 2 * 3 * 4.
reduce(1:4, `*`)
# 24
|
cross_df排列组合
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
# Two character vectors whose elements are to be combined in every possible way.
test = list(L1=c('A','B','C'),L2=c('X','Y','Z'))
# cross_df() returns one row per combination of an L1 value with an L2 value.
# NOTE: cross_df() is deprecated as of purrr 1.0.0 in favour of
# tidyr::expand_grid(), shown below.
cross_df(test)
# # A tibble: 9 × 2
# L1 L2
# <chr> <chr>
# 1 A X
# 2 B X
# 3 C X
# 4 A Y
# 5 B Y
# 6 C Y
# 7 A Z
# 8 B Z
# 9 C Z
## The approach above is slow for large combination spaces; in that case
## tidyr::expand_grid() can be used instead. Note the different row order in
## the printed results: cross_df() varies the first column fastest, while
## expand_grid() varies the last column fastest.
expand_grid(x=test$L1, y=test$L2)
# # A tibble: 9 × 2
# x y
# <chr> <chr>
# 1 A X
# 2 A Y
# 3 A Z
# 4 B X
# 5 B Y
# 6 B Z
# 7 C X
# 8 C Y
# 9 C Z
|