tidyr 是 tidyverse 系列的组成工具包之一，其主要功能侧重于表格的重塑与整理（reshaping and organizing）。下面简要学习其中几个比较常用的函数。

1
library(tidyverse)

1. 表格长宽转换

功能类似于 reshape2 包（melt() / dcast()）。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# Example wide-format table: one row per sample, one column per time point.
df <- data.frame(
  id    = sprintf("sample_%d", 1:3),
  time1 = c(10, 20, 30),
  time2 = c(15, 25, 35),
  time3 = c(20, 30, 40)
)
df
#         id time1 time2 time3
# 1 sample_1    10    15    20
# 2 sample_2    20    25    30
# 3 sample_3    30    35    40

1.1 pivot_longer()

  • 宽变长
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
# Wide -> long: every column whose name starts with "time" is stacked.
# The former column names go into `time_point`, the cell values into `value`.
df_long <- pivot_longer(
  df,
  cols      = starts_with("time"),
  names_to  = "time_point",
  values_to = "value"
)
# head() prints only the first 6 rows of the long table.
head(df_long)
# # A tibble: 6 × 3
#   id       time_point value
#   <chr>    <chr>      <dbl>
# 1 sample_1 time1         10
# 2 sample_1 time2         15
# 3 sample_1 time3         20
# 4 sample_2 time1         20
# 5 sample_2 time2         25
# 6 sample_2 time3         30

1.2 pivot_wider()

  • 长变宽
1
2
3
4
5
6
7
8
9
# Long -> wide: each distinct `time_point` becomes its own column,
# filled with the matching `value`; this restores the original layout.
df_wide <- pivot_wider(
  df_long,
  names_from  = "time_point",
  values_from = "value"
)
df_wide
# # A tibble: 3 × 4
#   id       time1 time2 time3
#   <chr>    <dbl> <dbl> <dbl>
# 1 sample_1    10    15    20
# 2 sample_2    20    25    30
# 3 sample_3    30    35    40

2. 表格单列分割

1
2
3
4
5
6
7
8
9
# Example table whose `info` column packs two fields joined by "_".
df <- data.frame(
  id   = seq_len(3),
  info = c("sp1_2010", "sp2_2020", "sp3_2025")
)
df
#   id     info
# 1  1 sp1_2010
# 2  2 sp2_2020
# 3  3 sp3_2025

2.1 separate_longer_delim()

  • 纵向分割–变长
1
2
3
4
5
6
7
8
9
# Split `info` on "_" and spread the pieces over rows:
# each input row yields one output row per piece, with `id` repeated.
separate_longer_delim(df, info, delim = "_")
#   id info
# 1  1  sp1
# 2  1 2010
# 3  2  sp2
# 4  2 2020
# 5  3  sp3
# 6  3 2025

2.2 separate_wider_delim()

  • 横向分割–变宽
1
2
3
4
5
6
7
8
# Split `info` on "_" and spread the pieces over new columns named by `names`.
# Both new columns stay character (see <chr> below), including the year part.
separate_wider_delim(
  df,
  info,
  delim = "_",
  names = c("col_1", "col_2")
)
# # A tibble: 3 × 3
#      id col_1 col_2
#   <int> <chr> <chr>
# 1     1 sp1   2010
# 2     2 sp2   2020
# 3     3 sp3   2025

3. expand()排列组合

  • 生成两组（或更多组）向量取值的所有组合（每列先去重，再相互组合）
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Two equal-length vectors; note that 1999 appears twice in `years`.
samples <- c("sp1", "sp2", "sp3")
years <- c(1999, 1999, 2020)

df <- data.frame(col_1 = samples, col_2 = years)
df
#   col_1 col_2
# 1   sp1  1999
# 2   sp2  1999
# 3   sp3  2020

# All combinations of the distinct values of col_1 and col_2.
# The duplicated 1999 counts once, hence 3 x 2 = 6 rows.
tidyr::expand(df, col_1, col_2)
# # A tibble: 6 × 2
#   col_1 col_2
#   <chr> <dbl>
# 1 sp1    1999
# 2 sp1    2020
# 3 sp2    1999
# 4 sp2    2020
# 5 sp3    1999
# 6 sp3    2020
  • Alternative ways
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
samples <- c("sp1", "sp2", "sp3")
years <- c(1999, 2020)
# data.frame() needs columns of equal length, so pad `years` with NA
# until it is as long as `samples`.
length(years) <- length(samples)
years
# [1] 1999 2020   NA
df <- data.frame(col_1 = samples, col_2 = years)
# Expand as before, then drop every row that contains an NA, so the
# padding value added to `years` does not appear in the final combinations.
na.omit(tidyr::expand(df, col_1, col_2))
  • R语言基础函数实现
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
samples <- c("sp1", "sp2", "sp3")
years <- c(1999, 2020)

# Base R: full cross product of the two vectors; lengths may differ.
# Columns are auto-named Var1, Var2 and the first vector varies fastest.
# NOTE: expand.grid() turns character inputs into factors by default
# (stringsAsFactors = TRUE).
expand.grid(samples, years)
#   Var1 Var2
# 1  sp1 1999
# 2  sp2 1999
# 3  sp3 1999
# 4  sp1 2020
# 5  sp2 2020
# 6  sp3 2020

# Unordered pairs drawn from a single vector: one combination per column.
combn(samples, 2)
#       [,1]  [,2]  [,3]
# [1,] "sp1" "sp1" "sp2"
# [2,] "sp2" "sp3" "sp3"