1
2
3
4
5
6
7
8
9
library(tidyverse)
# -- Attaching packages ----------------------------------------------------- tidyverse 1.3.1 --
# √ ggplot2 3.3.5     √ purrr   0.3.4
# √ tibble  3.1.2     √ dplyr   1.0.7
# √ tidyr   1.1.3     √ stringr 1.4.0
# √ readr   2.0.0     √ forcats 0.5.1
# -- Conflicts -------------------------------------------------------- tidyverse_conflicts() --
# x dplyr::filter() masks stats::filter()
# x dplyr::lag()    masks stats::lag()

1、表格筛选

1.1 select 筛选列

col1:col3 选取起止范围的列;

!c(col1, col2) 删除列;

everything() 取所有列;

last_col() 获取最后一行;

starts_with()ends_with()contains()matches 匹配具有特征列名的列;

all_ofany_ofwhere 匹配具有特征列值的列。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
data(mtcars)
mtcars=mtcars[1:6,]
#                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
# Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
# Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
# Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
# Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
# Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
# Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
mtcars %>% select(mpg)                           #单列
mtcars %>% select("mpg")                         #字符串列名(非必要)
mtcars %>% select(mpg:hp)                        #多列
mtcars %>% select(last_col(1))                   #倒数第二列
mtcars %>% select(!c(mpg, am))                   #反选
mtcars %>% select(starts_with("c"))              #列名以p开头的
mtcars %>% select(where(is.numeric))             #数值列
mtcars %>% select(where(function(x) max(x)>100)) #最大值大于200的列
mtcars %>% select(hp, everything())              #把指定列放在第一列                       

1.2 选择行

1.2.1 filter定义条件筛选行

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
##(1)全局筛选
data(mtcars)
mtcars %>% filter(gear == 3)
mtcars %>% filter(gear == 3 & disp > 200) #与
mtcars %>% filter(gear == 3 | disp > 200) #或

##当使用变量名代替列名时
test="gear"
mtcars %>% filter(.data[[test]] == 3)


##(2)分组筛选:搭配group_by()
mtcars %>% 
  group_by(vs) %>%
  filter(disp == max(disp)) #筛选按 vs 分组里,disp的最大值
mtcars %>% 
  group_by(vs) %>%
  filter(disp > mean(disp)) #筛选按 vs 分组里,disp 里的大于均值水平的。

1.2.2 slice系列方法选择行

slice() 按行索引取

slice_min(), slice_max() 按列值大小取最值行

slice_sample() 随机抽取行

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
mtcars
mtcars %>% slice(1)
mtcars %>% slice(30:n())

mtcars %>% slice_max(mpg, n=5)
mtcars %>% slice_max(mpg, prop = 0.2)

mtcars %>% slice_sample(n = 5)
mtcars %>% slice_sample(n = 5, weight_by = wt)

##同样可以搭配group_by()分组操作

1.2.3 distinct去重复行

1
2
3
4
5
6
7
8
9
#共有2列10行,列内容均由1、2、3组成
df <- tibble(
  x = sample(3, 10, rep = TRUE),
  y = sample(3, 10, rep = TRUE)
)

df %>% distinct()                                #去除完全相同行
distinct(df, x, .keep_all=T)                     #将某一列去重复
df %>% group_by(x) %>% distinct(y, .keep_all=T)  #分组去重

1.2.4 count统计频数

1
2
3
4
5
data(mtcars)
data(mtcars)
mtcars %>% count(cyl)           #返回cyl频数表
mtcars %>% count(cyl, vs)       #两个分组变量组合频数
mtcars %>% add_count(cyl)       #在原有表格基础上增添一列,为相应level的counts总数

2、summarise表格统计

mean(),median

sd(),mad()

min(),max(),quantile()

nn_distinct 分别表示统计行数,与非重复行数

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
##(1)全局统计
mtcars %>% summarise(mean_disp = mean(disp))
mtcars %>% summarise(mean_disp = min(disp),
                     n = n(),
                     n_gear = n_distinct(gear),
                     test = sd(.data[["mpg"]])) #字符串变量名

##(2)分组统计:搭配group_by()
mtcars %>%
  group_by(cyl) %>%  
  summarise(mean = mean(disp), n = n()) 

##(3)分组多列统计:搭配group_by()、across()
mtcars %>%
  group_by(cyl) %>% 
  summarise(across(c(wt,qsec,mpg), median, .names = "median_{.col}")) 

3、mutate修改/创建列

mutate()会在原有表格基础上修改/新增列

transmute()仅返回修改或者新增的列

3.1 修改列

1
2
3
4
5
data(mtcars)
mtcars=mtcars[1:6,]
mtcars %>% mutate(wt = wt+10)                            #指定单列运算
mtcars %>% mutate(across(c(mpg,disp), ~ (.x)^2))         #指定多列运算
mtcars %>% mutate(across(where(is.numeric), as.integer)) #所有数值列改为整数

3.2 新增列

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
data(mtcars)
mtcars=mtcars[1:6,]
mtcars %>% mutate(wt_new = wt+10)
mtcars %>% mutate(wt_new = wt+10, .before = 1)  #新增的列在第1列
mtcars %>% mutate(wt_new = wt+10, .after = wt)  #新增的列放在wt列的后面
##使用动态变量名命名新建的列
test="new"
mtcars %>% mutate('{test}' := 1)

##(1) 根据数值列新增rank列
x <- c(1,2,2,3,4)
rank(x)         #相同数取平均秩
# [1] 1.0 2.5 2.5 4.0 5.0
row_number(x)   #相同数,排在前面的排名高
# [1] 1 2 3 4 5
min_rank(x)     #有相同排名,会有gap的情况
# [1] 1 2 2 4 5
dense_rank(x)   #有相同排名,没有gap的情况
# 1] 1 2 2 3 4
mtcars %>% mutate(mpg_rank = min_rank(mpg)) 
mtcars %>% mutate(mpg_rank = min_rank(-1*mpg)) 

##(2)case_when新增层级编码列
mtcars %>% mutate(mpg_grade = case_when(mpg <= 20 ~ "C",
                                        mpg <= 22 ~ "B",
                                        TRUE      ~ "A"))

4、arrange表格排序

1
2
3
4
5
6
7
8
mtcars %>% arrange(disp)        #desc升序排列
mtcars %>% arrange(desc(disp))  #desc降序排列
arrange(mtcars, carb, disp)     #先按carb升序排列,再按disp升序排列

##分组排序
mtcars %>% 
  group_by(cyl) %>%
  arrange(mpg)  

5、关于表格的列名/行名

  • rename: 修改列名
1
2
3
4
5
6
#修改列名
mtcars %>% rename(mpg_new=mpg, cyl_new=cyl)

#使用变量修改列名
test="new"
mtcars %>% rename('{test}':=mpg)
  • rownames_to_columncolumn_to_rownames行名与列的转换
1
2
3
4
5
6
7
8
9
mtcars %>% head
#行名变为列
rownames_to_column(mtcars, var = "car") %>% head
mtcars %>% 
  rownames_to_column(var = "car") %>% head
mtcars %>% 
  rownames_to_column(var = "car") %>%
#指定列变为行名
  column_to_rownames(var = "car") %>% head

6、两个表格合并

  • 示例表格数据
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
df1 <- data.frame(x1 = c(1, 2, 3, 4, 5, 6),
                    y1 = c("a", "b", "c", "d","e","f"))
#   x1 y1
# 1  1  a
# 2  2  b
# 3  3  c
# 4  4  d
# 5  5  e
# 6  6  f

df2 <- data.frame(x2 = c(1, 2, 3, 4, 5, 6),
                    y2 = c("q", "w", "e", "r","t","y"))
#   x2 y2
# 1  1  q
# 2  2  w
# 3  3  e
# 4  4  r
# 5  5  t
# 6  6  y

df3 <- data.frame(x3 = c( 2, 4, 5, 6),
                    y3 = c( "a", "s", "d","f"))
#   x3 y3
# 1  2  a
# 2  4  s
# 3  5  d
# 4  6  f

根据不同的分析目的,有多种合并方法~

6.1 inner_join

  • 取交集,即保留同时在两个表中的观测
1
2
3
4
5
6
# merge(df1, df3, by.x = "y1", by.y = "y3")
inner_join(df1, df3, by=c("y1"="y3"))
#   x1 y1 x3
# 1  1  a  2
# 2  4  d  5
# 3  6  f  6

注意:如果这两个列的名相同,合并时直接交代行名即可,例如inner_join(df1_1, df3_1, by="y")merge(df1_1, df3_1, by = "y"),下同~

6.2 left_join

  • 左连接–保留左边表格的所有观测,缺失值用NA值代替
1
2
3
4
5
6
7
8
9
# merge(df1, df3, by.x = "y1", by.y = "y3", all.x = T)
left_join(df1, df3, by=c("y1"="y3"))
#   x1 y1 x3
# 1  1  a  2
# 2  2  b NA
# 3  3  c NA
# 4  4  d  5
# 5  5  e NA
# 6  6  f  6

6.3 right_join

  • 右连接–保留右边表格的所有观测,缺失值用NA值代替
1
2
3
4
5
6
7
8
9
# merge(df1, df2, by.x = "y1", by.y = "y2", all.y = T)
right_join(df1, df2, by=c("y1"="y2"))
#   x1 y1 x2
# 1  5  e  3
# 2 NA  q  1
# 3 NA  w  2
# 4 NA  r  4
# 5 NA  t  5
# 6 NA  y  6

6.4 full_join

  • 外连接–保留两个表格里的所有观测,缺失值用NA值代替
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
# merge(df1, df2, by.x = "y1", by.y = "y2", all.x = T, all.y = T)
full_join(df1, df2, by=c("y1"="y2"))
#    x1 y1 x2
# 1   1  a NA
# 2   2  b NA
# 3   3  c NA
# 4   4  d NA
# 5   5  e  3
# 6   6  f NA
# 7  NA  q  1
# 8  NA  w  2
# 9  NA  r  4
# 10 NA  t  5
# 11 NA  y  6

补充tidyr

  • separate_rows: 将一行的内容按特定分隔符拆分为多行
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
df <- data.frame(
  x = 1:3,
  y = c("a", "d,e,f", "g,h")
)
#   x     y
# 1 1     a
# 2 2 d,e,f
# 3 3   g,h

df %>% 
  tidyr::separate_rows(y, sep = ",")
#       x y    
#    <int> <chr>
# 1     1 a    
# 2     2 d    
# 3     2 e    
# 4     2 f    
# 5     3 g    
# 6     3 h