1、示例数据

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# Load the plotting packages and take a first look at the example data.
library(ggplot2)
library(patchwork)
library(carData) # example data
head(Salaries) # faculty salary records
#        rank discipline yrs.since.phd yrs.service  sex salary
# 1      Prof          B            19          18 Male 139750
# 2      Prof          B            20          16 Male 173200
# 3  AsstProf          B             4           3 Male  79750
# 4      Prof          B            45          39 Male 115000
# 5      Prof          B            40          41 Male 141500
# 6 AssocProf          B             6           6 Male  97000
# Cross-tabulate rank (rows) against sex (columns).
table(Salaries$rank, Salaries$sex)
#           Female Male
# AsstProf      11   56
# AssocProf     10   54
# Prof          18  248

2、基础用法

1
2
3
4
5
6
7
8
9
# Plain bar chart: geom_bar() counts the rows at each rank.
p1 <- ggplot(data = Salaries, mapping = aes(x = rank)) +
  geom_bar()

# Same chart with the y-axis expansion removed so the bars sit on the x-axis.
p2 <- ggplot(data = Salaries, mapping = aes(x = rank)) +
  geom_bar() +
  scale_y_continuous(expand = c(0, 0))

# Additionally map the fill colour to rank.
p3 <- ggplot(data = Salaries, mapping = aes(x = rank, fill = rank)) +
  geom_bar()

# Combine the three plots into a single figure with patchwork.
p1 + p2 + p3

3、position=参数调整分组形式

1
2
3
4
5
6
7
8
# position = "stack" (the default)
p1 <- ggplot(data = Salaries, mapping = aes(x = rank, fill = sex)) +
  geom_bar(position = "stack") +
  labs(title = 'position="stack"')

# position = "dodge"
p2 <- ggplot(data = Salaries, mapping = aes(x = rank, fill = sex)) +
  geom_bar(position = "dodge") +
  labs(title = 'position="dodge"')

# position = "fill"
p3 <- ggplot(data = Salaries, mapping = aes(x = rank, fill = sex)) +
  geom_bar(position = "fill") +
  labs(title = 'position="fill"')

# Combine the plots and merge their identical legends into one.
p1 + p2 + p3 +
  plot_layout(guides = "collect")

4、stat=参数设置频数统计方式

  • stat="count"(default) 表示从给定的数据里,统计每个类别出现的次数;
    • 此时aes()只需要给定x参数即可;
  • stat="identity"表示直接指定每种类别的频数;
    • 此时aes()除了需要给定x参数交代类别,还需要指定y参数表示频数值。
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
library(tidyverse)

# Pre-compute the number of rows per rank.
# dplyr::count() replaces group_by() + summarise(n = n()); it is written with
# an explicit namespace because plyr also exports a function named count().
dat <- Salaries %>%
  dplyr::count(rank) %>%
  as.data.frame()
dat
#        rank   n
# 1  AsstProf  67
# 2 AssocProf  64
# 3      Prof 266

# stat = "identity": bar heights are read directly from the y aesthetic (n).
p1 <- ggplot(dat, aes(x = rank, y = n, fill = rank)) +
  geom_bar(stat = "identity")

# Number of rows per rank/sex combination.
dat <- Salaries %>%
  dplyr::count(rank, sex) %>%
  as.data.frame()
dat
#        rank    sex   n
# 1  AsstProf Female  11
# 2  AsstProf   Male  56
# 3 AssocProf Female  10
# 4 AssocProf   Male  54
# 5      Prof Female  18
# 6      Prof   Male 248

p2 <- ggplot(dat, aes(x = rank, y = n, fill = sex)) +
  geom_bar(stat = "identity")

p1 + p2

5、geom_text()添加频数注释

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
# Counts per rank. dplyr::count() returns an ungrouped result, so no grouping
# is carried over into later steps.
dat <- Salaries %>%
  dplyr::count(rank)

# geom_col() is the shorthand for geom_bar(stat = "identity").
p1 <- ggplot(dat, aes(x = rank, y = n)) +
  geom_col() +
  # A negative vjust moves the label up; a positive vjust moves it down.
  geom_text(aes(label = n), vjust = -0.2)

# Counts per rank/sex combination.
dat <- Salaries %>%
  dplyr::count(rank, sex)

p2 <- ggplot(dat, aes(x = rank, y = n, fill = sex)) +
  geom_col(position = "dodge") +
  # Dodge the labels by the same width as the bars so each label sits above
  # its own bar.
  geom_text(
    aes(label = n),
    vjust = -0.2,
    position = position_dodge(width = 0.9)
  )

p1 + p2

6、调整柱子的顺序

  • 如果只有一种分组方式,通过设置类别的因子水平(例如使用fct_reorder())即可调整柱子的顺序。
    • 或者使用scale_x_discrete(limits=c(...))自定义顺序,也可以很方便地实现修改。
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
# Set the bar order explicitly through the limits of the discrete x scale.
ggplot(Salaries, aes(x = rank)) +
  geom_bar() +
  scale_x_discrete(limits = c("AsstProf", "AssocProf", "Prof"))

# Original factor level order.
p1 <- ggplot(iris, aes(x = Species, y = Sepal.Width)) +
  geom_boxplot() +
  ggtitle("default factor levels")

# Levels reordered by Sepal.Width (fct_reorder() summarises each level with
# the median unless .fun is given), ascending.
p2 <- ggplot(iris, aes(x = fct_reorder(Species, Sepal.Width), y = Sepal.Width)) +
  geom_boxplot() +
  ggtitle("fct_reorder default levels")

# Same reordering, descending. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
p3 <- ggplot(
  iris,
  aes(x = fct_reorder(Species, Sepal.Width, .desc = TRUE), y = Sepal.Width)
) +
  geom_boxplot() +
  ggtitle("fct_reorder descent levels")

library(patchwork)
p1 | p2 | p3
  • 但如果是更复杂的情况——组内排序,则可以借助tidytext包的reorder_within()实现。
    • 例:5个学生的三门课程成绩,按照每门学科分组,将5个学生按照成绩从低到高排序(或者从高到低排序)。
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Example data: scores of five students (A-E) in three subjects.
grade <- data.frame(
  subject = rep(c("Chineses", "Math", "English"), each = 5),
  name = rep(c("A", "B", "C", "D", "E"), times = 3),
  score = c(79, 65, 70, 94, 82, 76, 87, 80, 81, 89, 88, 79, 82, 95, 90)
)

library(tidytext)

# Order the subjects by mean score, highest first; within each subject order
# the students by score from low to high.
# .fun = mean is given explicitly: fct_reorder() summarises with the median by
# default, which would not match the "mean score" ordering described here.
grade$subject <- fct_reorder(
  grade$subject, grade$score,
  .fun = mean, .desc = TRUE
)
p1 <- ggplot(
  grade,
  aes(x = reorder_within(name, score, subject), y = score, fill = name)
) +
  geom_bar(stat = "identity") +
  scale_x_reordered() +
  facet_wrap(subject ~ ., scales = "free_x")

# Order the subjects by mean score, lowest first; within each subject order
# the students by score from high to low (hence the negated score).
grade$subject <- fct_reorder(
  grade$subject, grade$score,
  .fun = mean, .desc = FALSE
)
p2 <- ggplot(
  grade,
  aes(x = reorder_within(name, -score, subject), y = score, fill = name)
) +
  geom_bar(stat = "identity") +
  scale_x_reordered() +
  facet_wrap(subject ~ ., scales = "free_x")

p1 + p2 + plot_layout(guides = "collect")

注意:reorder_within(个体, 值, 分组)需要与scale_x_reordered()以及facet_wrap(分组变量~., scales = "free_x")配合使用,才能在每个分面内得到正确的排序和坐标轴标签。

7、双向柱状图

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
# Counts per rank/sex combination (ungrouped result).
dat <- Salaries %>%
  dplyr::count(rank, sex)

# Male counts are drawn upwards: bar height equals the count.
dat_m <- dat %>%
  dplyr::filter(sex == "Male") %>%
  dplyr::mutate(lab = n) %>%
  as.data.frame()

# Female counts are drawn downwards: bar height is the negated count.
dat_f <- dat %>%
  dplyr::filter(sex == "Female") %>%
  dplyr::mutate(lab = -1 * n) %>%
  as.data.frame()

ggplot() +
  geom_col(
    data = dat_m, aes(x = rank, y = lab, fill = sex),
    position = "dodge"
  ) +
  # vjust is a constant, so it is set outside aes(); labels sit above the bars.
  geom_text(data = dat_m, aes(x = rank, y = lab, label = n), vjust = -0.25) +
  geom_col(
    data = dat_f, aes(x = rank, y = lab, fill = sex),
    position = "dodge"
  ) +
  # Labels of the downward bars sit below the bar ends.
  geom_text(data = dat_f, aes(x = rank, y = lab, label = n), vjust = 1.25) +
  # Show absolute values on the axis; deriving the labels from the breaks
  # keeps the two from getting out of sync.
  scale_y_continuous(breaks = c(200, 100, 0, -20), labels = abs) +
  scale_fill_manual(values = c("#0072B5", "#BC3C28"))
image.png

8、误差棒error bar

  • 使用上面第6点中构造的示例数据grade(学生成绩)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# Compute the mean and standard deviation of one column within each group.
#
# Arguments:
#   data       - data frame to summarise.
#   varname    - name (string) of the column to summarise.
#   groupnames - grouping variables, passed to plyr::ddply() as `.variables`.
#
# Returns a data frame with one row per group: the grouping columns, a column
# named after `varname` holding the group mean, and a column `sd` holding the
# group standard deviation. Missing values are ignored (na.rm = TRUE).
data_summary <- function(data, varname, groupnames) {
  # plyr is called through its namespace instead of being attached:
  # attaching plyr masks dplyr verbs such as summarise() and mutate() for the
  # rest of the session, and require() only returns FALSE when the package is
  # missing instead of stopping.
  if (!requireNamespace("plyr", quietly = TRUE)) {
    stop("Package 'plyr' is required by data_summary().", call. = FALSE)
  }

  # Per-group worker: x is the subset of `data` for one group.
  summary_func <- function(x, col) {
    c(
      mean = mean(x[[col]], na.rm = TRUE),
      sd = sd(x[[col]], na.rm = TRUE)
    )
  }

  data_sum <- plyr::ddply(data, groupnames, .fun = summary_func, varname)

  # Rename the "mean" column to the name of the summarised variable.
  names(data_sum)[names(data_sum) == "mean"] <- varname
  data_sum
}

# Mean and standard deviation of the scores per subject.
df1 <- data_summary(grade, varname = "score", groupnames = c("subject"))
# Expected content of df1 for the `grade` data defined in section 6:
#    subject score        sd
# 1 Chineses  78.0 11.247222
# 2     Math  82.6  5.319774
# 3  English  86.8  6.379655

# Two-sided error bars: mean - sd to mean + sd.
p1 <- ggplot(df1, aes(x = subject, y = score)) +
  geom_col(color = "black") +
  geom_errorbar(aes(ymin = score - sd, ymax = score + sd), width = 0.2)

# One-sided error bars: only the upper half (mean to mean + sd).
p2 <- ggplot(df1, aes(x = subject, y = score)) +
  geom_col(color = "black") +
  geom_errorbar(aes(ymin = score, ymax = score + sd), width = 0.2)

p1 + p2
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# Mean and standard deviation of salary for every rank/sex combination.
df2 <- data_summary(
  Salaries,
  varname = "salary",
  groupnames = c("rank", "sex")
)
#        rank    sex    salary        sd
# 1  AsstProf Female  78049.91  9371.996
# 2  AsstProf   Male  81311.46  7901.343
# 3 AssocProf Female  88512.80 17965.286
# 4 AssocProf   Male  94869.70 12890.817
# 5      Prof Female 121967.61 19619.583
# 6      Prof   Male 127120.82 28213.808

# Dodged bars with error bars (mean +/- sd) dodged by 0.9 to line up with them.
ggplot(data = df2, mapping = aes(x = rank, y = salary, fill = sex)) +
  geom_bar(
    stat = "identity",
    color = "black",
    position = position_dodge()
  ) +
  geom_errorbar(
    mapping = aes(ymin = salary - sd, ymax = salary + sd),
    width = 0.2,
    position = position_dodge(0.9)
  ) +
  theme_classic() +
  scale_fill_manual(values = c("#999999", "#E69F00"))