在对转录组数据分析时,分组比较的差异分析前提时获得测序表达矩阵。

根据测序技术分为两种,对应的分析R包如下所示。

  • RNAseq的原始count表达矩阵
    • DESeq2
    • edgeR
    • limma
  • 芯片array标准化后的表达矩阵
    • limma

1、示例count表达矩阵

如下使用TCGAbiolinks包下载共9个癌症样本、35个正常样本的表达矩阵;其中涉及到8对配对样本(tumor与normal来自同一病人)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
library(TCGAbiolinks)
library(SummarizedExperiment)
library(tidyverse)
query <- GDCquery(project = "TCGA-CHOL",
                  legacy = FALSE,
                  data.category = "Transcriptome Profiling",
                  data.type = "Gene Expression Quantification")
GDCdownload(query, files.per.chunk = 10)
data = GDCprepare(query, save = F)
rownames(data) = rowData(data)$gene_name
data = data[rowData(data)$gene_type=="protein_coding",]
count = assay(data, "unstranded")
count = count[!duplicated(rownames(count)),]
dim(count)
# [1] 19938    44
count[1:4,1:4]
#        TCGA-W5-AA2X-01A-11R-A41I-07 TCGA-W5-AA2X-11A-11R-A41I-07
# TSPAN6                         3310                         2322
# TNMD                              0                            0
# DPM1                           1881                          716
# SCYL3                           966                          315
#        TCGA-W5-AA33-01A-11R-A41I-07 TCGA-ZH-A8Y2-01A-11R-A41I-07
# TSPAN6                        11312                         6768
# TNMD                              1                            0
# DPM1                            968                         1273
# SCYL3                          1296                          774

meta = colData(data)[,c("barcode","sample", "patient","shortLetterCode")] %>% 
  as.data.frame()
meta$Group = meta$shortLetterCode # NT: Solid Tissue Normal; TP: Primary Tumor 
head(meta[,"Group",drop=F])
#                              Group
# TCGA-W5-AA2X-01A-11R-A41I-07    TP
# TCGA-W5-AA2X-11A-11R-A41I-07    NT
# TCGA-W5-AA33-01A-11R-A41I-07    TP
# TCGA-ZH-A8Y2-01A-11R-A41I-07    TP
# TCGA-ZH-A8Y6-01A-11R-A41I-07    TP
# TCGA-W5-AA2Z-01A-11R-A41I-07    TP
table(meta$Group)
# NT TP 
# 9 35

identical(colnames(count), rownames(meta))
# [1] TRUE


## 8对配对样本
meta_pair = meta[TCGAquery_MatchedCoupledSampleTypes(meta$barcode, c("NT","TP")), ]
# TCGA-W5-AA2I TCGA-W5-AA2Q TCGA-W5-AA2U TCGA-W5-AA2X TCGA-W5-AA30 TCGA-W5-AA31 TCGA-W5-AA34 TCGA-ZU-A8S4 
# 2            2            2            2            2            2            2            2 
count_pair = count[, meta_pair$barcode]
identical(meta_pair$barcode, colnames(count_pair))
#[1] TRUE

2、DESeq2

常规分析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
library(DESeq2)
# 分组因子化:对照组在前,实验组在后
meta$Group = factor(meta$Group, levels = c("NT", "TP"))
dds = DESeqDataSetFromMatrix(countData = count,
                              colData = meta,
                              design= ~ Group)
# keep <- rowSums(counts(dds)) >= 10
# dds <- dds[keep,]

dds = DESeq(dds)
resultsNames(dds) # lists the coefficients
# [1] "Intercept"      "Group_TP_vs_NT"

res = results(dds, name="Group_TP_vs_NT")

deg = as.data.frame(res)
deg = deg[order(deg$padj), ]
head(deg)
#        baseMean log2FoldChange     lfcSE      stat       pvalue         padj
# MSMO1 7585.3612      -3.748488 0.2109579 -17.76889 1.231028e-70 1.105094e-66
# RMDN2  396.7567      -2.929035 0.1648278 -17.77027 1.201019e-70 1.105094e-66
# GCDH  2272.9263      -3.302603 0.1863973 -17.71809 3.040673e-70 1.819742e-66
# TCAIM 1239.0505      -2.235421 0.1303740 -17.14621 6.708695e-66 3.011198e-62
# KCNN2  360.4449      -5.504765 0.3234836 -17.01714 6.129148e-65 2.200854e-61
# USH2A  550.8569      -6.469569 0.3914803 -16.52591 2.388048e-61 6.125002e-58

count_log = log2(counts(dds, normalized=TRUE) + 1)
gene_exp = count_log["MSMO1", ,drop=F] %>% 
  t() %>% as.data.frame() %>% 
  dplyr::mutate(Group=meta$Group) 
ggplot(gene_exp, aes(x=Group, y=MSMO1)) + 
  geom_boxplot()
image-20220611150552173

配对分析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
library(DESeq2)
dim(count_pair)
# [1] 19938    16
head(meta_pair[,c("Group","patient")])
#                              Group      patient
# TCGA-W5-AA2X-11A-11R-A41I-07    NT TCGA-W5-AA2X
# TCGA-W5-AA2Q-11A-11R-A41I-07    NT TCGA-W5-AA2Q
# TCGA-W5-AA30-11A-11R-A41I-07    NT TCGA-W5-AA30
# TCGA-ZU-A8S4-11A-11R-A41I-07    NT TCGA-ZU-A8S4
# TCGA-W5-AA31-11A-11R-A41I-07    NT TCGA-W5-AA31
# TCGA-W5-AA2I-11A-11R-A41I-07    NT TCGA-W5-AA2I
dds2 = DESeqDataSetFromMatrix(countData = count_pair,
                             colData = meta_pair,
                             design= ~ Group + patient)
dds2 = DESeq(dds2)
resultsNames(dds2) # lists the coefficients
# [1] "Intercept"                            "Group_TP_vs_NT"                      
# [3] "patient_TCGA.W5.AA2Q_vs_TCGA.W5.AA2I" "patient_TCGA.W5.AA2U_vs_TCGA.W5.AA2I"
# [5] "patient_TCGA.W5.AA2X_vs_TCGA.W5.AA2I" "patient_TCGA.W5.AA30_vs_TCGA.W5.AA2I"
# [7] "patient_TCGA.W5.AA31_vs_TCGA.W5.AA2I" "patient_TCGA.W5.AA34_vs_TCGA.W5.AA2I"
# [9] "patient_TCGA.ZU.A8S4_vs_TCGA.W5.AA2I"

res2 = results(dds2, name="Group_TP_vs_NT")
deg2 = as.data.frame(res2)
deg2 = deg2[order(deg2$padj), ]
head(deg2)
#        baseMean log2FoldChange     lfcSE      stat        pvalue          padj
# PCK2  22699.219      -5.653594 0.1854230 -30.49025 3.508987e-204 6.059670e-200
# GRHPR 17379.340      -3.574706 0.1551980 -23.03319 2.168223e-117 1.872152e-113
# KDM8   1614.762      -4.555835 0.1989067 -22.90438 4.201674e-116 2.418624e-112
# CTH    2367.082      -5.254396 0.2324677 -22.60270 4.077093e-113 1.760183e-109
# ADI1  18162.701      -3.714171 0.1659464 -22.38174 5.927946e-111 2.047394e-107
# PBLD   8315.518      -5.683135 0.2594054 -21.90831 2.164593e-106 6.230059e-103

library("BiocParallel")
# Parallelizing DESeq, results, and lfcShrink
# not supported on Windows
register(MulticoreParam(4))
dds = DESeq(dds, parallel = TRUE)

3、edgeR

常规分析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
library(edgeR)
##(1) DGEList对象
# 分组因子字符串:对照组在前;实验组在后
Group = factor(meta$Group, levels = c("NT","TP"))
table(Group)
# Group
# NT TP 
# 9 35
dge.list = DGEList(counts=count,
                    group=Group)
dge.list$counts[1:4,1:4]
dge.list$samples %>% head()
#                              group lib.size norm.factors
# TCGA-W5-AA2X-01A-11R-A41I-07    TP 41184464            1
# TCGA-W5-AA2X-11A-11R-A41I-07    NT 36468189            1
# TCGA-W5-AA33-01A-11R-A41I-07    TP 43378374            1

##(2) 计算文库因子
dge.list = calcNormFactors(dge.list)  # Default  method = 'TMM'
dge.list$samples %>% head()
#                              group lib.size norm.factors
# TCGA-W5-AA2X-01A-11R-A41I-07    TP 41184464    1.2238691
# TCGA-W5-AA2X-11A-11R-A41I-07    NT 36468189    0.6258154
# TCGA-W5-AA33-01A-11R-A41I-07    TP 43378374    1.3504490 

##(3) 根据分组设计,进行差异分析
design = model.matrix(~ Group)
#### 如有其它协变量因素,放在主要变量之前即可,如下示例
# design = model.matrix(~ Gender + Age + Group)

head(design)
#   (Intercept) GroupTP
# 1           1       1
# 2           1       0
# 3           1       1
dge.list = estimateDisp(dge.list, design)
fit = glmFit(dge.list, design)
# Defaults to the last coefficient.
fit = glmLRT(fit) 
deg = topTags(fit, n = Inf)
deg = as.data.frame(deg)
head(deg)
#          logFC   logCPM       LR       PValue          FDR
# MSMO1 -3.636116 7.467688 371.8917 7.249780e-83 1.445461e-78
# GCDH  -3.189232 5.737307 359.9146 2.938892e-80 2.929781e-76
# KCNN2 -5.382560 3.055588 344.2997 7.387479e-77 4.909718e-73
# RCL1  -3.799208 5.786302 338.9246 1.094155e-75 5.453816e-72
# SC5D  -3.852832 6.657183 338.3414 1.465859e-75 5.845260e-72
# USH2A -6.352713 3.665099 329.5388 1.211265e-73 4.025034e-70

count_log = log2(cpm(dge.list) + 1)   # TMM normalized 
count_log[1:4,1:4]
gene_exp = count_log["MSMO1", ,drop=F] %>% 
  t() %>% as.data.frame() %>% 
  dplyr::mutate(Group=meta$Group) 
ggplot(gene_exp, aes(x=Group, y=MSMO1)) + 
  geom_boxplot()
image-20220611153117041

配对分析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
library(edgeR)
Group = factor(meta_pair$Group, levels = c("NT","TP"))
# 对照组在前;实验组在后
Patient = meta_pair$patient
dge.list = DGEList(counts=count_pair,
                   group=Group)
dge.list = calcNormFactors(dge.list)  # Default  method = 'TMM'

design = model.matrix(~ Patient + Group)
head(design)
#   (Intercept) PatientTCGA-W5-AA2Q PatientTCGA-W5-AA2U PatientTCGA-W5-AA2X PatientTCGA-W5-AA30
# 1           1                   0                   0                   1                   0
# 2           1                   1                   0                   0                   0
# 3           1                   0                   0                   0                   1
# 4           1                   0                   0                   0                   0
# 5           1                   0                   0                   0                   0
# 6           1                   0                   0                   0                   0
#   PatientTCGA-W5-AA31 PatientTCGA-W5-AA34 PatientTCGA-ZU-A8S4 GroupTP
# 1                   0                   0                   0       0
# 2                   0                   0                   0       0
# 3                   0                   0                   0       0
# 4                   0                   0                   1       0
# 5                   1                   0                   0       0
# 6                   0                   0                   0       0

dge.list = estimateDisp(dge.list, design)

fit = glmQLFit(dge.list, design, robust = TRUE)
fit = glmQLFTest(fit) 
# By default, the test is for the last coefficient in the design matrix,
deg = topTags(fit, n = Inf)
deg = as.data.frame(deg)
head(deg)
#              logFC   logCPM        F       PValue          FDR
# CTH      -5.168010 5.735511 451.8950 1.891581e-10 2.184341e-06
# KDM8     -4.469608 5.191592 439.9909 2.191134e-10 2.184341e-06
# PCK2     -5.566911 8.997974 385.1885 4.551845e-10 2.240078e-06
# IQGAP3    5.382710 4.043167 378.8504 4.985585e-10 2.240078e-06
# EPB41L4B -3.747267 6.347686 360.6129 6.533619e-10 2.240078e-06
# GCH1     -3.670607 5.705010 358.5600 6.741130e-10 2.240078e-06

4、limma

RNAseq差异分析

(1)常规分析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
Group = factor(meta$Group, levels = c("NT","TP"))
table(Group)
# Group
# NT TP 
# 9 35
design = model.matrix(~ Group)
head(design)
#   (Intercept) GroupTP
# 1           1       1
# 2           1       0
# 3           1       1
# 4           1       1
# 5           1       1
# 6           1       1
v = voom(count,design,normalize="quantile")
fit = lmFit(v,design)
fit = eBayes(fit)
colnames(coef(fit))
#[1] "(Intercept)" "GroupTP"
tT = topTable(fit, adjust="fdr", sort.by="p", number=Inf)
head(tT)
#                logFC    AveExpr         t      P.Value    adj.P.Val        B
# RMDN2      -2.900127  2.5323698 -28.67963 1.336773e-29 2.665258e-25 56.90314
# USH2A      -6.535628  0.3847154 -23.73211 2.788056e-26 2.005247e-22 49.38508
# KCNN2      -5.460617  0.6268550 -23.68516 3.017223e-26 2.005247e-22 49.31589
# ESR1       -5.701844  1.4121136 -21.54736 1.278570e-24 6.373031e-21 45.80920
# AC132217.2 -5.094348 -5.1967122 -20.17979 1.659743e-23 6.430672e-20 42.06491
# SOX5       -4.944365  0.7859657 -20.10020 1.935201e-23 6.430672e-20 43.12095

(2)配对分析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
Group = factor(meta_pair$Group, levels = c("NT","TP"))
table(Group)
# Group
# NT TP 
# 8  8 
Patient = factor(meta_pair$patient)
table(Patient)
Patient
# TCGA-W5-AA2I TCGA-W5-AA2Q TCGA-W5-AA2U TCGA-W5-AA2X TCGA-W5-AA30 TCGA-W5-AA31 TCGA-W5-AA34 TCGA-ZU-A8S4 
# 2            2            2            2            2            2            2            2 
design = model.matrix(~ Patient + Group)
head(design)
#   (Intercept) PatientTCGA-W5-AA2Q PatientTCGA-W5-AA2U PatientTCGA-W5-AA2X PatientTCGA-W5-AA30
# 1           1                   0                   0                   1                   0
# 2           1                   1                   0                   0                   0
# 3           1                   0                   0                   0                   1
# 4           1                   0                   0                   0                   0
# 5           1                   0                   0                   0                   0
# 6           1                   0                   0                   0                   0
#   PatientTCGA-W5-AA31 PatientTCGA-W5-AA34 PatientTCGA-ZU-A8S4 GroupTP
# 1                   0                   0                   0       0
# 2                   0                   0                   0       0
# 3                   0                   0                   0       0
# 4                   0                   0                   1       0
# 5                   1                   0                   0       0
# 6                   0                   0                   0       0
v = voom(count_pair, design, normalize="quantile")
fit = lmFit(v,design)
fit = eBayes(fit)
colnames(coef(fit))
# [1] "(Intercept)"         "PatientTCGA-W5-AA2Q" "PatientTCGA-W5-AA2U" "PatientTCGA-W5-AA2X"
# [5] "PatientTCGA-W5-AA30" "PatientTCGA-W5-AA31" "PatientTCGA-W5-AA34" "PatientTCGA-ZU-A8S4"
# [9] "GroupTP" 
tT = topTable(fit, coef = "GroupTP", adjust="fdr", sort.by="p", number=Inf)
head(tT)
#            logFC  AveExpr         t      P.Value    adj.P.Val        B
# BTD    -2.500130 5.412028 -37.40476 5.067356e-10 8.241491e-06 13.61382
# MCM3    2.535346 5.765008  35.09944 8.267119e-10 8.241491e-06 13.22475
# ABCC10  1.768879 3.760942  31.34388 1.973016e-09 9.926863e-06 12.32708
# FGGY   -2.559748 4.447616 -29.91541 2.822782e-09 9.926863e-06 12.07469
# ZCCHC2 -1.924153 4.452133 -29.51382 3.131304e-09 9.926863e-06 12.01657
# ACOT1  -4.146553 1.943388 -28.79614 3.782277e-09 9.926863e-06 11.27019

芯片数据差异分析

(1)GEO芯片数据

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
library(GEOquery)
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 10)
gse = getGEO('GSE109169', getGPL = F) 
gse = gse[[1]]
exp_dat = exprs(gse) %>% as.data.frame()
dim(exp_dat)
# [1] 19076    50
exp_dat[1:4,1:4]
#         GSM2934481 GSM2934482 GSM2934483 GSM2934484
# 2315554       7.31       7.27       7.63       7.38
# 2315633       7.24       7.52       7.42       7.37
# 2315674       7.36       7.42       7.61       7.31
# 2315739       7.24       7.56       7.76       7.46
meta = pData(gse) %>% 
  dplyr::select(title, dplyr::ends_with("ch1"))
colnames(meta)=gsub("[:_]ch1", "", colnames(meta))
meta$Group = ifelse(meta$source_name=="breast cancer tissue",
                    "tumor", "normal")
# normal  tumor 
# 25     25 
meta$Patient = stringr::str_split(meta$title, "-", simplify=T)[,1]
table(meta$Patient)
# 2102 2110 2146 2181 2190 2280 2297 2353 2365 2397 2451 2454 2706 2780 2910 2917 2943 2972 2985 3156 3159 
# 2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
# 3197 3232 3297 3355 
# 2    2    2    2 
head(meta[,c("Group","Patient")])
#             Group Patient
# GSM2934481 normal    2102
# GSM2934482  tumor    2102
# GSM2934483 normal    2110
# GSM2934484  tumor    2110
# GSM2934485 normal    2146
# GSM2934486  tumor    2146
identical(rownames(meta), colnames(exp_dat))
# [1] TRUE

(2)常规分析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
Group = factor(meta$Group, levels = c("normal","tumor"))
table(Group)
# Group
# normal  tumor 
# 25     25 
design=model.matrix(~  Group) 
head(design)
#   (Intercept) Grouptumor
# 1           1          0
# 2           1          1
# 3           1          0
# 4           1          1
# 5           1          0
# 6           1          1
fit = lmFit(exp_dat, design)
fit = eBayes(fit)
colnames(coef(fit))
# [1] "(Intercept)" "Grouptumor"
tT = topTable(fit, adjust="fdr", sort.by="p", number=Inf)
head(tT)
#           logFC AveExpr         t      P.Value    adj.P.Val        B
# 4004044 -2.6860  6.8706 -21.16464 6.424093e-27 1.225460e-22 50.47849
# 4000538 -4.1400  6.6420 -19.36848 3.622286e-25 3.454937e-21 46.66686
# 2591421 -2.2128  7.1468 -16.18728 9.688784e-22 6.160775e-18 39.10226
# 3830189 -1.8416  8.0504 -16.03976 1.433530e-21 6.836502e-18 38.72368
# 3519309 -2.1232  7.7964 -15.30913 1.034876e-20 3.948259e-17 36.80964
# 3745525 -1.0784  6.1008 -15.21411 1.344319e-20 4.274038e-17 36.55587

(3)配对分析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
Group = factor(meta$Group, levels = c("normal","tumor"))
table(Group)
# Group
# normal  tumor 
# 25     25 
Patient = factor(meta$Patient)
table(Patient)
# Patient
# 2102 2110 2146 2181 2190 2280 2297 2353 2365 2397 2451 2454 2706 2780 2910 2917 2943 2972 2985 3156 3159 
# 2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
# 3197 3232 3297 3355 
# 2    2    2    2

design=model.matrix(~  Patient + Group) 
head(design)
#   Groupnormal Grouptumor
# 1           1          0
# 2           0          1
# 3           1          0
fit = lmFit(exp_dat, design)
fit <- eBayes(fit)
colnames(coef(fit))
# [1] "(Intercept)" "Patient2110" "Patient2146" "Patient2181" "Patient2190" "Patient2280" "Patient2297"
# [8] "Patient2353" "Patient2365" "Patient2397" "Patient2451" "Patient2454" "Patient2706" "Patient2780"
# [15] "Patient2910" "Patient2917" "Patient2943" "Patient2972" "Patient2985" "Patient3156" "Patient3159"
# [22] "Patient3197" "Patient3232" "Patient3297" "Patient3355" "Grouptumor"

tT <- topTable(fit, coef = "Grouptumor", adjust="fdr", sort.by="p", number=Inf)
head(tT)
#           logFC AveExpr         t      P.Value    adj.P.Val        B
# 4004044 -2.6860  6.8706 -23.34059 1.764542e-19 3.366041e-15 33.97414
# 4000538 -4.1400  6.6420 -21.21050 2.072006e-18 1.976279e-14 31.71755
# 3830189 -1.8416  8.0504 -19.34286 2.168752e-17 1.379037e-13 29.52627
# 2701109 -2.4704  7.0796 -17.61105 2.298621e-16 1.096212e-12 27.28943
# 3062082 -2.9824  9.0236 -17.27304 3.728465e-16 1.422484e-12 26.82740
# 2591421 -2.2128  7.1468 -17.03824 5.241945e-16 1.564468e-12 26.50125