1、计算公式#
|
sample 1 |
sample 2 |
…….. |
sample k |
Gene 1 |
10 |
12 |
|
30 |
Gene 2 |
20 |
25 |
|
60 |
…… |
… |
… |
… |
… |
Gene n |
0 |
0 |
… |
1 |
对于(n*k)表达矩阵中k个样本的n个基因的count表达数据。
在任意样本(列)中,基因i的长度为l
,count表达值为q
;则基因i的FPKM与TPM标准化方式计算如下
$$
\begin{align}
& FPKM_i = \frac{\frac{q_i}{l_i/10^3}}{\sum_{j=1}^nq_j/10^6} = \frac{\frac{q_i}{l_i}}{{\sum_{j=1}^nq_j}}*10^9 \tag{1} \
\
& TPM_i = \frac{\frac{q_i}{l_i/10^3}}{\sum_{j=1}^n(\frac{q_j}{l_j/10^3})/10^6} =
\frac{\frac{q_i}{l_i}}{\sum_{j=1}^n(\frac{q_j}{l_j})} * 10^6 \tag{2}
\end{align}
$$
- FPKM : fragments per kilobase of exon per million mapped fragments.
- TPM:transcript per million, and the sum of all TPM values is the same in all samples.
2、基因长度#
2.1 gtf注释外显子之和#
- 参考https://www.biostars.org/p/456800/,计算基因的所有外显子长度之和作为基因长度。
- 基因以及对应外显子坐标信息来自基因组注释文件gtf,下载方式参考之前Refgenie笔记。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
library(GenomicFeatures)
#如下选取的是hg19版本的注释基因组文件
txdb <- makeTxDbFromGFF("gencode.v32lift37.annotation.gtf",format="gtf")
exons_per_gene <- exonsBy(txdb,by="gene")
gene_len_from_exon <- as.data.frame(sum(width(reduce(exons_per_gene))))
colnames(gene_len_from_exon)[1] = "length"
gene_len_from_exon$ENSEMBL = str_split(rownames(gene_len_from_exon),"[.]",
simplify = T)[,1]
head(gene_len_from_exon)
# length ENSEMBL
# ENSG00000000003.15_3 4536 ENSG00000000003
# ENSG00000000005.6_3 1476 ENSG00000000005
# ENSG00000000419.12_4 1207 ENSG00000000419
##接下来将基因ENSEMBL名转为Symbol格式
gene_ids<-AnnotationDbi::select(org.Hs.eg.db, keys=gene_len_from_exon$ENSEMBL,
columns="SYMBOL", #目标格式
keytype="ENSEMBL") #目前的格式
gene_ids = gene_ids %>%
dplyr::distinct(ENSEMBL, .keep_all = T) %>%
na.omit()
gene_len = dplyr::left_join(gene_len_from_exon, gene_ids) %>%
dplyr::distinct(ENSEMBL, .keep_all = T) %>%
dplyr::distinct(SYMBOL, .keep_all = T)
dim(gene_len)
# [1] 34102 3
head(gene_len)
# length ENSEMBL SYMBOL
# 1 4536 ENSG00000000003 TSPAN6
# 2 1476 ENSG00000000005 TNMD
# 3 1207 ENSG00000000419 DPM1
|
2.2 TCGAbiolinks包内置的基因长度信息#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
##(1) hg19
geneInfo_hg19 = TCGAbiolinks::geneInfo %>% na.omit() %>%
as.data.frame()
head(geneInfo_hg19)
##(1) hg38
geneInfo_hg38 = TCGAbiolinks::geneInfoHT %>%
tibble::rownames_to_column("ENSEMBL")
library(org.Hs.eg.db)
gene_ids = AnnotationDbi::select(org.Hs.eg.db, keys=geneInfo_hg38$ENSEMBL,
columns=c("SYMBOL"), #目标格式
keytype="ENSEMBL") #目前的格式
gene_ids = gene_ids %>%
distinct(ENSEMBL, .keep_all = T) %>%
distinct(SYMBOL, .keep_all = T) %>%
na.omit()
head(gene_ids)
geneInfo_hg38 = dplyr::inner_join(geneInfo_hg38, gene_ids) %>%
tibble::column_to_rownames("SYMBOL")
head(geneInfo_hg38)
|
3、定义函数#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
##(1) counts_to_tpm
counts_to_tpm <- function(counts, geneLength) {
# Process one column at a time.
tpm <- do.call(cbind, lapply(1:ncol(counts), function(i) {
# log的减法相当于除法:每一列(单个样本的count)/对应基因的长度
rate = log(counts[,i]) - log(geneLength)
#上一步更新后的新的细胞文库大小
denom = log(sum(exp(rate)))
#按照细胞文库标准化
exp(rate - denom + log(1e6))
}))
colnames(tpm) <- colnames(counts)
rownames(tpm) <- rownames(counts)
return(tpm)
}
##(2) counts_to_fpkm
counts_to_fpkm <- function(counts, geneLength) {
fpkm <- do.call(cbind, lapply(1:ncol(counts), function(i) {
N <- sum(counts[,i])
exp(log(counts[,i]) - log(geneLength) - log(N) + log(1e9))
}))
colnames(fpkm) <- colnames(counts)
rownames(fpkm) <- rownames(counts)
return(fpkm)
}
##(3) fpkm_to_Tpm
fpkm_to_tpm <- function(fpkm) {
exp(log(fpkm) - log(colsum(fpkm)) + log(1e6))
}
|
4、示例分析#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
##(1)基因表达矩阵
exp_count = fread("GEO/GSE81861_CRC_NM_all_cells_COUNT.csv.gz",data.table = F) %>%
dplyr::mutate(gene=str_split(V1,"_",simplify = T)[,2]) %>%
dplyr::distinct(gene, .keep_all = T) %>%
tibble::column_to_rownames("gene") %>%
dplyr::select(!V1) %>%
dplyr::filter(rownames(.) %in% gene_len$SYMBOL)
dim(exp_count)
# [1] 26258 266
exp_count[1:4,1:4]
# RHC3934__Bcell__.7DEA7B RHC3944__Bcell__.7DEA7B RHC3962__Tcell__.C6E879 RHC4003__Bcell__.7DEA7B
# TSPAN6 0 0 0 0
# TNMD 0 0 0 0
# DPM1 0 0 0 0
# SCYL3 0 0 0 1
##(2)基因长度
gene_len_sub = gene_len[match(rownames(exp_count),gene_len$SYMBOL), ]
gene_len_sub = data.frame(length=gene_len_sub$length,
row.names = gene_len_sub$SYMBOL)
head(gene_len_sub)
# length
# TSPAN6 4536
# TNMD 1476
# DPM1 1207
identical(rownames(exp_count), rownames(gene_len_sub))
#[1] TRUE
##(3) count 2 TPM
exp_tpm = counts_to_tpm(exp_count, gene_len_sub)
##(4) count 2 FPKM
exp_fpkm = counts_to_fpkm(exp_count, gene_len_sub)
##(5) fpkm 2 tpm
exp_tpm2 = as.data.frame(apply(exp_fpkm, 2, fpkm_to_tpm))
|