1、计算公式

sample 1 sample 2 …….. sample k
Gene 1 10 12 30
Gene 2 20 25 60
……
Gene n 0 0 1

对于(n*k)表达矩阵中k个样本的n个基因的count表达数据。

在任意样本(列)中,基因i的长度为l,count表达值为q;则基因i的FPKM与TPM标准化方式计算如下

$$ \begin{align} & FPKM_i = \frac{\frac{q_i}{l_i/10^3}}{\sum_{j=1}^nq_j/10^6} = \frac{\frac{q_i}{l_i}}{{\sum_{j=1}^nq_j}}*10^9 \tag{1} \ \ & TPM_i = \frac{\frac{q_i}{l_i/10^3}}{\sum_{j=1}^n(\frac{q_j}{l_j/10^3})/10^6} = \frac{\frac{q_i}{l_i}}{\sum_{j=1}^n(\frac{q_j}{l_j})} * 10^6 \tag{2}

\end{align} $$

  • FPKM : fragments per kilobase of exon per million mapped fragments.
  • TPM:transcript per million, and the sum of all TPM values is the same in all samples.

2、基因长度

2.1 gtf注释外显子之和

  • 参考https://www.biostars.org/p/456800/,计算基因的所有外显子长度之和作为基因长度。
  • 基因以及对应外显子坐标信息来自基因组注释文件gtf,下载方式参考之前Refgenie笔记。
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
library(GenomicFeatures) 
#如下选取的是hg19版本的注释基因组文件
txdb <- makeTxDbFromGFF("gencode.v32lift37.annotation.gtf",format="gtf")
exons_per_gene <- exonsBy(txdb,by="gene")
gene_len_from_exon <- as.data.frame(sum(width(reduce(exons_per_gene))))
colnames(gene_len_from_exon)[1] = "length"
gene_len_from_exon$ENSEMBL = str_split(rownames(gene_len_from_exon),"[.]",
                                        simplify = T)[,1]
head(gene_len_from_exon)
#                      length         ENSEMBL
# ENSG00000000003.15_3   4536 ENSG00000000003
# ENSG00000000005.6_3    1476 ENSG00000000005
# ENSG00000000419.12_4   1207 ENSG00000000419

##接下来将基因ENSEMBL名转为Symbol格式
gene_ids<-AnnotationDbi::select(org.Hs.eg.db, keys=gene_len_from_exon$ENSEMBL, 
                                columns="SYMBOL", #目标格式
                                keytype="ENSEMBL") #目前的格式
gene_ids = gene_ids %>% 
  dplyr::distinct(ENSEMBL, .keep_all = T) %>% 
  na.omit()
gene_len = dplyr::left_join(gene_len_from_exon, gene_ids) %>% 
  dplyr::distinct(ENSEMBL, .keep_all = T) %>% 
  dplyr::distinct(SYMBOL, .keep_all = T)
dim(gene_len)
# [1] 34102     3
head(gene_len)
#   length         ENSEMBL   SYMBOL
# 1   4536 ENSG00000000003   TSPAN6
# 2   1476 ENSG00000000005     TNMD
# 3   1207 ENSG00000000419     DPM1

2.2 TCGAbiolinks包内置的基因长度信息

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
##(1) hg19
geneInfo_hg19 = TCGAbiolinks::geneInfo %>% na.omit() %>% 
  as.data.frame()
head(geneInfo_hg19)

##(1) hg38
geneInfo_hg38 = TCGAbiolinks::geneInfoHT %>% 
  tibble::rownames_to_column("ENSEMBL")
library(org.Hs.eg.db)
gene_ids = AnnotationDbi::select(org.Hs.eg.db, keys=geneInfo_hg38$ENSEMBL, 
                                columns=c("SYMBOL"), #目标格式
                                keytype="ENSEMBL") #目前的格式
gene_ids = gene_ids %>% 
  distinct(ENSEMBL, .keep_all = T) %>% 
  distinct(SYMBOL, .keep_all = T) %>% 
  na.omit()
head(gene_ids)
geneInfo_hg38 = dplyr::inner_join(geneInfo_hg38, gene_ids) %>% 
  tibble::column_to_rownames("SYMBOL")
head(geneInfo_hg38)

3、定义函数

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
##(1) counts_to_tpm
counts_to_tpm <- function(counts, geneLength) {
  # Process one column at a time.
  tpm <- do.call(cbind, lapply(1:ncol(counts), function(i) {
    # log的减法相当于除法:每一列(单个样本的count)/对应基因的长度
    rate = log(counts[,i]) - log(geneLength) 
    #上一步更新后的新的细胞文库大小
    denom = log(sum(exp(rate))) 
    #按照细胞文库标准化
    exp(rate - denom + log(1e6)) 
  }))
  colnames(tpm) <- colnames(counts)
  rownames(tpm) <- rownames(counts)
  return(tpm)
}

##(2) counts_to_fpkm
counts_to_fpkm <- function(counts, geneLength) {
  fpkm <- do.call(cbind, lapply(1:ncol(counts), function(i) {
    N <- sum(counts[,i])
    exp(log(counts[,i]) - log(geneLength) - log(N) + log(1e9))
  }))
  colnames(fpkm) <- colnames(counts)
  rownames(fpkm) <- rownames(counts)
  return(fpkm)
}

##(3) fpkm_to_Tpm
fpkm_to_tpm <- function(fpkm) {
  exp(log(fpkm) - log(colsum(fpkm)) + log(1e6))
}

4、示例分析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
##(1)基因表达矩阵
exp_count = fread("GEO/GSE81861_CRC_NM_all_cells_COUNT.csv.gz",data.table = F) %>%
  dplyr::mutate(gene=str_split(V1,"_",simplify = T)[,2]) %>% 
  dplyr::distinct(gene, .keep_all = T) %>%
  tibble::column_to_rownames("gene") %>% 
  dplyr::select(!V1) %>% 
  dplyr::filter(rownames(.) %in% gene_len$SYMBOL)
dim(exp_count)
# [1] 26258   266
exp_count[1:4,1:4]
#        RHC3934__Bcell__.7DEA7B RHC3944__Bcell__.7DEA7B RHC3962__Tcell__.C6E879 RHC4003__Bcell__.7DEA7B
# TSPAN6                       0                       0                       0                       0
# TNMD                         0                       0                       0                       0
# DPM1                         0                       0                       0                       0
# SCYL3                        0                       0                       0                       1


##(2)基因长度
gene_len_sub = gene_len[match(rownames(exp_count),gene_len$SYMBOL), ]
gene_len_sub = data.frame(length=gene_len_sub$length,
                          row.names = gene_len_sub$SYMBOL)
head(gene_len_sub)
#          length
# TSPAN6     4536
# TNMD       1476
# DPM1       1207
identical(rownames(exp_count), rownames(gene_len_sub))
#[1] TRUE

##(3) count 2 TPM
exp_tpm = counts_to_tpm(exp_count, gene_len_sub)

##(4) count 2 FPKM
exp_fpkm = counts_to_fpkm(exp_count, gene_len_sub)

##(5) fpkm 2 tpm
exp_tpm2 = as.data.frame(apply(exp_fpkm, 2, fpkm_to_tpm))