RNA velocity(RNA速率)分析是基于单细胞转录组测序数据分析细胞发育状态动力学的方法。简单来说,该模型根据测序read片段属于unspliced pre-mrna 以及spliced mature mrna 的比例推测单细胞的发育轨迹。 高比例的unspliced pre-mrna 的占比越高,velocity速率越大,表明在之后阶段中将产生高表达趋势。
如下将主要学习分析流程,算法原理可参考原始论文:
分析流程参考教程:
上游比对:https://velocyto.org/velocyto.py/index.html
下游分析:https://smorabit.github.io/tutorials/8_velocyto/
1、上游比对#
单细胞测序技术有很多,如下以常用的10X genomics的测序结果为例。
首先根据原始测序结果fastq.gz,经10X官方的cellranger软件进行比对(使用方法参考之前笔记)
根据filtered_feature_bc_matrix
文件夹结果使用Seurat包进行单细胞常规分析;
根据比对的bam文件,使用velocyto软件分别识别出比对到unspliced与spliced mRNA。
示例数据是GSE178911中的GSM5400792样本,为人体急性髓系白血病(AML)骨髓细胞。已经过cellranger比对,结果保存在201008_D0_scRNA_fastq
文件夹。
1
2
3
4
5
6
7
8
9
10
11
12
13
tree -L 1 ./201008_D0_scRNA_fastq/outs/
# ./201008_D0_scRNA_fastq/outs/
# ├── cellsorted_possorted_genome_bam.bam
# ├── filtered_feature_bc_matrix
# ├── filtered_feature_bc_matrix.h5
# ├── filtered_feature_bc_matrix.tar.gz
# ├── metrics_summary.csv
# ├── molecule_info.h5
# ├── possorted_genome_bam.bam
# ├── possorted_genome_bam.bam.bai
# ├── raw_feature_bc_matrix
# ├── raw_feature_bc_matrix.h5
# └── web_summary.html
copy
1.1 velocyto分析#
1
2
3
4
5
6
7
8
9
10
# (1) 参考下面链接,建立软件环境
# https://velocyto.org/velocyto.py/install/index.html
# (2) 准备三份数据
#
cellranger_outDir=/path/to/201008_D8Ven_scRNA_fastq
#
cellranger_gtf=/path/to/refdata-gex-GRCh38-2020-A/genes/genes.gtf
#
rmsk_gtf=/path/to/hg38_repeat_rmsk.gtf
copy
rmsk_gtf下载链接:https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=611454127_NtvlaW6xBSIRYJEBI0iRDEWisITa&clade=mammal&org=Human&db=0&hgta_group=allTracks&hgta_track=rmsk&hgta_table=rmsk&hgta_regionType=genome&position=&hgta_outputType=gff&hgta_outFileName=mm10_rmsk.gtf
1
2
3
4
5
6
7
# (3) 开始分析
velocyto run10x -m $rmsk_gtf $cellranger_outDir $cellranger_gtf
# (4) 分析结果
tree ./201008_D0_scRNA_fastq/velocyto/
# ./201008_D0_scRNA_fastq/velocyto/
# └── 201008_D0_scRNA_fastq.loom
copy
如上,每个样本都会产生一个loom文件结果。
1.2 Seurat分析#
使用Seurat包完成基本分析,主要包括UMAP降维与分群两步
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
library(Seurat)
library(tidyverse)
counts = Read10X("filtered_feature_bc_matrix/" )
sce = CreateSeuratObject(counts)
dim(sce)
# [1] 36601 6005
sce = sce %>%
NormalizeData() %>%
FindVariableFeatures(nfeatures = 2000 ) %>%
ScaleData()
sce = sce %>%
RunPCA() %>%
RunUMAP(dims = 1 :30 ) %>% #RunTSNE
FindNeighbors(dims = 1 :30 ) %>%
FindClusters(resolution = c(0.5 ))
table(sce$seurat_clusters)
# 0 1 2 3 4 5 6 7 8 9
# 1250 1077 827 621 599 541 397 386 215 92
DimPlot(sce)
saveRDS(sce, file = "sce.rds" )
copy
2、下游分析#
以下在jupyter notebook中执行
2.1 读入数据#
参考之前scanpy学习笔记,导入Seurat对象为Anndata
1
2
3
4
5
6
7
8
9
import scanpy as sc
import numpy as np
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython
copy
1
2
3
4
%%R
library(Seurat)
sce = readRDS("./201008_D0_scRNA_fastq/sce.rds" )
sce
copy
1
2
3
4
%%R -o adata
adata = as .SingleCellExperiment(sce)
adata
copy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import scvelo as scv
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
ldata1 = scv.read('./201008_D0_scRNA_fastq/velocyto/201008_D0_scRNA_fastq.loom' )
barcodes = [bc.split(':' )[1 ] for bc in ldata1.obs.index.tolist()]
barcodes = [bc[0 :len (bc)- 1 ] + '-1' for bc in barcodes]
barcodes
ldata1.obs.index = barcodes
ldata1.var_names_make_unique()
copy
1
2
3
4
5
6
7
8
9
adata = scv.utils.merge(adata, ldata1)
adata
sc.pl.umap(adata, color='seurat_clusters' ,
legend_loc='on data' )
copy
2.2 scvelo速率分析#
(1)统计每个cluster的spliced与unspliced比例
1
scv.pl.proportions(adata, groupby='seurat_clusters' )
copy
1
2
3
4
5
6
7
8
9
10
11
12
13
scv.pp.filter_and_normalize(adata)
scv.pp.moments(adata)
scv.tl.velocity(adata, mode='stochastic' )
scv.tl.velocity_graph(adata)
adata
copy
1
2
3
scv.pl.velocity_embedding_grid(adata, basis='umap' , color='seurat_clusters' ,
scale=0.25 )
copy
1
2
3
scv.pl.velocity_embedding_stream(adata, basis='umap' ,
color='seurat_clusters' )
copy
1
2
3
4
5
6
scv.tl.rank_velocity_genes(adata, groupby='seurat_clusters' , min_corr=.3 )
df = scv.DataFrame(adata.uns['rank_velocity_genes' ]['names' ])
df.head()
scv.pl.scatter(adata, df['2' ][:3 ], frameon=True ,
color='seurat_clusters' , size=10 , linewidth=1.5 )
copy
1
2
scv.tl.velocity_pseudotime(adata)
scv.pl.scatter(adata, color='velocity_pseudotime' , cmap='gnuplot' )
copy
1
2
3
scv.tl.paga(adata, groups='seurat_clusters' )
scv.pl.paga(adata, basis='umap' , size=50 , alpha=.1 ,
min_edge_width=2 , node_size_scale=1.5 )
copy
1
2
3
cur_celltypes = ["0" ]
adata_subset = adata[adata.obs['seurat_clusters' ].isin(cur_celltypes)]
sc.pl.umap(adata_subset)
copy
1
2
3
4
5
6
sc.pp.neighbors(adata_subset, n_neighbors=15 , use_rep='X_pca' )
scv.pp.filter_and_normalize(adata_subset)
scv.pp.moments(adata_subset)
scv.tl.recover_dynamics(adata_subset)
scv.pl.velocity_embedding_stream(adata_subset, basis='umap' )
copy
1
2
3
4
df = adata_subset.var
df = df[(df['fit_likelihood' ] > .1 ) & df['velocity_genes' ] == True ]
top_genes = adata_subset.var['fit_likelihood' ].sort_values(ascending=False ).index[:300 ]
scv.pl.scatter(adata_subset, basis=top_genes[:15 ], ncols=5 , frameon=True )
copy