1
2
3
4
5
6
packageVersion("Seurat")
# [1] ‘4.0.6’
library(Seurat)
library(tidyverse)
library(ggplot2)
library(clustree)

0、导入数据方式

(1)cellranger比对结果

对每个测序样本数据经cellranger上游比对,产生3个文件,分别是:

  • barcodes.tsv.gz – 细胞标签
  • features.tsv.gz – 基因名
  • matrix.mtx.gz – 表达数据
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
## GSE166635为例
dir="./data/HCC2/filtered_feature_bc_matrix/"
list.files(dir)
#[1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz" 

#(1) 先使用Read10X()对三文件整合为稀疏表达矩阵
counts <- Read10X(data.dir = dir)
#(2) 再使用CreateSeuratObject()创建Seurat对象
scRNA <- CreateSeuratObject(counts = counts)
scRNA

## 具体三个文件的格式:https://www.jianshu.com/p/5b26d7bc37b7

(2)直接提供表达矩阵

1
2
3
## GSE144320为例
scRNA <- CreateSeuratObject(counts = counts)
scRNA

(3)h5格式文件

1
2
3
4
5
## GSE138433为例
#(1) 读入表达矩阵
sce <- Read10X_h5(filename = "GSM4107899_LH16.3814_raw_gene_bc_matrices_h5.h5")
#(2) 转为Seurat对象
sce <- CreateSeuratObject(counts = sce)

(4)h5ad格式

  • 需要安装,使用SeuratDisk包的两个函数;
  • 先将后h5ad格式转换为h5seurat格式,再使用LoadH5Seurat()函数读取Seurat对象。
1
2
3
4
5
6
#remotes::install_github("mojaveazure/seurat-disk")
library(SeuratDisk)
Convert("GSE153643_RAW/GSM4648565_liver_raw_counts.h5ad", "h5seurat",
        overwrite = TRUE,assay = "RNA")
scRNA <- LoadH5Seurat("GSE153643_RAW/GSM4648565_liver_raw_counts.h5seurat")
#注意一下,我之前载入时,表达矩阵被转置了,需要处理一下~
  • 将Seurat对象转为h5ad格式
1
2
SaveH5Seurat(scRNA, filename = "scRNA.h5Seurat")
Convert("scRNA.h5Seurat", dest = "h5ad")
(5)10X PBMC demo
1
2
3
4
5
6
7
library(TENxPBMCData)
library(Seurat)
tenx_pbmc3k <- TENxPBMCData(dataset = "pbmc3k")
counts = as.matrix(assay(tenx_pbmc3k, "counts"))
rownames(counts) = rowData(tenx_pbmc3k)$Symbol_TENx
colnames(counts) = paste0("cell-",1:ncol(counts))
sce = CreateSeuratObject(counts = counts)

1、批量创建Seurat对象

(1)规范化10X文件样本名

每个样本一个文件夹,分别包含三个文件:barcodes.tsv.gz, features.tsv.gz, matrix.mtx.gz

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
list.files("GSE139324_RAW/")
# [1] "GSM4138110_HNSCC_1_PBMC_barcodes.tsv.gz" "GSM4138110_HNSCC_1_PBMC_genes.tsv.gz"    "GSM4138110_HNSCC_1_PBMC_matrix.mtx.gz"  
# [4] "GSM4138112_HNSCC_2_PBMC_barcodes.tsv.gz" "GSM4138112_HNSCC_2_PBMC_genes.tsv.gz"    "GSM4138112_HNSCC_2_PBMC_matrix.mtx.gz"  
# [7] "GSM4138114_HNSCC_3_PBMC_barcodes.tsv.gz" "GSM4138114_HNSCC_3_PBMC_genes.tsv.gz"    "GSM4138114_HNSCC_3_PBMC_matrix.mtx.gz"

fs=list.files('./GSE139324_RAW/','^GSM')
samples=str_split(fs,'_',simplify = T)[,1] %>% unique()
samples
# [1] "GSM4138110" "GSM4138112" "GSM4138114"

dir.create("data")
for (sp in samples) {
  # sp = samples[1]
  sp3 = fs[grepl(sp, fs)] # 顺序很重要
  newfolder=paste0("data/", sp)
  dir.create(newfolder,recursive = T)
  file.copy(paste0("GSE139324_RAW/",sp3[1]),
            paste0(newfolder,"/barcodes.tsv.gz"))
  file.copy(paste0("GSE139324_RAW/",sp3[2]),
            paste0(newfolder,"/features.tsv.gz"))
  file.copy(paste0("GSE139324_RAW/",sp3[3]),
            paste0(newfolder,"/matrix.mtx.gz"))
}

image-20220313105611455

(2)合并多样本
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
list.files("data/", recursive = T)
samples = list.files("data/")
# [1] "GSM4138110" "GSM4138112" "GSM4138114"
dirs = paste0("data/",samples)
scelist = lapply(samples, function(sp){
  # sp = samples[1]
  print(sp)
  dirs = paste0("data/",sp)
  sce = Read10X(data.dir = dirs) %>%  #读取counts矩阵
    CreateSeuratObject(project = sp) %>% #创建seurat对象
    RenameCells(add.cell.id = sp) #添加前缀,避免重复
  return(sce)
})
sce = merge(scelist[[1]], scelist[-1])
head(sce@meta.data)
sce = sce %>% 
  PercentageFeatureSet(., "^MT-", col.name = "percent_mito") %>% #线粒体基因比例
  PercentageFeatureSet(., "^RP[SL]", col.name = "percent_ribo") %>% #核糖体基因比例
  PercentageFeatureSet(., "^HB[^(P)]", col.name = "percent_hb") #血红蛋白基因比例
head(sce@meta.data)
(3)过滤细胞/基因
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
dim(sce) 
#[1] 33694  4899
#过滤细胞
sce = sce %>% 
  subset(., nFeature_RNA > 500) %>% 
  subset(., percent_mito < 10) %>% 
  subset(., percent_ribo > 3) %>%
  subset(., percent_hb < 0.1) 
#过滤基因
sce = sce[rowSums(sce@assays$RNA@counts>0)>3,]
dim(sce)
#[1] 16334  4751
sce
# An object of class Seurat 
# 16334 features across 4751 samples within 1 assay 
# Active assay: RNA (16334 features, 0 variable features)

2、标归高降维

(1)标归高

标准化–归一化–鉴定高变基因

1
2
3
4
sce = sce %>%
  NormalizeData() %>%
  FindVariableFeatures(nfeatures = 2000) %>%
  ScaleData(., vars.to.regress = "percent_mito")
(2)降维聚类分群
1
2
3
4
5
6
7
8
sce = sce %>% 
  RunPCA() %>% 
  RunUMAP(dims = 1:30) %>%  #RunTSNE
  FindNeighbors(dims = 1:30) %>% 
  FindClusters(resolution = c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1))
DimPlot(sce, reduction = "umap", group.by = "orig.ident")

# clustree(sce@meta.data, prefix = "RNA_snn_res.")
image-20220313211829987

3、Seurat结构

1
sce = sce_int
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#####表达矩阵#####
sce@assays
Assays(sce)
# [1] "RNA" "CCA"
DefaultAssay(sce)
DefaultAssay(sce) = "RNA"
DefaultAssay(sce)
sce@active.assay
GetAssayData(sce[["RNA"]], slot = "counts")




#####细胞meta信息#####
sce@meta.data %>% dim()
colnames(sce@meta.data)
# [1] "orig.ident"       "nCount_RNA"       "nFeature_RNA"     "percent_mito"     "percent_ribo"     "percent_hb"      
# [7] "RNA_snn_res.0.01" "RNA_snn_res.0.05" "RNA_snn_res.0.1"  "RNA_snn_res.0.2"  "RNA_snn_res.0.3"  "RNA_snn_res.0.5" 
# [13] "RNA_snn_res.0.8"  "RNA_snn_res.1"    "seurat_clusters"  "CCA_snn_res.0.01" "CCA_snn_res.0.05" "CCA_snn_res.0.1" 
# [19] "CCA_snn_res.0.2"  "CCA_snn_res.0.3"  "CCA_snn_res.0.5"  "CCA_snn_res.0.8"  "CCA_snn_res.1" 
head(sce[[]])
table(sce@active.ident)
sce = SetIdent(sce, value = "CCA_snn_res.0.1")
Idents(sce_int) <- "CCA_snn_res.0.05"

#####降维信息#####
names(sce@reductions)
#[1] "pca"  "umap"
DefaultDimReduc(sce)
sce@reductions$umap@cell.embeddings %>% head()
Embeddings(sce, reduction = "pca")[1:3,1:3]
sce[["pca"]][1:4,1:4]

#####FetchData取关于细胞的任何信息#####
FetchData(sce, vars = c("UMAP_1","UMAP_2")) %>% head()
FetchData(sce, vars = c("nCount_RNA","nFeature_RNA")) %>% head()
DefaultAssay(sce) = "RNA"
FetchData(sce, vars = c("CD4","CD8A","CD8B"), slot = "data") %>% head()

4、Seurat可视化

1
2
DefaultAssay(sce)="RNA"
Idents(sce) <- "CCA_snn_res.0.05"
(1)VlnPlot
1
2
3
4
# cell info
VlnPlot(sce, group.by = "orig.ident", 
        features = c("percent_mito", "nFeature_RNA"), 
        pt.size = 0, ncol = 2)
image-20220313213356312
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
# gene exp
VlnPlot(sce,
        features = c("CD4","CD8A","CD8B"),
        assay = "RNA", slot = "data", 
        pt.size = 0, ncol = 1)
### 修改展示顺序
# levels(sce_int@active.ident)
# sce_int@active.ident = factor(sce_int@active.ident,
#                               levels = c(5,4,3,2,1,0))
### 山脊图
# RidgePlot(sce_int, features = c("CD4","CD8A","CD8B"),
#           assay = "RNA", slot = "data")
image-20220313213549564
1
2
3
4
5
6
library(patchwork)
VlnPlot(sce,split.by = 'orig.ident',
        features = c("CD4","CD8A","CD8B"),
        assay = "RNA", slot = "data", 
        pt.size = 0,combine = FALSE) %>% 
  wrap_plots(ncol = 1)
image-20220313213751855
1
2
3
4
Idents(sce) = "RNA_snn_res.0.01"
sce$Group = ifelse(sce$orig.ident=="GSM4138110","before","after") #二分组变量
VlnPlot(sce, split.by = "Group", split.plot = TRUE,
        features = c("percent_mito", "nFeature_RNA"))
image-20230311083028141
(2)DotPlot
1
2
3
4
5
6
DotPlot(sce, features = c("CD4","CD8A","CD8B"),
        assay = "RNA") +
  coord_flip()
### 修改颜色
# DotPlot(sce, features = c("CD4","CD8A","CD8B"),
#         assay = "RNA", cols = c("lightgrey","red"))
image-20220313220121404
1
2
3
4
genelist = list(set1=c("CD3D","CD3E","CD3G"),
                set2=c("CD4","CD8A","CD8B"))
DotPlot(sce, features = genelist,
        assay = "RNA", scale = FALSE)    
image-20220313215943287
(3)Dimplot
1
2
DimPlot(sce, reduction = "umap", 
        group.by = "CCA_snn_res.0.01",label = T)
image-20220313220251357
1
2
3
DimPlot(sce, reduction = "umap", 
        group.by = "CCA_snn_res.0.01",
        split.by = "orig.ident") &  NoAxes()
image-20220313220333500
(4)FeaturePlot
1
FeaturePlot(sce, features = c("CD4","CD8A","CD8B"), ncol = 3)
image-20220313220532593
1
2
3
FeaturePlot(sce, features = c("CD4","CD8A","CD8B"), 
            split.by = "orig.ident",  ncol = 3,
            cols = c("lightgrey","red"))

image-20220313220618580

5、注释细胞类型

  • marker基因:Dotplot
  • 假设根据CCA_snn_res.0.05分群结果,将6个cluster注释为4中细胞类型
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
Idents(sce) <- "CCA_snn_res.0.05"
table(sce@active.ident)
# 0    1    2    3    4    5 
# 1595 1486 1250  315   56   49

sce$celltype = dplyr::case_when(
  sce@active.ident %in% c(0,1) ~ "celltypeA",
  sce@active.ident %in% c(2) ~ "celltypeB",
  sce@active.ident %in% c(3,4) ~ "celltypeC",
  sce@active.ident %in% c(5) ~ "celltypeD")

table(sce$celltype)
# celltypeA celltypeB celltypeC celltypeD 
# 3081      1250       371        49

6、差异分析

1
2
3
DefaultAssay(sce)="RNA"
Idents(sce) <- "CCA_snn_res.0.05"
table(sce@active.ident)
(1)FindAllMarkers
1
2
3
4
5
6
7
8
diff_wilcox = FindAllMarkers(sce, assay = "RNA", slot = "data", 
                             only.pos = T) %>% 
  dplyr::group_by(cluster) %>%
  dplyr::arrange(desc(avg_log2FC), .by_group = T)
top_n <- diff_wilcox %>% 
  group_by(cluster) %>% 
  top_n(5, avg_log2FC)
DoHeatmap(sce, top_n$gene, size = 3) 
image-20220313221723752
1
2
3
4
mapal <- colorRampPalette(RColorBrewer::brewer.pal(11,"RdBu"))(256)
DoHeatmap(subset(sce, downsample = 100), top_n$gene, 
          angle = 90,size = 3) +
  scale_fill_gradientn(colours = rev(mapal)) & NoLegend()
image-20220313221934424
(2)FindMarkers
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# 0 vs the other cluster
diff_wilcox_cluster0 = FindMarkers(sce, ident.1 = 0,
                                   assay = "RNA", slot = "data") 
head(diff_wilcox_cluster0)

# 0 vs 1
diff_wilcox_cluster0 = FindMarkers(sce, ident.1 = 0, ident.2 = 1,
                                   assay = "RNA", slot = "data") 
VlnPlot(subset(sce, CCA_snn_res.0.05 %in% c(0,1)), features = c("RPL11","CD2"), 
        assay = "RNA", slot = "data", group.by = "CCA_snn_res.0.05",
        pt.size = 0)
image-20220313222944096
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# 某一类细胞(celltypeA)在两个样本(GSM4138110,GSM4138112)的差异基因
Idents(sce)="celltype"
markers <- FindMarkers(sce,  subset.ident = "celltypeA", group.by = "orig.ident",
                       ident.1 = "GSM4138110", ident.2 = "GSM4138112")
head(markers)
VlnPlot(subset(sce, celltype %in% c("celltypeA") &
               orig.ident %in% c("GSM4138110","GSM4138112")), 
        features = c("RPS26","IGHA1"), 
        assay = "RNA", slot = "data", group.by = "orig.ident",
        pt.size = 0)
image-20220313223137651
(3)FindConservedMarkers
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
Idents(sce)="celltype"
# 在每个样本中orig.ident, celltypeA与celltypeB相比均差异表达的基因
markers = FindConservedMarkers(sce, ident.1 = "celltypeA", ident.2 = "celltypeB", 
                               assay = "RNA", slot = "data",
                               grouping.var = "orig.ident")

# 在每个样本中orig.ident, celltypeA与其它类型细胞相比均差异表达的基因
markers = FindConservedMarkers(sce, ident.1 = "celltypeA", 
                               assay = "RNA", slot = "data",
                               grouping.var = "orig.ident")

7、多线程

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
library(future)
#查看初始状态
plan()

#设置4个线程
plan("multiprocess", workers = 4)
options(future.globals.maxSize = 1024^4)
plan()

#恢复单线程
plan("sequential")
plan()

image-20220313224458262

8、基因集打分

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
## (1) Seurat包自带的`AddModuleScore()`打分函数,结果有正负值
sce_pw_score = AddModuleScore(sce, features=pw.list, seed=42)
head(sce_pw_score[[]])

## (2) AUCell包评价基因集活性,结果在0~1之间
BiocManager::install("AUCell")
library(AUCell)
# step1:
cells_rankings <- AUCell_buildRankings(sce@assays$RNA@data)
# step2:
cells_AUC <- AUCell_calcAUC(pw.list, cells_rankings, aucMaxRank=nrow(cells_rankings)*0.1, nCores=5)
# step3:
sce_pw_auc = getAUC(cells_AUC)