Seurat V5版本有一段时间了,由于时间原因未来得及了解。现根据其官方文档简单整理其用法,与V4比较类似的地方就不多叙述了。此外,V5的亮点之一还在于单细胞多组学的整合分析,此次就不做记录了。(PS:中秋快乐~)

主要参考Seurat官方文档:

0. 安装

  • 首先是安装R语言环境(linux环境),使用conda命令
1
2
3
4
5
6
conda create -n r_env -y
conda activate r_env

conda install -c r r-base=4.2.2
conda install conda-forge::r-seurat
conda install r::r-tidyverse
  • 其次,安装其它推荐的工具包
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
# rcurl使用conda安装
setRepositories(ind = 1:3, addURLs = c('https://satijalab.r-universe.dev', 'https://bnprks.r-universe.dev/'))
install.packages(c("presto", "glmGamPoi"))

# conda install -c anaconda hdf5
install.packages(c("BPCells"))

# BiocManager::install("Rsamtools")
install.packages('Signac')
remotes::install_github("satijalab/seurat-data", quiet = TRUE) #提供示例数据集
# remotes::install_github("satijalab/azimuth", quiet = TRUE) #自动注细胞类型的R包,先不安装了
remotes::install_github("satijalab/seurat-wrappers", quiet = TRUE)
  • 分析前,加载相关初始包
1
2
3
4
library(Seurat)
library(dplyr)
library(patchwork)
library(ggplot2)
  • 示例分析数据
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
library(SeuratData)
# AvailableData()
# InstallData("pbmc3k")
# InstallData("ifnb")

library(pbmc3k.SeuratData)
data('pbmc3k')
pbmc3k = UpdateSeuratObject(pbmc3k)
pbmc3k
pbmc = pbmc3k

library(ifnb.SeuratData)
data('ifnb')
ifnb = UpdateSeuratObject(ifnb)
ifnb

1. Layer结构

  • Seurat V4/3使用的还是v3版本的Seurat Object;
  • Seurat V5提出了新的v5版本的Seurat Object:
    • 主要区别之一是在Assays层级结构下,明确定义了Layer结构
    • 每个Seurat可以包含多个Assays,每个Assay可以包含多个Layers

1.1 创建v3/5对象

1
2
3
4
5
6
7
pbmc.counts <- Read10X(data.dir = "filtered_gene_bc_matrices/hg19/")
class(pbmc.counts)
# [1] "dgCMatrix"
# attr(,"package")
# [1] "Matrix"
dim(pbmc.counts)
# [1] 32738  2700
  • 直接创建Seurat Object
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
## v3
options(Seurat.object.assay.version = "v3")
pbmc <- CreateSeuratObject(counts = pbmc.counts)
class(pbmc[["RNA"]])
# [1] "Assay"
# attr(,"package")
# [1] "SeuratObject"

## v5
options(Seurat.object.assay.version = "v5")
pbmc <- CreateSeuratObject(counts = pbmc.counts)
class(pbmc[["RNA"]])
# [1] "Assay5"
# attr(,"package")
# [1] "SeuratObject"
  • 先创建Assay Object,再创建Seurat Object
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
# create a v3 assay
assay.v3 <- CreateAssayObject(counts = pbmc.counts)
class(assay.v3)

# create a v5 assay
assay.v5 <- CreateAssay5Object(counts = pbmc.counts)
class(assay.v5)

# # create an assay using only normalized data
# assay.v5 <- CreateAssay5Object(data = log1p(pbmc.counts))

# create a Seurat object based on this assay
pbmc3k_slim <- CreateSeuratObject(assay.v5)
pbmc3k_slim

# 加入一个新的Assay(v3) 
pbmc3k_slim[['RNA3']] = as(object = pbmc3k_slim[["RNA"]], Class = "Assay5")

1.2 Layers相关操作

  • 查看
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
obj <- CreateSeuratObject(counts = pbmc.counts)
obj
# An object of class Seurat 
# 32738 features across 2700 samples within 1 assay 
# Active assay: RNA (32738 features, 0 variable features)
#  1 layer present: counts

DefaultAssay(obj)
# [1] "RNA"
DefaultLayer(obj[['RNA']])
# [1] "counts"

Layers(obj[["RNA"]])
# [1] "counts"

obj <- NormalizeData(obj, verbose = FALSE)
Layers(obj[["RNA"]])
# [1] "counts" "data"
  • 拆分
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
pbmc3k$batch <- sample(c("batchA", "batchB", "batchC"), ncol(pbmc3k), replace = TRUE)
# Split layers by batch
pbmc3k[["RNA"]] <- split(pbmc3k[["RNA"]], f = pbmc3k$batch)
Layers(pbmc3k[["RNA"]])
# [1] "data.batchC"   "data.batchA"   "data.batchB"   "counts.batchC"
# [5] "counts.batchA" "counts.batchB"

# Rejoin layers
pbmc3k[["RNA"]] <- JoinLayers(pbmc3k[["RNA"]])
Layers(pbmc3k[["RNA"]])
# [1] "data"   "counts"

# create a Seurat object initialized with multiple layers
batchA_counts <- pbmc.counts[, 1:200]
batchB_counts <- pbmc.counts[, 201:400]
batchC_counts <- pbmc.counts[, 401:600]
count_list <- list(batchA_counts, batchB_counts, batchC_counts)
names(count_list) <- c("batchA", "batchB", "batchC")

obj <- CreateSeuratObject(counts = count_list)
Layers(obj[["RNA"]])
# [1] "counts.batchA" "counts.batchB" "counts.batchC"

2. 常用操作

2.1 标准步骤

  • 逐步处理
1
2
3
4
5
6
7
8
pbmc <- NormalizeData(object = pbmc)
pbmc <- FindVariableFeatures(object = pbmc)
pbmc <- ScaleData(object = pbmc)
pbmc <- RunPCA(object = pbmc)
pbmc <- FindNeighbors(object = pbmc, dims = 1:30)
pbmc <- FindClusters(object = pbmc)
pbmc <- RunUMAP(object = pbmc, dims = 1:30)
DimPlot(object = pbmc, reduction = "umap")
  • SCT简化
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# https://github.com/satijalab/seurat/issues/8183
# https://blog.csdn.net/weixin_46128755/article/details/136395843
# matrixStats版本问题会导致SCTransform运行出错,这里选择安装1.1.0版本
pbmc <- SCTransform(object = pbmc)
pbmc <- RunPCA(object = pbmc)
pbmc <- FindNeighbors(object = pbmc, dims = 1:30)
pbmc <- FindClusters(object = pbmc)
pbmc <- RunUMAP(object = pbmc, dims = 1:30)
DimPlot(object = pbmc, reduction = "umap")

# 或者
pbmc <- SCTransform(pbmc) %>%
    RunPCA() %>%
    FindNeighbors(dims = 1:30) %>%
    FindClusters() %>%
    RunUMAP(dims = 1:30)

2.2 查看结构

  • 细胞/基因
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# cell names
colnames(pbmc) |> head()
Cells(pbmc) |> head()

# feature names
Features(pbmc) |> head()
rownames(pbmc) |> head()

# 高变基因
# Variable feature names
VariableFeatures(pbmc) |> str()
VariableFeatures(pbmc[["SCT"]]) |> str()
VariableFeatures(pbmc[["RNA"]]) |> str()
# VariableFeatures(pbmc, assay = 'SCT') 
var.gene.names = VariableFeatures(pbmc)[1:1000]
VariableFeatures(pbmc[["RNA"]]) = var.gene.names
  • meta.data
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
colnames(pbmc@meta.data)
# [1] "orig.ident"         "nCount_RNA"         "nFeature_RNA"      
# [4] "seurat_annotations" "nCount_SCT"         "nFeature_SCT"      
# [7] "SCT_snn_res.0.8"    "seurat_clusters"
Idents(object = pbmc) <- "seurat_annotations"
# View cell identities, get summary table
table(Idents(pbmc))
#  Naive CD4 T Memory CD4 T   CD14+ Mono            B        CD8 T FCGR3A+ Mono 
#          697          483          480          344          271          162 
#           NK           DC     Platelet 
#          155           32           14 

Idents(pbmc) <- "CD4 T cells"
table(Idents(pbmc))
# 逆操作
pbmc[["old.ident"]] <- Idents(object = pbmc)
colnames(pbmc@meta.data)

# Rename identity classes
pbmc <- RenameIdents(object = pbmc, `CD4 T cells` = "T Helper cells")
Idents(object = pbmc) |> table()
  • 矩阵信息
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
## Retrieve data in an expression matrix RNA counts matrix
pbmc[["RNA"]]$counts |> dim()
# [1] 13714  2700
pbmc[["SCT"]]$counts |> dim()
# [1] 12572  2700
LayerData(pbmc, assay = "RNA", layer = "counts") |> dim()
# [1] 13714  2700
# GetAssayData from Seurat v4 is still supported
GetAssayData(object = pbmc, assay = "RNA", slot = "counts") |> dim()

# Set expression data assume new.data is a new expression matrix
pbmc[["RNA"]]$counts <- new.data

## 降维信息
pbmc@reductions |> names()
# [1] "pca"  "umap"

pbmc[['pca']]@cell.embeddings |> head()
Embeddings(pbmc, reduction = "pca") |> head()

pbmc[['pca']]@feature.loadings |> dim()
Loadings(pbmc, reduction = "pca") |> dim()

2.3 增删操作

  • 提取特定feature
1
2
3
4
5
6
7
FetchData(object = pbmc, 
          vars = c("PC_1", "nFeature_RNA", "MS4A1"), 
          layer = "counts") |> head(3)
#                     PC_1 nFeature_RNA MS4A1
# AAACATACAACCAC 10.165883          779     0
# AAACATTGAGCTAC  5.812925         1352     4
# AAACATTGATCAGC  8.566106         1129     0
  • 筛选cells
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
Idents(object = pbmc) <- "seurat_annotations"

subset(x = pbmc, idents = "B") |> dim()
subset(x = pbmc, idents = "B", invert = TRUE) |> dim()

# Subset on the expression level of a gene/feature
subset(x = pbmc, subset = MS4A1 > 0.5) |> dim()

# Subset on a combination of criteria
subset(x = pbmc, subset = MS4A1 > 0.5 & PC_1 > 5) |> dim()
subset(x = pbmc, subset = MS4A1 > 0.5, idents = "B") |> dim()

# Subset on a value in the object meta data
subset(x = pbmc, subset = groups == "g1") |> dim()

# Downsample the number of cells per identity class
subset(x = pbmc, downsample = 100) |> dim()
  • 拆分与合并
    • 在1.2小结,了解到可以按Layer拆分,但仍属于同一个Object
    • 下面的命令可以拆分独立的多个Object
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
## 拆分
ifnb_list <- SplitObject(ifnb, split.by = "stim")
# [1] "list"
ifnb_list$CTRL
ifnb_list$STIM

## 合并
merged_obj <- merge(x = ifnb_list$CTRL, y = ifnb_list$STIM)
merged_obj
# An object of class Seurat 
# 14053 features across 13999 samples within 1 assay 
# Active assay: RNA (14053 features, 0 variable features)
#  4 layers present: data.1, data.2, counts.1, counts.2

merged_obj[["RNA"]] <- JoinLayers(merged_obj[["RNA"]])
merged_obj
# An object of class Seurat 
# 14053 features across 13999 samples within 1 assay 
# Active assay: RNA (14053 features, 0 variable features)
#  2 layers present: data, counts

##### Merge objects (with integration):去批次效应
merged_obj <- merge(x = ifnb_list$CTRL, y = ifnb_list$STIM)
merged_obj <- NormalizeData(merged_obj)
merged_obj <- FindVariableFeatures(merged_obj)
merged_obj <- ScaleData(merged_obj)
merged_obj <- RunPCA(merged_obj)
#去批次整合(上面的标准流都是对于每个Layer独立操作的)
merged_obj <- IntegrateLayers(object = merged_obj, method = RPCAIntegration, 
                              orig.reduction = "pca", new.reduction = "integrated.rpca",
                              verbose = FALSE)
merged_obj
# now that integration is complete, rejoin layers
merged_obj[["RNA"]] <- JoinLayers(merged_obj[["RNA"]])

Seurat V5版本共支持5种去批次效应的方法:CCAIntegration, RPCAIntegration, HarmonyIntegration, FastMNNIntegration, scVIIntegration,可参考:https://satijalab.org/seurat/articles/seurat5_integration

  • Pseudobulk计算
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
bulk <- AggregateExpression(ifnb, group.by = "seurat_annotations", return.seurat = TRUE)
Cells(bulk)
#  [1] "CD14 Mono"    "CD4 Naive T"  "CD4 Memory T" "CD16 Mono"    "B"           
#  [6] "CD8 T"        "T activated"  "NK"           "DC"           "B Activated" 
# [11] "Mk"           "pDC"          "Eryth"  
dim(bulk)
# [1] 14053    13

bulk <- AggregateExpression(ifnb, group.by = "seurat_annotations", return.seurat = FALSE)
class(bulk$RNA)
# [1] "dgCMatrix"
# attr(,"package")
# [1] "Matrix"


# pseudobulk cells by stimulation condition AND cell type
bulk <- AggregateExpression(ifnb, group.by = c("stim", "seurat_annotations"), return.seurat = TRUE)
Cells(bulk)

基于Pseudobulk结果,可采用DESeq2方法进行差异基因检验。可参考:https://satijalab.org/seurat/articles/de_vignette

2.4 可视化

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
DimPlot(object = pbmc, reduction = "pca")

# Dimensional reduction plot, with cells colored by a quantitative feature Defaults to UMAP if available
FeaturePlot(object = pbmc, features = "MS4A1")

# Scatter plot across single cells
FeatureScatter(object = pbmc, feature1 = "MS4A1", feature2 = "PC_1")
FeatureScatter(object = pbmc, feature1 = "MS4A1", feature2 = "CD3D")

# Scatter plot across individual features, repleaces CellPlot
CellScatter(object = pbmc, cell1 = "AGTCTACTAGGGTG", cell2 = "CACAGATGGTTTCT")

VariableFeaturePlot(object = pbmc)

# Violin and Ridge plots
VlnPlot(object = pbmc, features = c("LYZ", "CCL5", "IL32"))
RidgePlot(object = pbmc, feature = c("LYZ", "CCL5", "IL32"))


# Heatmaps (visualize scale.data slot)
DimHeatmap(object = pbmc, reduction = "pca", cells = 200)

# standard workflow
var.gene.names = VariableFeatures(pbmc)[1:50]
pbmc <- ScaleData(pbmc, features = var.gene.names)
# https://github.com/satijalab/seurat/issues/2722
DoHeatmap(object = pbmc, label = F)

# heatmap with maximum of 10 cells per group
DoHeatmap(pbmc, var.gene.names, cells = colnames(subset(pbmc, downsample = 10)), label = F)
  • 绘制Dotplot时,注意分组变量不能包含NA值
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
table(is.na(pbmc$seurat_annotations))
# FALSE  TRUE 
#  2638    62 
Idents(pbmc) <- as.character(Idents(pbmc))
pbmc$seurat_annotations_2 = pbmc$seurat_annotations
pbmc$seurat_annotations_2 = as.character(pbmc$seurat_annotations)
pbmc$seurat_annotations_2[is.na(pbmc$seurat_annotations_2)] = 'Other'

Idents(pbmc) = pbmc$seurat_annotations_2

VlnPlot(object = pbmc, features = "MS4A1", split.by = "groups")
DotPlot(object = pbmc, features = c("LYZ", "CCL5", "IL32"), split.by = "groups")