scanpy是单细胞数据分析的python工具包,将数据以Anndata对象的格式进行储存。如下将学习Anndata对象操作以及scanpy分析的基础用法。

1
2
3
import numpy as np
import pandas as pd
import anndata as ad

1、数据导入

  • (1)10X的三件套数据
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz
# tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz


adata = sc.read_10x_mtx(
    'filtered_gene_bc_matrices/hg19/',  # the directory with the `.mtx` file
    var_names='gene_symbols')           # use gene symbols for the variable names (variables-axis index)
adata
# AnnData object with n_obs × n_vars = 2700 × 32738
#     var: 'gene_ids'
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
## 同样是上述示例数据,先用R包Seurat处理,储存结果
library(Seurat)
pbmc.data = Read10X(data.dir = "filtered_gene_bc_matrices/hg19/")
pbmc = CreateSeuratObject(counts = pbmc.data, project = "pbmc3k")
pbmc = NormalizeData(pbmc) %>% 
  FindVariableFeatures()
pbmc = pbmc %>% 
  ScaleData() %>% 
  RunPCA() %>% 
  RunUMAP(dims = 1:30) %>% 
  FindNeighbors(dims = 1:30) %>%
  FindClusters(resolution = 0.1)
head(pbmc@meta.data)
saveRDS(pbmc, file = "pbmc.rds")
1
2
3
4
5
6
7
8
9
## 准备Python环境
import scanpy as sc
import numpy as np
import anndata2ri
# Activate the anndata2ri conversion between SingleCellExperiment and AnnData
anndata2ri.activate()
#Loading the rpy2 extension enables cell magic to be used
#This runs R code in jupyter notebook cells
%load_ext rpy2.ipython
1
2
3
4
5
## 读入对象
%%R
library(Seurat)
pbmc = readRDS("pbmc.rds")
pbmc
1
2
3
4
5
## 转换对象
%%R -o pbmc_sce
#convert the Seurat object to a SingleCellExperiment object
pbmc_sce <- as.SingleCellExperiment(pbmc)
pbmc_sce
1
2
3
4
5
6
## 查看转换后的anndata对象
pbmc_sce
# AnnData object with n_obs × n_vars = 2700 × 32738
#     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'RNA_snn_res.0.1', 'seurat_clusters', 'ident'
#     obsm: 'X_pca', 'X_umap'
#     layers: 'logcounts'

2、Anndata结构

https://anndata-tutorials.readthedocs.io/en/latest/getting-started.html

以上述1.2的pbmc_sce对象为例学习anndata结构组成

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
adata = pbmc_sce

# (1) 矩阵
# !注意:行是细胞,列是基因。与Seurat读取结果相反!
adata.obs_names #行名(细胞名)
adata.var_names #列名(基因名)
## 原始信息以稀疏矩阵形式储存
adata.X
print(adata.X)
adata.X.todense()

adata.layers["log_transformed"] = np.log1p(adata.X)
adata.to_df(layer="log_transformed")
adata.layers["X"] = adata.X
adata.to_df(layer="X")


# (2) metadata,按pd.Dataframe格式储存
adata.obs  # 细胞的metadata(常用)
adata.var  # 基因的metadata

# (3) 降维数据:多是数组格式,按字典逻辑储存
adata.obsm
adata.obsm.keys
adata.obsm["X_umap"]

# (4) subset子集
adata[["Cell_1", "Cell_10"], ["Gene_5", "Gene_1900"]]
adata[adata.obs.cell_type == "B"]

3、scanpy基础分析

https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html

以上述1.1的原始读取结果adata对象为例

  • (1)质控过滤
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

adata.var['mt'] = adata.var_names.str.startswith('MT-') 
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], 
                           percent_top=None, log1p=False, inplace=True)
# sc.pl.violin(adata, 
# 			 ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
#              jitter=0.4, multi_panel=True)
adata = adata[adata.obs.n_genes_by_counts < 2500, :]
adata = adata[adata.obs.pct_counts_mt < 5, :]
  • (2)标准化
1
2
3
4
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.scale(adata, max_value=10)
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
  • (3)降维分群
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
## 高变基因
sc.pp.highly_variable_genes(adata)
adata.var

## PCA降维
sc.tl.pca(adata)
adata.obsm["X_pca"].shape 
sc.pl.pca(adata) # 默认按X_pca的降维结果可视化
sc.pl.pca(adata, color='CST3')

## UMAP降维
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
sc.pl.umap(adata)

## 分群
sc.tl.leiden(adata, resolution=0.5,key_added="cluster")
adata.obs
sc.pl.umap(adata, color=['cluster'])
  • (4)保存
1
adata.write("your_data.h5ad")