参考

1
2
3
4
5
6
7
8
import os
import pandas as pd
import scanpy as sc

import gseapy as gp
from gseapy import barplot, dotplot

import matplotlib.pyplot as plt

0. 基因集获取

  • Enrichr API supported organisms: Human, Mouse, Yeast, Fly, Fish, Worm
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
names = gp.get_library_name(organism =  "Human")
len(names)
# 217

[name for name in names if "GO" in name]
# ['GO_Biological_Process_2021',
#  'GO_Biological_Process_2023',
#  'GO_Biological_Process_2025',
#  'GO_Cellular_Component_2021',
#  'GO_Cellular_Component_2023',
#  'GO_Cellular_Component_2025',
#  'GO_Molecular_Function_2021',
#  'GO_Molecular_Function_2023',
#  'GO_Molecular_Function_2025',
#  'SynGO_2022',
#  'SynGO_2024']
[name for name in names if "KEGG" in name]
# ['KEGG_2013',
#  'KEGG_2015',
#  'KEGG_2016',
#  'KEGG_2019_Human',
#  'KEGG_2019_Mouse',
#  'KEGG_2021_Human']
[name for name in names if "MSigDB" in name]
# ['MSigDB_Computational', 'MSigDB_Hallmark_2020', 'MSigDB_Oncogenic_Signatures']


kegg = gp.get_library(name='KEGG_2021_Human', organism='Human')
type(kegg) # dict

kegg['AMPK signaling pathway'][:4]
# ['RAB2A', 'PPP2R1A', 'TSC2', 'TSC1']
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
# 还支持对MSigDB基因集数据库的获取
from gseapy import Msigdb
msig = Msigdb()

# all available versions
msig.list_dbver() 

# all types of gene sets given a dbver
msig.list_category(dbver="2025.1.Hs")

# all gene sets of one type
gmt = msig.get_gmt(category='h.all', dbver="2025.1.Hs")

type(gmt)   # dict
gmt.keys()  # hallmark pathway names

gmt["HALLMARK_ADIPOGENESIS"] # gene list

1. ORA富集

1.1 分析

  • Online analysis
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
## 1) 两个最重要的参数 (后同)
# gene_list
gene_list="tests/data/gene_list.txt"         # file path, one gene per row
gene_list= ["RAB2A","PRKAB2","PRKAA1"]    # list of genes

# gene_sets (from enrichr API)
gene_sets='KEGG_2016'
gene_sets=['KEGG_2021_Human','GO_Biological_Process_2025']


gene_list = pd.read_csv("tests/data/gene_list.txt", header=None)[0].tolist()
gene_list[:4]
# ['IGKV4-1', 'CD55', 'IGKC', 'PPFIBP1']

enr = gp.enrichr(gene_list=gene_list, # or "./tests/data/gene_list.txt",
                 gene_sets=['GO_Biological_Process_2025'],
                 organism='human',
                 outdir=None, 
                )


enr.results.head(5)
# Index(['Gene_set', 'Term', 'Overlap', 'P-value', 'Adjusted P-value',
#        'Old P-value', 'Old Adjusted P-value', 'Odds Ratio', 'Combined Score',
#        'Genes'],
#       dtype='object')


## 2) 设置结果导出路径
enr = gp.enrichr(gene_list=gene_list, 
                 gene_sets=['GO_Biological_Process_2025'],
                 organism='human',
                 outdir="enrichr_kegg", 
                )
os.listdir("enrichr_kegg")
# ['GO_Biological_Process_2025.human.enrichr.reports.txt',
#  'GO_Biological_Process_2025.human.enrichr.reports.pdf']



## 3) 自定义Background genes (P值会存在一定差异)
enr = gp.enrichr(gene_list=gene_list, 
                 gene_sets=['GO_Biological_Process_2025'],
                 organism='human',
                 background="tests/data/background.txt",  # 自定义背景基因 / or list object
                 outdir=None, 
                )
  • Offline analysis (本地提供通路基因集数据)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
from gseapy.parser import read_gmt

gmt_dict = read_gmt("tests/data/c2.cp.kegg.v7.5.1.symbols.gmt")
gmt_dict.keys()

enr2 = gp.enrich(gene_list=gene_list, # or gene_list=glist
                 gene_sets=["tests/data/c2.cp.kegg.v7.5.1.symbols.gmt"], 
                 background=None, 
                 outdir=None,
                 verbose=True)
enr2.results.head(5)

1.2 可视化

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# ofname 设置图片的导出路径,若为None则不保存。也支持后续使用plt语法进一步调整图片

ax = dotplot(enr.results,
              column="Adjusted P-value",
              x='Gene_set', # set x axis, so you could do a multi-sample/library comparsion
              size=5,
              cutoff=0.05, 
              top_term=10,
              figsize=(3, 5),
              title = "GO_BP",
              xticklabels_rot=45, # rotate xtick labels
              show_ring=True, # set to False to revmove outer ring
              marker='o',
              ofname='./plot/p1.png'   
             )


ax = barplot(enr.results,
              column="Adjusted P-value",
              group='Gene_set', # set group, so you could do a multi-sample/library comparsion
              size=10,
              top_term=10,
              figsize=(3,5),
              #color=['darkred', 'darkblue'] # set colors for group
              color = {'GO_Biological_Process_2025': 'salmon'},
              ofname='./plot/p2.png'   
             )
image-20250723083652809
1
2
3
ax = dotplot(enr.res2d, title='Enrichr',cmap='viridis_r', size=10,  figsize=(3,5), ofname='./plot/p3.png')

ax = barplot(enr.res2d, title='Enrichr',cmap='viridis_r', size=4, figsize=(4, 5), color='darkred', ofname='./plot/p4.png')  # 差别不大
image-20250723084005190

2. Prerank富集

基于差异分析结果的GSEA

2.1 分析

  • 输入的是降序排列基因
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# pd.DataFrame: index为基因, 列一般为差异倍数(降序)

rnk = pd.read_csv("./tests/data/temp.rnk", header=None, index_col=0, sep="\t")
type(rnk)
# pandas.core.frame.DataFrame
rnk.shape # (22922, 1)
rnk.head()
#                 1
# 0                
# ATXN1   16.456753
# UBQLN4  13.989493
# CALM1   13.745533
# DLG4    12.796588
# MRE11A  12.787631

pre_res = gp.prerank(rnk=rnk, # or "./tests/data/temp.rnk"
                     gene_sets=['GO_Biological_Process_2025'],
                     threads=4,
                     min_size=5,
                     max_size=1000,
                     permutation_num=1000, # reduce number to speed up testing
                     outdir=None, # don't write to disk
                     seed=6,
                     verbose=True, # see what's going on behind the scenes
                    )


pre_res.res2d.head(5)
# Index(['Name', 'Term', 'ES', 'NES', 'NOM p-val', 'FDR q-val', 'FWER p-val',
#        'Tag %', 'Gene %', 'Lead_genes'],
#       dtype='object')

2.2 可视化

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
terms = pre_res.res2d.Term  # Series
terms[:3]
# 0    GO_Biological_Process_2025__Primary miRNA Proc...
# 1    GO_Biological_Process_2025__Cytoplasmic Transl...
# 2    GO_Biological_Process_2025__Regulation of Extr...


axs = pre_res.plot(terms=terms[1], figsize=(4, 6), ofname="./plot/p5.png")


axs = pre_res.plot(terms=terms[1:5],
                   #legend_kws={'loc': (1.2, 0)}, # set the legend loc
                   show_ranking=True, # whether to show the second yaxis
                   figsize=(3,4),
                   ofname="./plot/p6.pdf"
                  )
image-20250723085011851
1
2
3
4
5
6
7
ax = dotplot(pre_res.res2d,
             column="FDR q-val",
             title='Prerank dotplot',
             cmap=plt.cm.viridis,
             size=6, # adjust dot size
             figsize=(4,5), cutoff=0.25, show_ring=False,
             ofname="./plot/p7.pdf")
image-20250723085109849

3. GSEA富集

基于分组表达矩阵的GSEA分析 (经典)

3.1 分析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
adata = sc.read_h5ad("tests/data/ifnb.h5ad") 
adata.obs.head()

adata.layers['counts'] = adata.X
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.obs.groupby('seurat_annotations')['stim'].value_counts()

# set STIM as class 0, CTRL as class 1, to make categorical
adata.obs['stim'] = pd.Categorical(adata.obs['stim'], categories=["STIM", "CTRL"], ordered=True)
# 排序
indices = adata.obs.sort_values(['seurat_annotations', 'stim']).index
adata = adata[indices,:]
# 取其中一种细胞类型的两个分组
bdata = adata[adata.obs.seurat_annotations == "CD14 Mono"].copy()
bdata
# AnnData object with n_obs × n_vars = 4362 × 14053
#     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'stim', 'seurat_annotations'
#     var: 'features'
#     uns: 'log1p'
#     layers: 'counts'


res = gp.gsea(data=bdata.to_df().T, # row -> genes, column-> samples
        gene_sets="GO_Biological_Process_2025",
        cls=bdata.obs.stim,
        permutation_num=1000,
        permutation_type='phenotype',
        outdir=None,
        method='s2n', # signal_to_noise
        threads= 16)

res.res2d.head(10)
# Index(['Name', 'Term', 'ES', 'NES', 'NOM p-val', 'FDR q-val', 'FWER p-val',
#        'Tag %', 'Gene %', 'Lead_genes'],
#       dtype='object')

3.2 可视化

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
i = 7
genes = res.res2d.Lead_genes.iloc[i].split(";")
ax = gp.heatmap(df = res.heatmat.loc[genes],
           z_score=None,
           title=res.res2d.Term.iloc[i],
           figsize=(6,5),
           cmap=plt.cm.viridis,
           xticklabels=False,
           ofname="./plot/p8.png"
           )


term = res.res2d.Term
# gp.gseaplot(res.ranking, term=term[i], **res.results[term[i]])
axs = res.plot(terms=term[:5], ofname="./plot/p9.png")
image-20250723085607612

4. ssGSEA富集

基于表达矩阵分析每个样本

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
exp_dat = pd.read_csv("tests/extdata/Leukemia_hgu95av2.trim.txt", sep="\t", index_col=0)
exp_dat = exp_dat.drop(columns=["NAME"])
exp_dat.iloc[:4,:4]
#           ALL_1   ALL_2  ALL_3   ALL_4
# Gene                                  
# MAPK3    1633.6  2455.0  866.0  1000.0
# TIE1      284.4   159.0  173.0   216.0
# CYP2C19   285.8   114.0  429.0   -43.0
# CXCR5    -126.6  -388.0  143.0  -915.0

import gseapy as gp
# txt, gct file input
ss = gp.ssgsea(#data='./tests/extdata/Leukemia_hgu95av2.trim.txt',
               data=exp_dat,
               gene_sets='./tests/extdata/h.all.v7.0.symbols.gmt',
               outdir=None,
               sample_norm_method='rank', # choose 'custom' will only use the raw value of `data`
               no_plot=True)


ss.res2d.head()
#      Name                     Term           ES       NES
# 0   ALL_2  HALLMARK_MYC_TARGETS_V1  3393.823575  0.707975
# 1  ALL_12  HALLMARK_MYC_TARGETS_V1  3385.626111  0.706265
# 2  AML_11  HALLMARK_MYC_TARGETS_V1  3359.186716  0.700749


nes = ss.res2d.pivot(index='Term', columns='Name', values='NES')
nes.iloc[:,:3].head()
# Name                             ALL_1    ALL_10    ALL_11
# Term                                                      
# HALLMARK_ADIPOGENESIS         0.287384  0.274548  0.290059
# HALLMARK_ALLOGRAFT_REJECTION   0.06177  0.028062  0.096589
# HALLMARK_ANDROGEN_RESPONSE    0.133453  0.113911  0.193074