参考
https://gseapy.readthedocs.io/en/latest/index.html 示例数据来自 https://github.com/zqfang/GSEApy/tree/master/tests/data 1 2 3 4 5 6 7 8 import os import pandas as pd import scanpy as sc import gseapy as gp from gseapy import barplot, dotplot import matplotlib.pyplot as plt 0. 基因集获取 Enrichr API supported organisms: Human, Mouse, Yeast, Fly, Fish, Worm 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 names = gp.get_library_name(organism = "Human") len(names) # 217 [name for name in names if "GO" in name] # ['GO_Biological_Process_2021', # 'GO_Biological_Process_2023', # 'GO_Biological_Process_2025', # 'GO_Cellular_Component_2021', # 'GO_Cellular_Component_2023', # 'GO_Cellular_Component_2025', # 'GO_Molecular_Function_2021', # 'GO_Molecular_Function_2023', # 'GO_Molecular_Function_2025', # 'SynGO_2022', # 'SynGO_2024'] [name for name in names if "KEGG" in name] # ['KEGG_2013', # 'KEGG_2015', # 'KEGG_2016', # 'KEGG_2019_Human', # 'KEGG_2019_Mouse', # 'KEGG_2021_Human'] [name for name in names if "MSigDB" in name] # ['MSigDB_Computational', 'MSigDB_Hallmark_2020', 'MSigDB_Oncogenic_Signatures'] kegg = gp.get_library(name='KEGG_2021_Human', organism='Human') type(kegg) # dict kegg['AMPK signaling pathway'][:4] # ['RAB2A', 'PPP2R1A', 'TSC2', 'TSC1'] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 # 还支持对MSigDB基因集数据库的获取 from gseapy import Msigdb msig = Msigdb() # all available versions msig.list_dbver() # all types of gene sets given a dbver msig.list_category(dbver="2025.1.Hs") # all gene sets of one type gmt = msig.get_gmt(category='h.all', dbver="2025.1.Hs") type(gmt) # dict gmt.keys() # hallmark pathway names gmt["HALLMARK_ADIPOGENESIS"] # gene list 1. ORA富集 1.1 分析 Online analysis 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 ## 1) 两个最重要的参数 (后同) # gene_list gene_list="tests/data/gene_list.txt" # file path, one gene per row gene_list= ["RAB2A","PRKAB2","PRKAA1"] # list of genes # gene_sets (from enrichr API) gene_sets='KEGG_2016' gene_sets=['KEGG_2021_Human','GO_Biological_Process_2025'] gene_list = pd.read_csv("tests/data/gene_list.txt", header=None)[0].tolist() gene_list[:4] # ['IGKV4-1', 'CD55', 'IGKC', 'PPFIBP1'] enr = gp.enrichr(gene_list=gene_list, # or "./tests/data/gene_list.txt", gene_sets=['GO_Biological_Process_2025'], organism='human', outdir=None, ) enr.results.head(5) # Index(['Gene_set', 'Term', 'Overlap', 'P-value', 'Adjusted P-value', # 'Old P-value', 'Old Adjusted P-value', 'Odds Ratio', 'Combined Score', # 'Genes'], # dtype='object') ## 2) 设置结果导出路径 enr = gp.enrichr(gene_list=gene_list, gene_sets=['GO_Biological_Process_2025'], organism='human', outdir="enrichr_kegg", ) os.listdir("enrichr_kegg") # ['GO_Biological_Process_2025.human.enrichr.reports.txt', # 'GO_Biological_Process_2025.human.enrichr.reports.pdf'] ## 3) 自定义Background genes (P值会存在一定差异) enr = gp.enrichr(gene_list=gene_list, gene_sets=['GO_Biological_Process_2025'], organism='human', background="tests/data/background.txt", # 自定义背景基因 / or list object outdir=None, ) Offline analysis (本地提供通路基因集数据) 1 2 3 4 5 6 7 8 9 10 11 from gseapy.parser import read_gmt gmt_dict = read_gmt("tests/data/c2.cp.kegg.v7.5.1.symbols.gmt") gmt_dict.keys() enr2 = gp.enrich(gene_list=gene_list, # or gene_list=glist gene_sets=["tests/data/c2.cp.kegg.v7.5.1.symbols.gmt"], background=None, outdir=None, verbose=True) enr2.results.head(5) 1.2 可视化 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 # ofname 设置图片的导出路径,若为None则不保存。也支持后续使用plt语法进一步调整图片 ax = dotplot(enr.results, column="Adjusted P-value", x='Gene_set', # set x axis, so you could do a multi-sample/library comparsion size=5, cutoff=0.05, top_term=10, figsize=(3, 5), title = "GO_BP", xticklabels_rot=45, # rotate xtick labels show_ring=True, # set to False to revmove outer ring marker='o', ofname='./plot/p1.png' ) ax = barplot(enr.results, column="Adjusted P-value", group='Gene_set', # set group, so you could do a multi-sample/library comparsion size=10, top_term=10, figsize=(3,5), #color=['darkred', 'darkblue'] # set colors for group color = {'GO_Biological_Process_2025': 'salmon'}, ofname='./plot/p2.png' ) 1 2 3 ax = dotplot(enr.res2d, title='Enrichr',cmap='viridis_r', size=10, figsize=(3,5), ofname='./plot/p3.png') ax = barplot(enr.res2d, title='Enrichr',cmap='viridis_r', size=4, figsize=(4, 5), color='darkred', ofname='./plot/p4.png') # 差别不大 2. Prerank富集 基于差异分析结果的GSEA
...