1、不同基因ID转换

1.1 org.Hs.eg.db包

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Convert HGNC gene symbols to Ensembl / Entrez IDs with the org.Hs.eg.db
# annotation package.
library(dplyr)
library(org.Hs.eg.db)

# List every key type that this annotation database can translate between.
keytypes(org.Hs.eg.db)
#  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"    
#  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GENETYPE"     "GO"          
# [13] "GOALL"        "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL" 
# [19] "PATH"         "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"      
# [25] "UCSCKG"       "UNIPROT"

gene_symbol <- c(
  "RHO", "CALM1", "MEG3", "GNGT1", "SAG",
  "RPGRIP1", "TRPM1", "PCP2", "PCP4", "AP1B1"
)

# `keytype` is the ID type of the input keys (current format);
# `columns` are the ID types to look up (target formats).
gene_ids <- AnnotationDbi::select(
  org.Hs.eg.db,
  keys = gene_symbol,
  columns = c("ENSEMBL", "ENTREZID"),
  keytype = "SYMBOL"
)
gene_ids

## Deduplicate: keep only the first row for each distinct Entrez ID.
gene_ids %>%
  dplyr::distinct(ENTREZID, .keep_all = TRUE)
#     SYMBOL         ENSEMBL ENTREZID
# 1      RHO ENSG00000163914     6010
# 2    CALM1 ENSG00000198668      801
# 3     MEG3 ENSG00000214548    55384
# 4    GNGT1 ENSG00000127928     2792
# 5      SAG ENSG00000130561     6295
# 6  RPGRIP1 ENSG00000092200    57096
# 7    TRPM1 ENSG00000134160     4308
# 8     PCP2 ENSG00000174788   126006
# 9     PCP4 ENSG00000183036     5121
# 10   AP1B1 ENSG00000100280      162

1.2 biomaRt包

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
# Same symbol -> Ensembl / Entrez lookup, this time through the Ensembl
# BioMart web service.
library(biomaRt)

# Connect to the human gene dataset of the "ensembl" mart.
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Show the first five attributes that can be requested from this dataset.
attributes <- listAttributes(ensembl)
attributes[1:5, ]

# Left disabled by the author: turns off SSL peer verification in httr
# (presumably a workaround for certificate errors -- confirm before enabling).
# library(httr)
# httr::set_config(config(ssl_verifypeer = 0L))

gene_symbol <- c(
  "RHO", "CALM1", "MEG3", "GNGT1", "SAG",
  "RPGRIP1", "TRPM1", "PCP2", "PCP4", "AP1B1"
)

# Filter on the HGNC symbol and return symbol, Ensembl gene ID and Entrez ID.
gene_ids2 <- getBM(
  attributes = c("hgnc_symbol", "ensembl_gene_id", "entrezgene_id"),
  filters = "hgnc_symbol",
  values = gene_symbol,
  mart = ensembl
)
gene_ids2

2、鼠源基因转为人类基因ID

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
musGenes <- c("Hmmr", "Tlx3", "Cpeb4")

## Method 1: simply upper-case the mouse symbols
toupper(musGenes)
# [1] "HMMR"  "TLX3"  "CPEB4"

## Method 2: via the biomaRt package (flagged as unstable by the author)
# library() is used instead of require() so a missing package fails here,
# not later at the first useMart() call.
library("biomaRt")
# library(httr)
# httr::set_config(config(ssl_verifypeer = 0L))
human <- useMart(
  "ensembl",
  dataset = "hsapiens_gene_ensembl",
  host = "dec2021.archive.ensembl.org"
)
mouse <- useMart(
  "ensembl",
  dataset = "mmusculus_gene_ensembl",
  host = "dec2021.archive.ensembl.org"
)
# Query the mouse mart by MGI symbol and pull the linked HGNC symbol from the
# human mart.
genes <- getLDS(
  attributes = c("mgi_symbol"),
  filters = "mgi_symbol",
  values = musGenes,
  mart = mouse,
  attributesL = c("hgnc_symbol"),
  martL = human,
  uniqueRows = TRUE
)

## Method 3: MGI database
# https://support.bioconductor.org/p/129636/
library(dplyr)
# Tab-separated homology report downloaded from MGI; the function defined
# below reads its Symbol, DB.Class.Key and Common.Organism.Name columns.
mouse_human_genes <- read.csv(
  "http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt",
  sep = "\t"
)

#' Convert mouse gene symbols to human gene symbols via a homology table
#'
#' For every symbol in `gene_list`, finds the homology class key(s) of the
#' matching "mouse, laboratory" rows and returns the symbols of all "human"
#' rows that share one of those keys.
#'
#' @param gene_list Character vector of mouse gene symbols.
#' @param homology_table Data frame with the columns `Symbol`,
#'   `DB.Class.Key` and `Common.Organism.Name`. Defaults to the global
#'   `mouse_human_genes` table, so existing one-argument calls keep working.
#' @return Character vector of human symbols, in the order of `gene_list`
#'   (table order within one gene). Genes without a mouse row or without a
#'   human homolog contribute nothing; duplicates are not removed. Returns
#'   `character(0)` when nothing matches.
convert_mouse_to_human <- function(gene_list,
                                   homology_table = mouse_human_genes) {
  # Pull the three columns once instead of re-filtering the table per gene.
  symbol <- as.character(homology_table[["Symbol"]])
  class_key <- homology_table[["DB.Class.Key"]]
  organism <- homology_table[["Common.Organism.Name"]]

  is_mouse <- organism == "mouse, laboratory"
  is_human <- organism == "human"

  per_gene <- lapply(gene_list, function(gene) {
    # which() drops NA comparisons, so rows with missing values never match.
    keys <- class_key[which(is_mouse & symbol == gene)]
    keys <- keys[!is.na(keys)]
    # %in% (not ==) so a mouse symbol carrying several class keys is handled
    # without length-mismatch errors or recycling.
    symbol[which(is_human & class_key %in% keys)]
  })

  # as.character() keeps the result type-stable when there are no matches.
  as.character(unlist(per_gene, use.names = FALSE))
}

convert_mouse_to_human(musGenes)
# [1] "HMMR"  "TLX3"  "CPEB4"
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# # https://github.com/lishensuo/utils
# 
# library("biomaRt")
# # library(httr)
# # httr::set_config(config(ssl_verifypeer = 0L))
# human = useMart("ensembl", dataset = "hsapiens_gene_ensembl",host = "dec2021.archive.ensembl.org")
# mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl",host = "dec2021.archive.ensembl.org")
# 
# # https://www.gencodegenes.org/mouse/
# dat = data.table::fread("gencode.vM33.basic.annotation.gtf.gz")
# dat = subset(dat, V3 == "gene")
# dat_sub = dat[,"V9"] %>% 
#   separate(V9, into = c("gene_id","gene_type","gene_name","mgi_id","havana_gene"), sep = "; ")
# dat_sub$gene_name2 = gsub('gene_name "','',dat_sub$gene_name)
# dat_sub$gene_name2 = gsub('"','',dat_sub$gene_name2)
# 
# genes = getLDS(attributes = c("mgi_symbol"), filters = "mgi_symbol", 
#                values = dat_sub$gene_name2, 
#                mart = mouse, 
#                attributesL = c("hgnc_symbol"), 
#                martL = human, uniqueRows=T)
# write.csv(genes, file = "mgi2hgnc_biomart.csv", row.names = F, quote = F)
# head(genes)

3、蛋白质与基因ID转换

image-20220528214617188

4、化合物ID转换

4.1 网页转换

https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi

image-20220528214848722

4.2 python工具转换

(1)https://pubchempy.readthedocs.io/en/v1.0.4/index.html

1
pip install pubchempy

(2)简单用法如下

1
import pubchempy as pcp
  • ①可直接根据化合物CID,构建出pubchempy.Compound对象
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Build a Compound object directly from a PubChem compound ID (CID).
c = pcp.Compound.from_cid(5090)
type(c)
# pubchempy.Compound

## Three example attributes
c.molecular_formula
# 'C17H14O4S'
c.canonical_smiles
# 'CS(=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC2)C3=CC=CC=C3'
c.synonyms[:3]
# ['rofecoxib', '162011-90-7', 'Vioxx']


## Convert to a table (DataFrame) that shows every property as a column
c_df = pcp.compounds_to_frame(c)
c_df.columns
# Index(['atom_stereo_count', 'atoms', 'bond_stereo_count', 'bonds',
#        'cactvs_fingerprint', 'canonical_smiles', 'charge', 'complexity',
#        'conformer_id_3d', 'conformer_rmsd_3d', 'coordinate_type',
#        'covalent_unit_count', 'defined_atom_stereo_count',
#        'defined_bond_stereo_count', 'effective_rotor_count_3d', 'elements',
#        'exact_mass', 'feature_selfoverlap_3d', 'fingerprint',
#        'h_bond_acceptor_count', 'h_bond_donor_count', 'heavy_atom_count',
#        'inchi', 'inchikey', 'isomeric_smiles', 'isotope_atom_count',
#        'iupac_name', 'mmff94_energy_3d', 'mmff94_partial_charges_3d',
#        'molecular_formula', 'molecular_weight', 'monoisotopic_mass',
#        'multipoles_3d', 'pharmacophore_features_3d', 'record',
#        'rotatable_bond_count', 'shape_fingerprint_3d', 'shape_selfoverlap_3d',
#        'tpsa', 'undefined_atom_stereo_count', 'undefined_bond_stereo_count',
#        'volume_3d', 'xlogp'],
#       dtype='object')
  • ②根据小分子属性值,搜索符合条件的小分子对象

    仅支持6种属性:[“name”,“smiles”,“sdf”,“inchi”,“inchikey”,“formula”]

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
## First argument: the property value; second argument: the property type
## The return value is a list containing the matching compound objects
# Note: `results` is reassigned by each example below; only the last
# assignment (the first three formula hits) is iterated over.
results = pcp.get_compounds('Glucose', 'name')
results = pcp.get_compounds('C1=CC2=C(C3=C(C=CC=N3)C=C2)N=C1', 'smiles')
multi_results = pcp.get_compounds('C6H12O6', 'formula')
results = multi_results[:3]
for cp in results:
    print(cp.isomeric_smiles)
# C1[C@H]([C@H]([C@@H](C(O1)(CO)O)O)O)O
# C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)O
# C(C(C(C(C(C=O)O)O)O)O)O

# Formula search again, then tabulate only the selected properties.
cs = pcp.get_compounds('C20H41Br', 'formula')
cs[:3]
# [Compound(20271), Compound(23148745), Compound(10808570)]
cs_df = pcp.compounds_to_frame(cs, properties=['isomeric_smiles', 'xlogp', 'rotatable_bond_count'])
cs_df.shape
# (43, 3)