RDKit 是一个化学信息学的 Python 工具包,可用于生成分子指纹并计算化合物的结构相似性。
https://www.rdkit.org/docs/GettingStartedInPython.html#fingerprinting-and-molecular-similarity
1
|
conda install -c conda-forge rdkit
|
1
2
3
4
|
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit import DataStructs
from rdkit.Chem import Draw
|
1、生成化合物结构指纹
1.1 Topological Fingerprints
1
2
3
4
5
6
7
8
|
# Parse a SMILES string into an RDKit molecule object.
m = Chem.MolFromSmiles('CCOC')
# Chem.MolToSmiles(m) converts a molecule back into a SMILES string.
# Build the RDKit topological fingerprint as a bit vector.
fp = Chem.RDKFingerprint(m, fpSize=1024)
# fpSize sets the number of bits in the fingerprint; the default is 2048.
fp.GetNumBits()
# 1024
# Render the fingerprint as a string of '0'/'1' characters.
fp.ToBitString()
|
1.2 MACCS Fingerprints
长度为167位的分子指纹(RDKit 中第0位未使用,其余166位各表示一种特定的化学结构特征)。
https://github.com/openbabel/openbabel/blob/master/data/MACCS.txt
1
2
3
4
5
6
|
from rdkit.Chem import MACCSkeys
# Generate the MACCS keys fingerprint for molecule `m`.
fp = MACCSkeys.GenMACCSKeys(m)
fp.GetNumBits()
# 167
# Render the fingerprint as a string of '0'/'1' characters.
fp.ToBitString()
|
1.3 Morgan/ECFP
1
2
3
4
5
|
from rdkit.Chem import AllChem
# Morgan (ECFP-style) circular fingerprint folded into a 1024-bit vector.
# `m` is the molecule parsed from 'CCOC' in the earlier snippet.
fp = AllChem.GetMorganFingerprintAsBitVect(m, radius=6, nBits=1024)
fp.GetNumBits()
# 1024
# Render the fingerprint as a string of '0'/'1' characters.
fp.ToBitString()
|
2、计算相似度
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
# Build MACCS fingerprints for the two molecules being compared.
m = Chem.MolFromSmiles('CCOC')
fp = MACCSkeys.GenMACCSKeys(m)
m2 = Chem.MolFromSmiles('CCO')
fp2 = MACCSkeys.GenMACCSKeys(m2)
DataStructs.FingerprintSimilarity(fp, fp2)
# 0.5
# Tanimoto is the default similarity metric, so this call is equivalent.
DataStructs.FingerprintSimilarity(fp, fp2, metric=DataStructs.TanimotoSimilarity)
# Other metrics are available. Reference the functions directly rather than
# storing their names as strings and resolving them with eval().
metric_list = [
    DataStructs.TanimotoSimilarity,
    DataStructs.DiceSimilarity,
    DataStructs.CosineSimilarity,
    DataStructs.SokalSimilarity,
    DataStructs.RusselSimilarity,
    DataStructs.KulczynskiSimilarity,
    DataStructs.McConnaugheySimilarity,
]
for metric in metric_list:
    print(DataStructs.FingerprintSimilarity(fp, fp2, metric=metric))
# 0.5
# 0.6666666666666666
# 0.6735753140545634
# 0.3333333333333333
# 0.041916167664670656
# 0.6805555555555556
# 0.3611111111111111
|
1
2
|
# Draw both molecules side by side in a single-row grid with a legend under each.
imgs = Draw.MolsToGridImage(
    [m, m2],
    legends=["mol-1", "mol-2"],
    subImgSize=(200, 200),
    molsPerRow=2,
)
# Bare expression so a notebook cell displays the image.
imgs
|