• 学习 GeneCompass 时,了解到可以参考基因调控网络信息(Gene pair)来计算基因的嵌入表示(Embedding)

  • Gene2vec 基于 Gensim(NLP 领域广受欢迎的工具)实现:https://github.com/jingcheng-du/Gene2vec

  • gensim.models:gensim 库中提供的一组模型工具,包含不同类型的模型和算法。

    • Word2Vec:gensim 中非常常用的模型之一
  • 如下是参考Gene2vec修改后的简化脚本

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Stage-1: Definition of loading embeddings
import numpy as np
from gensim.models.keyedvectors import KeyedVectors

#convert binary gene2vec to matrix txt
def load_embeddings(file_name):
    """Load a saved gene2vec model and return its vectors and vocabulary.

    Args:
        file_name: Path of a model file previously written with ``save()``.

    Returns:
        A ``(vectors, vocabulary)`` pair. ``vectors`` is a NumPy array with
        one row per gene; ``vocabulary`` is a tuple of gene names in the same
        order as the rows. An empty vocabulary yields an empty array and an
        empty tuple.
    """
    model = KeyedVectors.load(file_name)
    # A full Word2Vec model exposes its vectors under ``.wv``; if the loaded
    # object has no such attribute it is used as the vector table directly.
    word_vectors = getattr(model, "wv", model)
    # gensim >= 4 renamed the ``vocab`` mapping to ``key_to_index``; fall back
    # to ``vocab`` for gensim 3.x.
    vocab_map = getattr(word_vectors, "key_to_index", None)
    if vocab_map is None:
        vocab_map = word_vectors.vocab
    vocabulary = tuple(vocab_map)
    vectors = np.asarray([word_vectors[word] for word in vocabulary])
    return vectors, vocabulary

def outputTxt(embeddings_file):
    """Export a saved gene2vec model as a plain-text matrix.

    The text file is written to ``<embeddings_file>.txt``. Each line holds a
    gene name, a tab, then the vector components, each followed by a single
    space (so every line ends with a trailing space before the newline).

    Args:
        embeddings_file: Path of the saved gene2vec model.
    """
    wv, vocabulary = load_embeddings(embeddings_file)
    matrix_txt_file = embeddings_file + ".txt"  # gene2vec matrix txt file address
    with open(matrix_txt_file, 'w') as out:
        for gene, vector in zip(vocabulary, wv):
            # Each component keeps its trailing space so the file layout is
            # identical to what earlier versions of this script produced.
            components = "".join(str(value) + " " for value in vector)
            out.write(str(gene) + "\t" + components + "\n")




# Stage-2: Gene2vec training
import gensim, logging
import os
import random
import datetime
import argparse


# Command line: four positional values, in this order:
#   data_directory  output_directory  file_ending  embedding_dimension
parser = argparse.ArgumentParser(description='Please specify data directory, embedding output directory, data file ending pattern and embedding dimension')
parser.add_argument('fileAddress', metavar='N', type=str, nargs='+',
                    help='python gene2vec.py data_directory output_directory txt 100')

args = parser.parse_args()
# nargs='+' accepts any count >= 1, so check the count explicitly instead of
# failing with an IndexError below.
if len(args.fileAddress) < 4:
    parser.error('expected 4 arguments: data_directory output_directory ending_pattern dimension')

sourceDir = args.fileAddress[0]  # source directory of the files
export_dir = args.fileAddress[1]  # directory that receives the saved models
ending_pattern = args.fileAddress[2]  # only files ending with this are read
n_dims = args.fileAddress[3]  # embedding dimension, kept as a string here

# Validate the dimension now rather than after all the data has been loaded.
try:
    dimension_ok = int(n_dims) > 0
except ValueError:
    dimension_ok = False
if not dimension_ok:
    parser.error('dimension must be a positive integer, got ' + repr(n_dims))


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

print("start!")

# training file format (one whitespace-separated gene pair per line):
#   TOX4 ZNF146
#   TP53BP2 USP12
#   TP53BP2 YRDC

num_db = 0
# Only files with the requested ending are loaded, so only those are counted
# in the "total files" progress message.
files = [fname for fname in os.listdir(sourceDir) if fname.endswith(ending_pattern)]
size = len(files)
gene_pairs = list()
random.shuffle(files)

# load all the data
for num_db, fname in enumerate(files, start=1):
    now = datetime.datetime.now()
    print(now)
    print("current file "+ fname + " num: " + str(num_db) + " total files " + str(size))
    # The context manager closes the file even if reading fails part-way.
    with open(os.path.join(sourceDir, fname), 'r', encoding='windows-1252') as f:
        for line in f:
            gene_pair = line.split()
            # Blank lines would otherwise be added as empty sentences.
            if gene_pair:
                gene_pairs.append(gene_pair)

# Stop early with a clear message instead of failing later inside gensim.
if not gene_pairs:
    raise SystemExit("no gene pairs found in " + sourceDir + " for files ending with " + ending_pattern)

current_time = datetime.datetime.now()
print(current_time)
print("shuffle start " + str(len(gene_pairs)))
random.shuffle(gene_pairs)
current_time = datetime.datetime.now()
print(current_time)
print("shuffle done " + str(len(gene_pairs)))

####training parameters########
dimension = int(n_dims)  # dimension of the embedding
num_workers = 32  # number of worker threads
sg = 1  # sg =1, skip-gram, sg =0, CBOW
max_iter = 10  # number of iterations
window_size = 1  # The maximum distance between the gene and predicted gene within a gene list
# (the inputs are gene pairs, so the window is set to 1)
epochs_per_round = 1  # each outer iteration trains exactly one epoch
txtOutput = True

# gensim 4 renamed the Word2Vec constructor arguments
# ``size`` -> ``vector_size`` and ``iter`` -> ``epochs``.
if int(gensim.__version__.split(".")[0]) >= 4:
    w2v_kwargs = {"vector_size": dimension, "epochs": epochs_per_round}
else:
    w2v_kwargs = {"size": dimension, "iter": epochs_per_round}

# Create the output directory up front so the first save cannot fail on a
# missing directory.
os.makedirs(export_dir, exist_ok=True)
model_prefix = os.path.join(export_dir, "gene2vec_dim_" + str(dimension) + "_iter_")

for current_iter in range(1, max_iter + 1):
    if current_iter > 1:
        current_time = datetime.datetime.now()
        print(current_time)
        print("shuffle start " + str(len(gene_pairs)))
        # Re-shuffle the gene pairs before every further round
        random.shuffle(gene_pairs)
        current_time = datetime.datetime.now()
        print(current_time)
        print("shuffle done " + str(len(gene_pairs)))

    print("gene2vec dimension " + str(dimension) + " iteration " + str(current_iter) + " start")
    if current_iter == 1:
        # First round: build the vocabulary and train a fresh model
        model = gensim.models.Word2Vec(gene_pairs, window=window_size, min_count=1,
                                       workers=num_workers, sg=sg, **w2v_kwargs)
    else:
        # Later rounds: resume from the model saved by the previous round
        model = gensim.models.Word2Vec.load(model_prefix + str(current_iter - 1))
        model.train(gene_pairs, total_examples=model.corpus_count, epochs=epochs_per_round)

    # Save the model of this round, optionally with a plain-text export
    model_path = model_prefix + str(current_iter)
    model.save(model_path)
    if txtOutput:
        outputTxt(model_path)
    print("gene2vec dimension " + str(dimension) + " iteration " + str(current_iter) + " done")
    del model
  • 示例运行代码,demo数据参考
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
python ./gene2vec.py data out txt 100

# 参数
## data: 输入基因对文件所在文件夹(支持多个文件)
## out:  输出文件夹
## txt: 输入基因对文件的后缀名
## 100: Embedding的嵌入维度

ls out
# gene2vec_dim_100_iter_1       gene2vec_dim_100_iter_3.txt  gene2vec_dim_100_iter_7
# gene2vec_dim_100_iter_10      gene2vec_dim_100_iter_4      gene2vec_dim_100_iter_7.txt
# gene2vec_dim_100_iter_10.txt  gene2vec_dim_100_iter_4.txt  gene2vec_dim_100_iter_8
# gene2vec_dim_100_iter_1.txt   gene2vec_dim_100_iter_5      gene2vec_dim_100_iter_8.txt
# gene2vec_dim_100_iter_2       gene2vec_dim_100_iter_5.txt  gene2vec_dim_100_iter_9
# gene2vec_dim_100_iter_2.txt   gene2vec_dim_100_iter_6      gene2vec_dim_100_iter_9.txt
# gene2vec_dim_100_iter_3       gene2vec_dim_100_iter_6.txt