1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Stage-1: Definition of loading embeddings
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
#convert binary gene2vec to matrix txt
def load_embeddings(file_name):
    """Load a saved model and return its embedding matrix and vocabulary.

    Args:
        file_name: Path handed to ``KeyedVectors.load``.  The loaded object
            must expose its word vectors through a ``wv`` attribute.

    Returns:
        A ``(vectors, vocabulary)`` pair.  ``vocabulary`` is a tuple of the
        words found in ``wv.vocab`` and ``vectors`` is a NumPy array holding
        one embedding per word, in the same order.  An empty vocabulary
        yields an empty array and an empty tuple instead of raising.
    """
    model = KeyedVectors.load(file_name)
    word_vectors = model.wv
    # Iterating the vocab mapping yields the words themselves; the per-word
    # vocab objects are not needed to look the vectors up.
    vocabulary = tuple(word_vectors.vocab)
    vectors = np.asarray([word_vectors[word] for word in vocabulary])
    return vectors, vocabulary
def outputTxt(embeddings_file):
    """Export the embeddings stored in *embeddings_file* as a text matrix.

    The text file is written to ``embeddings_file + ".txt"``.  Every line
    holds one word, a tab, and then each vector component followed by a
    single space, so lines end with a trailing space before the newline.

    Args:
        embeddings_file: Path of the saved model, as accepted by
            ``load_embeddings``.
    """
    wv, vocabulary = load_embeddings(embeddings_file)
    matrix_txt_file = embeddings_file + ".txt"  # gene2vec matrix txt file address
    # The context manager closes the file; no explicit close() is required.
    with open(matrix_txt_file, 'w') as out:
        for word, vector in zip(vocabulary, wv):
            # One write per row; the per-component trailing space is kept so
            # the file format stays byte-compatible with earlier exports.
            components = "".join(str(value) + " " for value in vector)
            out.write(str(word) + "\t" + components + "\n")
# Stage-2: Gene2vec training
import gensim, logging
import os
import random
import datetime
import argparse
# Command line: <data_directory> <output_directory> <ending_pattern> <dimension>
parser = argparse.ArgumentParser(
    description='Please specify data directory, embedding output directory, '
                'data file ending pattern and embedding dimension')
parser.add_argument('fileAddress', metavar='N', type=str, nargs='+',
                    help='python gene2vec.py data_directory output_directory txt embedding_dimension')
args = parser.parse_args()
# Four positional values are read below, but nargs='+' only guarantees one.
# Report a usage error instead of failing with an IndexError.
if len(args.fileAddress) < 4:
    parser.error('expected 4 arguments (data_directory output_directory '
                 'ending_pattern embedding_dimension), got '
                 + str(len(args.fileAddress)))
sourceDir = args.fileAddress[0]  # source directory of the files
export_dir = args.fileAddress[1]  # directory the models are saved to
ending_pattern = args.fileAddress[2]  # only file names ending with this are read
n_dims = args.fileAddress[3]  # embedding dimension, kept as a string here
# Reject a non-integer dimension now rather than after the data has been loaded.
try:
    int(n_dims)
except ValueError:
    parser.error('embedding_dimension must be an integer, got ' + repr(n_dims))
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
print("start!")
# Load every file in sourceDir (e.g. "../data") whose name ends with
# ending_pattern.  Training file format, one whitespace-separated gene pair
# per line:
#   TOX4 ZNF146
#   TP53BP2 USP12
#   TP53BP2 YRDC
num_db = 0
files = os.listdir(sourceDir)
# Keep only the files that will actually be read, so the "total files" figure
# in the progress output matches the number of files processed.
data_files = [fname for fname in files if fname.endswith(ending_pattern)]
size = len(data_files)
gene_pairs = list()
random.shuffle(data_files)
# load all the data
for num_db, fname in enumerate(data_files, 1):
    now = datetime.datetime.now()
    print(now)
    print("current file "+ fname + " num: " + str(num_db) + " total files " + str(size))
    # The context manager closes the file even if reading or decoding fails.
    with open(os.path.join(sourceDir, fname), 'r', encoding='windows-1252') as f:
        for line in f:
            # split() without arguments already ignores surrounding whitespace.
            gene_pair = line.split()
            # Blank lines carry no genes; do not add them as empty sentences.
            if gene_pair:
                gene_pairs.append(gene_pair)
# Stop with a clear message when nothing was loaded instead of continuing to
# the training step with an empty corpus.
if not gene_pairs:
    raise SystemExit("no gene pairs found in files ending with '"
                     + ending_pattern + "' under " + sourceDir)
current_time = datetime.datetime.now()
print(current_time)
print("shuffle start " + str(len(gene_pairs)))
random.shuffle(gene_pairs)
current_time = datetime.datetime.now()
print(current_time)
print("shuffle done " + str(len(gene_pairs)))
#### training parameters ########
# dimension = 100  # dimension of the embedding
dimension = int(n_dims)  # dimension of the embedding
num_workers = 32  # number of worker threads
sg = 1  # sg = 1: skip-gram, sg = 0: CBOW
max_iter = 10  # number of iterations
window_size = 1  # The maximum distance between the gene and predicted gene within a gene list
# (set to 1 because the inputs are gene pairs)
txtOutput = True
# e.g. export_dir = "../emb"
# Create the output directory up front so the first model.save() cannot fail
# on a missing directory after a full training pass.
os.makedirs(export_dir, exist_ok=True)
model_prefix = os.path.join(export_dir, "gene2vec_dim_" + str(dimension) + "_iter_")
for current_iter in range(1, max_iter + 1):
    if current_iter == 1:
        # First round: build the model, which also runs one training pass.
        print("gene2vec dimension "+ str(dimension) +" iteration "+ str(current_iter)+ " start")
        # NOTE(review): `size` and `iter` are the gensim 3.x keyword names;
        # confirm against the installed gensim version.
        model = gensim.models.Word2Vec(gene_pairs, size=dimension, window=window_size, min_count=1, workers=num_workers, iter=1, sg=sg)
    else:
        current_time = datetime.datetime.now()
        print(current_time)
        print("shuffle start " + str(len(gene_pairs)))
        # Shuffle the rows before each further round.
        random.shuffle(gene_pairs)
        current_time = datetime.datetime.now()
        print(current_time)
        print("shuffle done " + str(len(gene_pairs)))
        print("gene2vec dimension " + str(dimension) + " iteration " + str(current_iter) + " start")
        # Load the model saved by the previous round and continue training it.
        model = gensim.models.Word2Vec.load(model_prefix + str(current_iter - 1))
        model.train(gene_pairs, total_examples=model.corpus_count, epochs=model.iter)
    # Save the model of this round, and optionally its text matrix.
    model_path = model_prefix + str(current_iter)
    model.save(model_path)
    if txtOutput:
        outputTxt(model_path)
    print("gene2vec dimension " + str(dimension) + " iteration " + str(current_iter) + " done")
    # Drop the reference; the next round reloads the model from disk.
    del model
# End of the gene2vec training script.