The edge prediction (link prediction) task is to predict whether an edge is likely to exist between two nodes, which can be framed as a binary classification problem.

During training, as in the other tasks, the node representations are updated first, and the edge features are then computed from them:

  • Edges known to exist in the graph serve as positive edges, with label 1
  • Node pairs with no edge between them are randomly sampled to serve as negative edges, with label 0 (see the sketch after this list)
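
A minimal sketch of negative sampling, using a hypothetical helper (sample_negative and known_edges are not part of the code below; the DGL utilities used later do this job, this is just to make the idea concrete):

import random

def sample_negative(known_edges, num_nodes=100):
    # keep drawing random node pairs until we hit one with no known edge
    # known_edges: a set of (u, v) tuples of positive edges
    while True:
        u, v = random.sample(range(num_nodes), 2)
        if (u, v) not in known_edges and (v, u) not in known_edges:
            return u, v  # a negative edge, label 0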

0. Prediction Data and Task

Suppose there are 100 drugs with 1000 known pairwise interactions, and each drug node has 50 features.

The goal is to predict the probability that an interaction exists between a given pair of drugs.

import itertools
from random import sample
import numpy as np
import torch
import dgl

eids_raw = list(itertools.combinations(range(100), 2))
eids_sp1k = sample(eids_raw, 1000)
src = np.array(eids_sp1k)[:, 0]
dst = np.array(eids_sp1k)[:, 1]
g_drug = dgl.graph((src, dst), num_nodes=100)
g_drug.ndata['feature'] = torch.randn(100, 50)          # node features
g_drug.edata['label'] = torch.ones(g_drug.num_edges())  # known interactions are positive edges (label 1)

# add reverse edges: the first 1k edges and the last 1k edges correspond one-to-one
g_drug_bi = dgl.add_reverse_edges(g_drug)
g_drug_bi.edata["label"] = torch.cat((g_drug.edata["label"],
                                      g_drug.edata["label"]))
print(g_drug_bi.num_nodes())
# 100
print(g_drug_bi.num_edges())
# 2000
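
dgl.add_reverse_edges appends the reverse edges after the original ones, so edge i and edge i+1000 form a reverse pair; a quick sanity check:

u, v = g_drug_bi.edges()
assert torch.equal(u[:1000], v[1000:]) and torch.equal(v[:1000], u[1000:])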

1. Full-Graph Training

1.1 Defining the GNN Model

This is largely the same as for edge regression. The model has two parts: the first updates node features according to the graph structure, and the second computes each edge's feature from its two endpoint nodes.

import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.nn as dglnn

class GNN_MLP_model(nn.Module):
    # define the GNN/MLP layers and related parameters
    def __init__(self,
                 in_feats, hid_feats_1, hid_feats_2,
                 mlp_hid_feats_1, mlp_out_feats,
                 dropout):
        super().__init__()
        # GNN: update node features
        self.SAGE1 = dglnn.SAGEConv(in_feats=in_feats,
                                    out_feats=hid_feats_1,
                                    aggregator_type="mean")
        self.SAGE2 = dglnn.SAGEConv(in_feats=hid_feats_1,
                                    out_feats=hid_feats_2,
                                    aggregator_type="mean")
        # MLP: compute edge features from the endpoint nodes
        self.MLP1 = nn.Linear(hid_feats_2*2, mlp_hid_feats_1)
        self.MLP2 = nn.Linear(mlp_hid_feats_1, mlp_out_feats)

        self.dropout = nn.Dropout(dropout)

    def apply_edges(self, edges):
        h_u, h_v = edges.src['h'], edges.dst['h']
        h_concat = torch.cat([h_u, h_v], 1)
        h2 = F.relu(self.MLP1(h_concat))
        h2 = self.dropout(h2)
        # one key difference from edge regression: this is binary
        # classification, so the score is squashed by a sigmoid
        h2 = torch.sigmoid(self.MLP2(h2))
        return {'score': h2}

    # forward pass: takes the graph structure and the node features
    def forward(self, graph, inputs, edge_graph):
        # GNN part
        h = F.relu(self.SAGE1(graph, inputs))
        h = self.dropout(h)
        h = F.relu(self.SAGE2(graph, h))
        # MLP part: score the edges of edge_graph with the updated features
        with edge_graph.local_scope():
            edge_graph.ndata['h'] = h
            edge_graph.apply_edges(self.apply_edges)
            return edge_graph.edata['score']
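
As an aside, a lighter-weight alternative to the MLP edge scorer is a plain dot product of the two endpoint embeddings, as in DGL's link prediction tutorial; a minimal sketch (the sigmoid is added here so it stays compatible with the binary cross-entropy loss used below):

import dgl.function as fn

class DotPredictor(nn.Module):
    # score an edge by the dot product of its endpoint embeddings
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # fn.u_dot_v writes one scalar score per edge
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return torch.sigmoid(g.edata['score'][:, 0])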
  • Define a function that splits the graph into the training- and test-set subgraphs; it returns five graphs:
    • a training graph of positive edges, used to update node features
    • the positive-edge training set and positive-edge test set
    • the negative-edge training set and negative-edge test set
import scipy.sparse as sp
import numpy as np

def produce_5_graph(graph, train_mask, test_mask):
    u, v = graph.edges()
    eids = np.arange(graph.number_of_edges())

    # graph used for updating node features (test edges removed)
    train_g = dgl.remove_edges(graph, eids[test_mask])

    # positive-edge training/test sets
    train_pos_g = dgl.graph((u[eids[train_mask]], v[eids[train_mask]]),
                            num_nodes=graph.number_of_nodes())
    test_pos_g = dgl.graph((u[eids[test_mask]], v[eids[test_mask]]),
                           num_nodes=graph.number_of_nodes())

    # negative-edge training/test sets: build the adjacency matrix
    # (shape given explicitly, in case the highest-index node has no edges),
    # then take node pairs from its complement
    n = graph.number_of_nodes()
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(n, n))
    adj_neg = 1 - adj.todense() - np.eye(n)
    neg_u, neg_v = np.where(adj_neg != 0)

    # sample as many negative edges as there are positive ones
    neg_eids = np.random.choice(len(neg_u), graph.number_of_edges(), replace=False)
    train_neg_g = dgl.graph((neg_u[neg_eids[train_mask]], neg_v[neg_eids[train_mask]]),
                            num_nodes=graph.number_of_nodes())
    test_neg_g = dgl.graph((neg_u[neg_eids[test_mask]], neg_v[neg_eids[test_mask]]),
                           num_nodes=graph.number_of_nodes())

    return train_g, train_pos_g, test_pos_g, train_neg_g, test_neg_g
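
A usage sketch, assuming the train_mask/test_mask defined in section 1.3 below; the two positive-edge subgraphs together cover all 2000 directed edges:

train_g, train_pos_g, test_pos_g, train_neg_g, test_neg_g = \
    produce_5_graph(g_drug_bi, train_mask, test_mask)
print(train_pos_g.num_edges() + test_pos_g.num_edges())
# 2000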

1.2 Defining the Training Loop

  • First define an evaluation function that returns the loss and the AUC
from sklearn.metrics import roc_auc_score

def loss_evaluate(model, train_g, pos_g, neg_g, mode="train"):
    if mode == "train":
        model.train()
    else:
        model.eval()
    # feature_name is a global defined in section 1.3
    pos_score = model(train_g, train_g.ndata[feature_name], pos_g)
    neg_score = model(train_g, train_g.ndata[feature_name], neg_g)
    scores = torch.cat([pos_score, neg_score]).reshape((-1))
    labels = torch.cat([torch.ones(pos_score.shape[0]),
                        torch.zeros(neg_score.shape[0])])
    loss = F.binary_cross_entropy(scores, labels)
    auc = roc_auc_score(labels.detach().numpy(), scores.detach().numpy())
    return loss, auc
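
Since the model already applies a sigmoid, F.binary_cross_entropy (rather than the with-logits variant) is the right loss here; a two-sample sanity check against the formula -[y log s + (1-y) log(1-s)]:

s = torch.tensor([0.9, 0.2])   # predicted probabilities
y = torch.tensor([1.0, 0.0])   # labels
manual = -(y*torch.log(s) + (1-y)*torch.log(1-s)).mean()
assert torch.isclose(manual, F.binary_cross_entropy(s, y))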
  • The training loop
def train(model, graph,
          feature_name, train_mask, test_mask,
          num_epochs, learning_rate, weight_decay, patience, verbose=True):
    train_g, train_pos_g, test_pos_g, train_neg_g, test_neg_g = produce_5_graph(graph, train_mask, test_mask)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    val_loss_best = 100000
    trigger_times = -1

    for epoch in range(num_epochs):
        loss, auc = loss_evaluate(model, train_g, train_pos_g, train_neg_g)
        test_loss, test_auc = loss_evaluate(model, train_g, test_pos_g, test_neg_g, mode="eval")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if verbose:
            print("Epoch {:03d} | Loss {:.4f} | Auc {:.4f} | Test Loss {:.4f} | Test Auc {:.4f} ".format(
                                  epoch, loss.item(), auc, test_loss.item(), test_auc))

        # early stopping: stop once the test loss has not improved for `patience` epochs
        if test_loss.item() > val_loss_best:
            trigger_times += 1
            if trigger_times >= patience:
                break
        else:
            trigger_times = 0
            val_loss_best = test_loss.item()
    return loss.item(), auc, test_loss.item(), test_auc

1.3 Training the Model Once

graph = g_drug_bi
feature_name = "feature"
# split train/test by edge pair, so an edge and its reverse always fall in the same set
train_mask = torch.zeros(int(g_drug_bi.num_edges()/2), dtype=torch.bool).bernoulli(0.7).tile((2,))
test_mask = ~train_mask

in_feats, hid_feats_1, hid_feats_2, mlp_hid_feats_1, mlp_out_feats, dropout = 50, 32, 16, 16, 1, 0.1
num_epochs, learning_rate, weight_decay, patience = 100, 0.01, 1e-4, 5

# instantiate the model
model = GNN_MLP_model(in_feats, hid_feats_1, hid_feats_2, mlp_hid_feats_1, mlp_out_feats, dropout)
# train the model
metrics = train(model, graph,
                feature_name, train_mask, test_mask,
                num_epochs, learning_rate, weight_decay, patience, verbose=True)

1.4 K-Fold Cross-Validation

  • Generate the training- and test-set masks for fold j of k
def get_k_fold_data(k, j, edge_id, shuffle=True):
    assert k > 1
    # mask over edge pairs (half of all directed edges)
    train_mask = torch.ones(int(len(edge_id)/2), dtype=torch.bool)
    np.random.seed(42)
    if shuffle:
        edge_id2 = np.random.permutation(range(len(train_mask)))
    else:
        edge_id2 = np.arange(len(train_mask))

    fold_size = len(train_mask) // k
    idx = slice(j*fold_size, (j+1)*fold_size)
    train_mask[edge_id2[idx]] = False

    # expand the pair-level mask to both edge directions
    train_mask = train_mask.tile((2,))
    test_mask = ~train_mask
    return train_mask, test_mask
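
Because the random seed is fixed inside the function, the k test slices are disjoint and together cover every edge pair; a quick illustrative check:

edge_id = g_drug_bi.edges(form="all")[2]
masks = [get_k_fold_data(10, j, edge_id)[1] for j in range(10)]
print(torch.stack(masks).sum(0).unique())
# tensor([1])  -- every edge lands in exactly one test fold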
  • The k-fold training loop
def k_fold(k, graph, feature_name,
           num_epochs, learning_rate, weight_decay, patience):
    k_fold_metrics = []  # collect each fold's metrics
    edge_id = graph.edges(form="all")[2]
    for j in range(k):
        print(f'Fold-{j+1}')
        # masks and model are rebuilt from scratch for every fold
        train_mask, test_mask = get_k_fold_data(k, j, edge_id)
        model = GNN_MLP_model(in_feats, hid_feats_1, hid_feats_2, mlp_hid_feats_1, mlp_out_feats, dropout)
        metrics = train(model, graph,
                        feature_name, train_mask, test_mask,
                        num_epochs, learning_rate, weight_decay, patience, verbose=False)

        k_fold_metrics.append(metrics)
    return k_fold_metrics

k = 10
k_fold_metrics = k_fold(k, graph, feature_name,
                        num_epochs, learning_rate, weight_decay, patience)
np.array(k_fold_metrics).mean(0)  # mean metrics across the 10 folds

2. Mini-Batch Training

2.1 DGL Positive/Negative Edge Sampler

Much the same as the edge sampler used for the edge regression task, with one extra step that samples negative edges.

train_eids = graph.edges(form="all")[2][train_mask]

# two-layer full-neighbor sampler, wrapped for edge prediction
# with one uniformly sampled negative edge per positive edge
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
sampler = dgl.dataloading.as_edge_prediction_sampler(
    sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(1))
dataloader = dgl.dataloading.DataLoader(
    graph, train_eids, sampler,
    batch_size=16,
    shuffle=True)

input_nodes, positive_graph, negative_graph, blocks = next(iter(dataloader))
positive_graph.num_edges(), negative_graph.num_edges()
# (16, 16)
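
Per batch, the sampler returns the input node IDs, a positive pair graph, a negative pair graph, and the message-flow-graph (MFG) blocks; the two pair graphs share their nodes with the output nodes of the last block:

print(blocks[0].num_src_nodes(), blocks[0].num_dst_nodes())
print(blocks[1].num_dst_nodes() == positive_graph.num_nodes())
# True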

2.2 Defining the GNN Model

Only the forward pass differs: it consumes the sampled blocks instead of the full graph.

class GNN_MLP_model_batch(nn.Module):
    # define the GNN/MLP layers and related parameters
    def __init__(self,
                 in_feats, hid_feats_1, hid_feats_2,
                 mlp_hid_feats_1, mlp_out_feats,
                 dropout):
        super().__init__()
        self.SAGE1 = dglnn.SAGEConv(in_feats=in_feats,
                                    out_feats=hid_feats_1,
                                    aggregator_type="mean")
        self.SAGE2 = dglnn.SAGEConv(in_feats=hid_feats_1,
                                    out_feats=hid_feats_2,
                                    aggregator_type="mean")

        self.MLP1 = nn.Linear(hid_feats_2*2, mlp_hid_feats_1)
        self.MLP2 = nn.Linear(mlp_hid_feats_1, mlp_out_feats)

        self.dropout = nn.Dropout(dropout)

    def apply_edges(self, edges):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        h_concat = torch.cat([h_u, h_v], 1)

        h2 = F.relu(self.MLP1(h_concat))
        h2 = self.dropout(h2)
        h2 = torch.sigmoid(self.MLP2(h2))
        return {'score': h2}

    # forward pass:
    ## blocks are used to update the node features
    ## edge_subgraph holds the target edges to be scored
    def forward(self, blocks, inputs, edge_subgraph):
        # GNN part: one block per SAGE layer
        h = F.relu(self.SAGE1(blocks[0], inputs))
        h = self.dropout(h)
        h = F.relu(self.SAGE2(blocks[1], h))
        # MLP part
        with edge_subgraph.local_scope():
            edge_subgraph.ndata['h'] = h
            edge_subgraph.apply_edges(self.apply_edges)
            return edge_subgraph.edata['score']
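
dglnn.SAGEConv accepts a block directly: given the features of the block's source nodes, it returns features only for the destination nodes, which is why the output of blocks[1] lines up with the pair graphs. An illustrative shape check on the batch sampled in 2.1 (the 50/8 sizes here are arbitrary):

h0 = g_drug_bi.ndata['feature'][input_nodes]
conv = dglnn.SAGEConv(50, 8, "mean")
print(conv(blocks[0], h0).shape[0] == blocks[0].num_dst_nodes())
# True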

2.3 Defining the Training Loop

  • Compute the loss and the AUC metric
def loss_evaluate_batch(model, blocks, pos_g, neg_g, input_nodes, node_feats, mode="train"):
    if mode == "train":
        model.train()
    else:
        model.eval()
    # score the positive and negative target edges of this batch
    pos_score = model(blocks, node_feats[input_nodes], pos_g)
    neg_score = model(blocks, node_feats[input_nodes], neg_g)
    scores = torch.cat([pos_score, neg_score]).reshape((-1))
    labels = torch.cat([torch.ones(pos_score.shape[0]),
                        torch.zeros(neg_score.shape[0])])
    loss = F.binary_cross_entropy(scores, labels)
    auc = roc_auc_score(labels.detach().numpy(), scores.detach().numpy())
    return loss, auc
  • The training loop
def train_batch(model, graph,
                feature_name, train_mask, test_mask,
                num_epochs, learning_rate, weight_decay, patience, batch_size, verbose=True):
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    node_feats = graph.ndata[feature_name]
    val_loss_best = 100000
    trigger_times = -1

    train_eids = graph.edges(form="all")[2][train_mask]
    test_eids = graph.edges(form="all")[2][test_mask]
    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
    sampler = dgl.dataloading.as_edge_prediction_sampler(
        sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(1))
    # mini-batch loader used for training
    train_dataloader = dgl.dataloading.DataLoader(
        graph, train_eids, sampler,
        batch_size=batch_size,
        shuffle=True)
    # single-batch loaders over all train/test edges, used only for evaluation
    all_train_dataloader = dgl.dataloading.DataLoader(
        graph, train_eids, sampler,
        batch_size=len(train_eids),
        shuffle=False)
    all_test_dataloader = dgl.dataloading.DataLoader(
        graph, test_eids, sampler,
        batch_size=len(test_eids),
        shuffle=False)

    for epoch in range(num_epochs):
        for it, (input_nodes, positive_graph, negative_graph, blocks) in enumerate(train_dataloader):
            loss, auc = loss_evaluate_batch(model, blocks, positive_graph, negative_graph, input_nodes, node_feats, mode="train")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # training-set loss and auc for this epoch
        input_nodes, positive_graph, negative_graph, blocks = next(iter(all_train_dataloader))
        loss, auc = loss_evaluate_batch(model, blocks, positive_graph, negative_graph, input_nodes, node_feats, mode="eval")
        # test-set loss and auc for this epoch
        input_nodes, positive_graph, negative_graph, blocks = next(iter(all_test_dataloader))
        test_loss, test_auc = loss_evaluate_batch(model, blocks, positive_graph, negative_graph, input_nodes, node_feats, mode="eval")

        if verbose:
            print("Epoch {:03d} | Loss {:.4f} | Auc {:.4f} | Test Loss {:.4f} | Test Auc {:.4f} ".format(
                                  epoch, loss.item(), auc, test_loss.item(), test_auc))

        # early stopping on the test loss
        if test_loss.item() > val_loss_best:
            trigger_times += 1
            if trigger_times >= patience:
                break
        else:
            trigger_times = 0
            val_loss_best = test_loss.item()
    return loss.item(), auc, test_loss.item(), test_auc

2.4 Training the Model Once

# remaining hyperparameters as in section 1.3
batch_size = 64
model = GNN_MLP_model_batch(in_feats, hid_feats_1, hid_feats_2, mlp_hid_feats_1, mlp_out_feats, dropout)
metrics = train_batch(model, graph,
                      feature_name, train_mask, test_mask,
                      num_epochs, learning_rate, weight_decay, patience, batch_size, verbose=True)

2.5 K-Fold Cross-Validation

# the function that builds the masks for fold j of k is the same as in section 1.4

def k_fold_batch(k, in_feats, hid_feats_1, hid_feats_2, mlp_hid_feats_1, mlp_out_feats, dropout,
                 graph, feature_name,
                 num_epochs, learning_rate, weight_decay, patience, batch_size):
    k_fold_metrics = []  # collect each fold's metrics
    edge_id = graph.edges(form="all")[2]
    for j in range(k):
        print(f'Fold-{j+1}')
        train_mask, test_mask = get_k_fold_data(k, j, edge_id)
        # re-instantiate the model for every fold
        model = GNN_MLP_model_batch(in_feats, hid_feats_1, hid_feats_2, mlp_hid_feats_1, mlp_out_feats, dropout)
        metrics = train_batch(model, graph,
                              feature_name, train_mask, test_mask,
                              num_epochs, learning_rate, weight_decay, patience, batch_size, verbose=False)

        k_fold_metrics.append(metrics)
    return k_fold_metrics

k = 10
batch_size = 64
k_fold_metrics = k_fold_batch(k, in_feats, hid_feats_1, hid_feats_2, mlp_hid_feats_1, mlp_out_feats, dropout,
                              graph, feature_name,
                              num_epochs, learning_rate, weight_decay, patience, batch_size)
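
As in section 1.4, the per-fold metrics can then be averaged:

np.array(k_fold_metrics).mean(0)  # mean (train loss, train auc, test loss, test auc) across folds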