import dgl
import dgl.nn as dglnn

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

0. Prediction Task and Data

The task is to predict which category each paper belongs to, i.e., a multi-class node classification problem.

data = dgl.data.CoraGraphDataset()
g = data[0]

graph = g
# node IDs
node_id = g.nodes()
# name of the node feature field
feature_name = "feat"
# name of the node label field
label_name = "label"
# train/test split: randomly mark 70% of the nodes as test (False),
# keeping the remaining 30% for training
train_mask = torch.ones(len(g.nodes()), dtype=torch.bool)
train_mask[np.random.permutation(g.nodes())[:int(len(g.nodes())*0.7)]] = False
test_mask = ~train_mask
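
As a quick sanity check (a small addition, assuming the block above has run), the printed shapes are exactly the hyperparameters the model needs later:

# Cora: 2708 paper nodes, 1433-dim bag-of-words features, 7 classes
print(g)
print(g.ndata[feature_name].shape)   # torch.Size([2708, 1433]) -> in_feats = 1433
print(data.num_classes)              # 7 -> out_feats = 7
print(train_mask.sum().item(), test_mask.sum().item())  # ~30% train / ~70% test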

1. Full-Graph Training

So-called full-graph training is simply the most conventional way to train.

In each epoch, the model is optimized on the training set and evaluated on the test set.

1.1 Defining the GNN Model

  • When instantiating the model, you must supply the parameter sizes of each layer
  • When calling the model, the inputs are whatever its forward function expects (see the shape check after the code below)
class GNN_model(nn.Module):
    # define the GNN layers and their parameters
    def __init__(self, in_feats, hid_feats, out_feats, dropout):
        super().__init__()
        self.SAGE1 = dglnn.SAGEConv(in_feats=in_feats,
                                    out_feats=hid_feats,
                                    aggregator_type="mean")
        self.SAGE2 = dglnn.SAGEConv(in_feats=hid_feats,
                                    out_feats=out_feats,
                                    aggregator_type="mean")
        self.dropout = nn.Dropout(dropout)
    # define the forward pass
    def forward(self, graph, inputs):
        h = F.relu(self.SAGE1(graph, inputs))
        h = self.dropout(h)
        h = self.SAGE2(graph, h)
        return h

# model = GNN_model(1433, 128, 7, 0.1)

## helper to compute classification accuracy
def evaluate(predict, label):
    pred_label = predict.argmax(1)   # predicted class = index of the largest logit
    correct = torch.sum(pred_label == label)
    acc = correct.item() * 1.0 / len(label)
    return acc
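
A minimal shape check (an added sketch, assuming g from section 0 is in scope): the model maps each node's 1433-dim feature vector to 7 class scores.

model = GNN_model(in_feats=1433, hid_feats=128, out_feats=7, dropout=0.1)
logits = model(g, g.ndata[feature_name])
print(logits.shape)   # torch.Size([2708, 7]): one score per class for every node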

1.2 Defining the Training Loop

This mainly involves setting up the optimizer, evaluating model performance, early stopping, and so on.

def train(model, graph, feature_name, label_name, train_mask, test_mask,
          num_epochs, learning_rate, weight_decay, patience, verbose=True):
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    val_loss_best = float("inf")
    trigger_times = 0
    node_features = graph.ndata[feature_name]
    node_labels = graph.ndata[label_name]

    for epoch in range(num_epochs):
        model.train()
        predicts = model(graph, node_features)
        loss = F.cross_entropy(predicts[train_mask], node_labels[train_mask])

        # accuracy and test loss reuse this training-mode forward pass,
        # so dropout is still active when they are measured
        train_acc = evaluate(predicts[train_mask], node_labels[train_mask])
        test_acc = evaluate(predicts[test_mask], node_labels[test_mask])
        test_loss = F.cross_entropy(predicts[test_mask], node_labels[test_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if verbose:
            print("Epoch {:03d} | Loss {:.4f} | Train Acc {:.4f} | Test Loss {:.4f} | Test Acc {:.4f}".format(
                epoch, loss.item(), train_acc, test_loss.item(), test_acc))

        # early stopping: note the test set doubles as a validation set here
        if test_loss.item() > val_loss_best:
            trigger_times += 1
            if trigger_times >= patience:
                break
        else:
            trigger_times = 0
            val_loss_best = test_loss.item()

    # returns: final training loss, best test loss, training accuracy, test accuracy
    return loss.item(), val_loss_best, train_acc, test_acc

1.3 Training the Model Once

# model architecture parameters
in_feats, hid_feats, out_feats, dropout = 1433, 128, 7, 0.1
# training parameters
num_epochs, learning_rate, weight_decay, patience = 100, 0.01, 1e-4, 5

# instantiate the model
model = GNN_model(in_feats, hid_feats, out_feats, dropout)
# train the model
metrics = train(model, graph, feature_name, label_name, train_mask, test_mask,
                num_epochs, learning_rate, weight_decay, patience, verbose=False)

metrics
# (0.004630064591765404, 0.6909242868423462, 1.0, 0.78)
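
The tuple follows the return order of train; unpacking it makes the report self-describing (a small added convenience; exact values vary with the random split and initialization):

train_loss, best_test_loss, train_acc, test_acc = metrics
print(f"train_loss={train_loss:.4f} | best_test_loss={best_test_loss:.4f} | "
      f"train_acc={train_acc:.4f} | test_acc={test_acc:.4f}")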

1.4 K-Fold Cross-Validation

# build the training/test masks for fold j
def get_k_fold_data(k, j, node_id, shuffle=True):
    assert k > 1
    fold_size = len(node_id) // k
    idx = slice(j * fold_size, (j + 1) * fold_size)
    np.random.seed(42)   # fixed seed: every call uses the same permutation
    if shuffle:
        node_id2 = np.random.permutation(node_id)
    else:
        node_id2 = node_id
    train_mask = torch.ones(len(node_id2), dtype=torch.bool)
    train_mask[node_id2[idx]] = False   # fold j becomes the test set
    test_mask = ~train_mask
    return train_mask, test_mask

# K-fold training pipeline; returns the metrics of every fold
def k_fold(k, in_feats, hid_feats, out_feats, dropout,
           graph, feature_name, label_name,
           num_epochs, learning_rate, weight_decay, patience):
    k_fold_metrics = []
    node_id = graph.nodes()
    for j in range(k):
        print(f'Fold-{j+1}')
        train_mask, test_mask = get_k_fold_data(k, j, node_id)
        model = GNN_model(in_feats, hid_feats, out_feats, dropout)
        metrics = train(model, graph, feature_name, label_name, train_mask, test_mask,
                        num_epochs, learning_rate, weight_decay, patience, verbose=False)
        k_fold_metrics.append(metrics)
    return k_fold_metrics

# run it
k = 10
## remaining parameters as above
k_fold_metrics = k_fold(k, in_feats, hid_feats, out_feats, dropout,
                        graph, feature_name, label_name,
                        num_epochs, learning_rate, weight_decay, patience)
np.array(k_fold_metrics).mean(0)   # average each metric over the folds
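
Because get_k_fold_data reseeds NumPy with the same seed on every call, all folds are cut from one fixed permutation, so the test folds are pairwise disjoint. A small added check, plus the fold-to-fold spread, which is worth reporting alongside the mean:

# test folds for different j must not overlap
_, test0 = get_k_fold_data(k, 0, g.nodes())
_, test1 = get_k_fold_data(k, 1, g.nodes())
assert not (test0 & test1).any()

# variability across folds, to accompany the mean above
print(np.array(k_fold_metrics).std(0))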

2. Mini-Batch Training

In mini-batch mode, each epoch splits the training set evenly into a number of batches and iterates over them in turn; the model state after the last batch is taken as the result of that epoch. With the roughly 800-node training split used below and batch_size = 32, that comes to about 26 optimizer steps per epoch.

2.1 DGL Node Samplers

  • MultiLayerFullNeighborSampler: set the number of layers to sample; every node samples all of its neighbors
  • NeighborSampler: at every layer, each node samples a fixed number of neighbors (a sketch follows the code below)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
nid = g.nodes()   # seed node IDs: here, all nodes
dataloader = dgl.dataloading.DataLoader(
    g, nid, sampler,
    batch_size=32,
    shuffle=True)

input_nodes, output_nodes, blocks = next(iter(dataloader))

# the 32 seed nodes of this mini-batch
output_nodes
# every node touched by the two hops of neighbors around the mini-batch
input_nodes
# the message-passing blocks, ordered from the outermost hop inward
print(blocks)

# sampler = dgl.dataloading.NeighborSampler([5, 10])
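
Swapping in the fixed-fanout sampler from the comment above is a one-line change; a sketch under the same setup:

# sample at most 5 neighbors per node for the first GNN layer, 10 for the second
sampler = dgl.dataloading.NeighborSampler([5, 10])
dataloader = dgl.dataloading.DataLoader(g, nid, sampler, batch_size=32, shuffle=True)
input_nodes, output_nodes, blocks = next(iter(dataloader))
print(blocks)   # typically far fewer source nodes than with full-neighbor sampling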

2.2 Defining the GNN Model

class GNN_model_batch(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, dropout):
        super().__init__()
        self.SAGE1 = dglnn.SAGEConv(in_feats=in_feats,
                                    out_feats=hid_feats,
                                    aggregator_type="mean")
        self.SAGE2 = dglnn.SAGEConv(in_feats=hid_feats,
                                    out_feats=out_feats,
                                    aggregator_type="mean")
        self.dropout = nn.Dropout(dropout)

    # the main difference from before is the forward pass:
    # it consumes one sampled block per layer instead of the whole graph
    def forward(self, blocks, inputs):
        h = F.relu(self.SAGE1(blocks[0], inputs))
        h = self.dropout(h)
        h = self.SAGE2(blocks[1], h)
        return h
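
Hard-coding blocks[0] and blocks[1] ties the model to exactly two layers. A loop-based variant (a hypothetical sketch, not part of the original) keeps the layer count flexible; the sampler must still produce one block per layer:

class GNN_model_batch_n(nn.Module):
    # hypothetical n-layer variant: feat_sizes = [in_feats, hid1, ..., out_feats]
    def __init__(self, feat_sizes, dropout):
        super().__init__()
        self.layers = nn.ModuleList(
            dglnn.SAGEConv(fin, fout, aggregator_type="mean")
            for fin, fout in zip(feat_sizes[:-1], feat_sizes[1:]))
        self.dropout = nn.Dropout(dropout)

    def forward(self, blocks, h):
        # one block per layer; ReLU + dropout between layers only
        for i, (layer, block) in enumerate(zip(self.layers, blocks)):
            h = layer(block, h)
            if i < len(self.layers) - 1:
                h = self.dropout(F.relu(h))
        return h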

2.3 Defining the Training Loop

def train_batch(model, graph, feature_name, label_name, train_mask, test_mask,
                num_epochs, learning_rate, weight_decay, patience, batch_size, verbose=True):
    # one difference from before: the extra batch_size parameter
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    val_loss_best = float("inf")
    trigger_times = 0
    node_features = graph.ndata[feature_name]
    node_labels = graph.ndata[label_name]
    node_ids = graph.nodes()

    # choose whichever node sampler the task calls for
    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
    train_dataloader = dgl.dataloading.DataLoader(
        graph, node_ids[train_mask], sampler,
        batch_size=batch_size,
        shuffle=True)
    # single-batch loaders, used only for evaluation
    all_train_dataloader = dgl.dataloading.DataLoader(
        graph, node_ids[train_mask], sampler,
        batch_size=len(node_ids[train_mask]),
        shuffle=False)
    all_test_dataloader = dgl.dataloading.DataLoader(
        graph, node_ids[test_mask], sampler,
        batch_size=len(node_ids[test_mask]),
        shuffle=False)

    for epoch in range(num_epochs):
        model.train()
        for it, (_, _, blocks) in enumerate(train_dataloader):
            x = blocks[0].srcdata[feature_name]   # inputs at the outermost block
            y = blocks[-1].dstdata[label_name]    # labels at the seed nodes

            y_hat = model(blocks, x)
            loss = F.cross_entropy(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            _, _, blocks = next(iter(all_train_dataloader))
            x = blocks[0].srcdata[feature_name]
            y = blocks[-1].dstdata[label_name]
            y_hat = model(blocks, x)
            epoch_loss = F.cross_entropy(y_hat, y)   # full-training-set loss (not printed; `loss` below is the last batch's)
            train_acc = evaluate(y_hat, y)

            _, _, blocks = next(iter(all_test_dataloader))
            x = blocks[0].srcdata[feature_name]
            y = blocks[-1].dstdata[label_name]
            y_hat = model(blocks, x)
            test_loss = F.cross_entropy(y_hat, y)
            test_acc = evaluate(y_hat, y)

        if verbose:
            print("Epoch {:03d} | Loss {:.4f} | Train Acc {:.4f} | Test Loss {:.4f} | Test Acc {:.4f}".format(
                epoch, loss.item(), train_acc, test_loss.item(), test_acc))

        # early stopping, as in the full-graph version
        if test_loss.item() > val_loss_best:
            trigger_times += 1
            if trigger_times >= patience:
                break
        else:
            trigger_times = 0
            val_loss_best = test_loss.item()

    # returns: last-batch training loss, best test loss, training accuracy, test accuracy
    return loss.item(), val_loss_best, train_acc, test_acc

2.4 Training the Model Once

# same data setup and a fresh 30%/70% split, as in section 0
graph = g
feature_name = "feat"
label_name = "label"
train_mask = torch.ones(len(g.nodes()), dtype=torch.bool)
train_mask[np.random.permutation(g.nodes())[:int(len(g.nodes())*0.7)]] = False
test_mask = ~train_mask

in_feats, hid_feats, out_feats, dropout = 1433, 128, 7, 0.1
num_epochs, learning_rate, weight_decay, patience = 100, 0.01, 1e-4, 5
batch_size = 32

model = GNN_model_batch(in_feats, hid_feats, out_feats, dropout)
metrics = train_batch(model, graph, feature_name, label_name, train_mask, test_mask,
                      num_epochs, learning_rate, weight_decay, patience, batch_size, verbose=False)

2.5 K-Fold Cross-Validation

# the K-fold mask function get_k_fold_data is the same as in section 1.4
def k_fold(k, in_feats, hid_feats, out_feats, dropout,
           graph, feature_name, label_name,
           num_epochs, learning_rate, weight_decay, patience, batch_size):
    k_fold_metrics = []
    node_id = graph.nodes()
    for j in range(k):
        print(f'Fold-{j+1}')
        train_mask, test_mask = get_k_fold_data(k, j, node_id)
        model = GNN_model_batch(in_feats, hid_feats, out_feats, dropout)
        metrics = train_batch(model, graph, feature_name, label_name, train_mask, test_mask,
                              num_epochs, learning_rate, weight_decay, patience, batch_size, verbose=False)
        k_fold_metrics.append(metrics)
    return k_fold_metrics

k = 10
k_fold_metrics = k_fold(k, in_feats, hid_feats, out_feats, dropout,
                        graph, feature_name, label_name,
                        num_epochs, learning_rate, weight_decay, patience, batch_size)
np.array(k_fold_metrics).mean(0)
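
As before, the four averaged columns are the final training loss, best test loss, training accuracy, and test accuracy; pairing them with labels (a small added convenience):

for name, value in zip(["train_loss", "best_test_loss", "train_acc", "test_acc"],
                       np.array(k_fold_metrics).mean(0)):
    print(f"{name}: {value:.4f}")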