Question 1. Introduction to Heterogeneous Graphs in DeepSNAP
Graph attributes needed to represent a heterogeneous graph:
- node_feature: the feature of each node (torch.tensor)
- edge_feature: the feature of each edge (torch.tensor)
- node_label: the label of each node (int)
- node_type: the node type of each node (string)
- edge_type: the edge type of each edge (string)
In the Question 1 part we use the karate club network dataset as an example. For an introduction to this dataset, including how to load and use it in NetworkX and PyG, see my earlier notes: 图数据集Zachary's karate club network详解,包括其在NetworkX、PyG上的获取和应用方式_诸神缄默不语的博客-CSDN博客
First, load the graph and visualize it colored by class (i.e., by club membership):
```python
from pylab import *
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities
import matplotlib.pyplot as plt
import copy

G = nx.karate_club_graph()
community_map = {}  # key: node index; value: community index (0 or 1)
for node in G.nodes(data=True):
    # With data=True each element is (index, attribute dict), e.g. {'club': 'Mr. Hi'};
    # with the default data=False only the index is yielded
    if node[1]["club"] == "Mr. Hi":
        community_map[node[0]] = 0
    else:
        community_map[node[0]] = 1
node_color = []
color_map = {0: 0, 1: 1}
node_color = [color_map[community_map[node]] for node in G.nodes()]
pos = nx.spring_layout(G)  # explained below
plt.figure(figsize=(7, 7))
nx.draw(G, pos=pos, cmap=plt.get_cmap('coolwarm'), node_color=node_color)
show()
```
About nx.spring_layout(G): this function computes a layout for the nodes, which makes the visualization look nicer.
Documentation: networkx.drawing.layout.spring_layout — NetworkX 2.6.2 documentation
Roughly, it takes a graph (plus optional parameters) and returns a dict keyed by node index whose values are the nodes' coordinates; an example entry: 0: array([ 0.42143337, -0.10723518])
The layout algorithm is the Fruchterman-Reingold force-directed algorithm, which simulates the following: edges act as springs pulling their endpoints together, while nodes repel one another; the layout is the equilibrium state of this simulation.
The returned dict can be passed as the pos argument of nx.draw(), so that the drawn nodes are placed at the coordinates given by this dict.
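For example (a minimal sketch; the seed value is arbitrary), passing the seed parameter makes the layout reproducible across runs:

```python
import networkx as nx

G_demo = nx.karate_club_graph()
pos = nx.spring_layout(G_demo, seed=42)  # fixed seed -> same layout every run
print(pos[0])  # the 2D coordinates of node 0, e.g. array([x, y])
```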
1.1 Question 1.1: Assigning Node Types and Node Features
Using the dictionary community_map and the graph G, add node_type and node_label attributes to G: nodes belonging to the "Mr. Hi" club get node type n0 and node label 0; nodes belonging to the "Officer" club get node type n1 and node label 1.
Give every node the feature vector [1, 1, 1, 1, 1].
Documentation for the NetworkX function nx.classes.function.set_node_attributes used here: networkx.classes.function.set_node_attributes — NetworkX 2.6.2 documentation
Usage example:
```python
G_eg = nx.path_graph(3)
bb = nx.betweenness_centrality(G)  # bb is a dict (note: computed on the karate graph G)
nx.set_node_attributes(G_eg, bb, "betweenness")
G_eg.nodes[1]["betweenness"]
```
0.053936688311688304
Answer code:
```python
import torch

def assign_node_types(G, community_map):
    """
    Given a NetworkX graph G and a community map (a dict mapping nodes to 0/1 labels),
    add the node attribute node_type to G.
    """
    new_cm = {}
    for (k, v) in community_map.items():
        if v == 0:
            new_cm[k] = 'n0'
        else:
            new_cm[k] = 'n1'
    # A more elegant version from an answer I referenced:
    # node_type_map = {0: 'n0', 1: 'n1'}
    # node_types = {node: node_type_map[community_map[node]] for node in G.nodes()}
    nx.set_node_attributes(G, new_cm, 'node_type')

def assign_node_labels(G, community_map):
    """
    Given a NetworkX graph G and a community map (a dict mapping nodes to 0/1 labels),
    add the node attribute node_label to G.
    """
    nx.set_node_attributes(G, community_map, 'node_label')

def assign_node_features(G):
    """
    Given a NetworkX graph G,
    add the node attribute node_feature to G.
    """
    feature_vector = [1, 1, 1, 1, 1]
    nx.set_node_attributes(G, feature_vector, 'node_feature')

assign_node_types(G, community_map)
assign_node_labels(G, community_map)
assign_node_features(G)
```
Code to verify the result:
```python
for n in G.nodes(data=True):
    print(n)
    break
```
(0, {'club': 'Mr. Hi', 'node_type': 'n0', 'node_label': 0, 'node_feature': [1, 1, 1, 1, 1]})
1.2 Question 1.2: Assigning Edge Types
Assignment rules:
- Edges within club “Mr. Hi”: e0
- Edges within club “Officer”: e1
- Edges between clubs: e2
Documentation for the NetworkX function nx.classes.function.set_edge_attributes used here: networkx.classes.function.set_edge_attributes — NetworkX 2.6.2 documentation
Answer code:
```python
def assign_edge_types(G, community_map):
    """
    Given a NetworkX graph G and a community map (a dict mapping nodes to 0/1 labels),
    add the edge attribute edge_type to G.
    """
    # Note: I believe the question intends community_map to be used here,
    # but using the club attribute should be equivalent
    edge2attr_map = {}
    for edge in G.edges():
        if G.nodes[edge[0]]['club'] == 'Mr. Hi' and G.nodes[edge[1]]['club'] == 'Mr. Hi':
            edge2attr_map[edge] = 'e0'
        elif G.nodes[edge[0]]['club'] == 'Officer' and G.nodes[edge[1]]['club'] == 'Officer':
            edge2attr_map[edge] = 'e1'
        else:
            edge2attr_map[edge] = 'e2'
    nx.set_edge_attributes(G, edge2attr_map, 'edge_type')

assign_edge_types(G, community_map)
```
Code to verify the result:
```python
for edge in G.edges(data=True):
    print(edge)
    break
```
(0, 1, {'edge_type': 'e0'})
1.3 Visualizing the Heterogeneous Graph in NetworkX
```python
edge_color = {}
for edge in G.edges():
    n1, n2 = edge
    if community_map[n1] == community_map[n2] and community_map[n1] == 0:
        edge_color[edge] = 'blue'
    elif community_map[n1] == community_map[n2] and community_map[n1] == 1:
        edge_color[edge] = 'red'
    else:
        edge_color[edge] = 'green'

G_orig = copy.deepcopy(G)
nx.classes.function.set_edge_attributes(G, edge_color, name='color')
colors = nx.get_edge_attributes(G, 'color').values()
labels = nx.get_node_attributes(G, 'node_type')

plt.figure(figsize=(8, 8))
nx.draw(G, pos=pos, cmap=plt.get_cmap('coolwarm'), node_color=node_color,
        edge_color=colors, labels=labels, font_color='white')
show()
```
1.4 Converting a NetworkX Heterogeneous Graph to a DeepSNAP Heterogeneous Graph
```python
from deepsnap.hetero_graph import HeteroGraph

hete = HeteroGraph(G_orig)
```
Note that this step is a bit finicky: if you use G as the NetworkX backend, it raises TypeError: Unknown type color in edge attributes.
I looked at the corresponding source code (deepsnap.hetero_graph — DeepSNAP 0.2.0 documentation) and found out what is going on:
Node attributes of G_orig:
```python
G_orig.nodes(data=True)[0]
```
Output:
{'club': 'Mr. Hi', 'node_type': 'n0', 'node_label': 0, 'node_feature': [1, 1, 1, 1, 1]}
Edge attributes of G_orig:
```python
for e in G_orig.edges(data=True):
    print(e)
    break
```
Output:
(0, 1, {'edge_type': 'e0'})
Edge attributes of G:
```python
for e in G.edges(data=True):
    print(e)
    break
```
Output:
(0, 1, {'edge_type': 'e0', 'color': 'blue'})
The corresponding DeepSNAP code:
```python
def _get_edge_attributes(self, key: str):
    r"""
    Similar to the `_get_node_attributes`
    """
    attributes = {}
    indices = None
    # TODO: suspect edge_to_tensor_mapping and edge_to_graph_mapping not useful
    if key == "edge_type":
        indices = {}
    for edge_idx, (head, tail, edge_dict) in enumerate(
        self.G.edges(data=True)
    ):
        if key in edge_dict:
            head_type = self.G.nodes[head]["node_type"]
            tail_type = self.G.nodes[tail]["node_type"]
            edge_type = self._get_edge_type(edge_dict)
            message_type = (head_type, edge_type, tail_type)
            if message_type not in attributes:
                attributes[message_type] = []
            attributes[message_type].append(edge_dict[key])
            if indices is not None:
                if message_type not in indices:
                    indices[message_type] = []
                indices[message_type].append(edge_idx)
    if len(attributes) == 0:
        return None
    for message_type, val in attributes.items():
        if torch.is_tensor(attributes[message_type][0]):
            attributes[message_type] = torch.stack(val, dim=0)
        elif isinstance(attributes[message_type][0], float):
            attributes[message_type] = torch.tensor(val, dtype=torch.float)
        elif isinstance(attributes[message_type][0], int):
            attributes[message_type] = torch.tensor(val, dtype=torch.long)
        elif (
            isinstance(attributes[message_type][0], str)
            and key == "edge_type"
        ):
            continue
        else:
            raise TypeError(f"Unknown type {key} in edge attributes.")
```
In short: except for edge_type, edge attributes must not be str, so the color attribute triggers the error.
This naturally raises a question: why is the node attribute club (also a str) fine? I skimmed _get_node_attributes() and it simply does not impose the restriction that the edge version does.
I am not sure whether this is an oversight by the authors or a misunderstanding on my part; I haven't bothered to ask, and I'll look into it if I ever need DeepSNAP again.
In any case, that is the situation, noted here for the record.
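A minimal workaround sketch (my own, not from the assignment): if you do want to start from G, delete the str-valued color attribute first, which is effectively what using the pre-coloring copy G_orig achieves:

```python
import copy

G_clean = copy.deepcopy(G)
# Drop the str-valued edge attribute that _get_edge_attributes chokes on
for u, v in G_clean.edges():
    G_clean[u][v].pop('color', None)

hete = HeteroGraph(G_clean)  # no TypeError now
```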
We can print the attributes of the heterogeneous graph to inspect it:
```python
for hetero_feature in hete:
    print(hetero_feature)
```
Output omitted.
1.5 Question 1.3: How Many Nodes Does Each Node Type Have?
The node_type attribute of hete is a dict whose keys are node_type values (such as n0). If the keys are str, each value is a list like ['n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0', 'n0']; if the keys are int, each value is a Tensor.
```python
def get_nodes_per_type(hete):
    num_nodes_n0 = len(hete.node_type['n0'])
    num_nodes_n1 = len(hete.node_type['n1'])
    return num_nodes_n0, num_nodes_n1

num_nodes_n0, num_nodes_n1 = get_nodes_per_type(hete)

print("Node type n0 has {} nodes".format(num_nodes_n0))
print("Node type n1 has {} nodes".format(num_nodes_n1))
```
Output:
```
Node type n0 has 17 nodes
Node type n1 has 17 nodes
```
1.6 Question 1.4: How Many Edges Does Each Message Type Have?
A message type is the combination of node types and an edge type.
```python
hete.message_types
```
Output:
[('n0', 'e0', 'n0'), ('n0', 'e2', 'n1'), ('n1', 'e1', 'n1')]
hete.edge_type is a dict whose keys are message type values; an example element:
```python
hete.edge_type[('n0', 'e0', 'n0')]
```
The output is a list whose elements are all 'e0' (details omitted).
Answer code:
```python
def get_num_message_edges(hete):
    """
    Return a list of tuples (message_type, num_edge).
    """
    message_type_edges = []
    for message_type, num_edge in hete.edge_type.items():
        message_type_edges.append((message_type, len(num_edge)))
    return message_type_edges

message_type_edges = get_num_message_edges(hete)

for (message_type, num_edges) in message_type_edges:
    print("Message type {} has {} edges".format(message_type, num_edges))
```
Output:
```
Message type ('n0', 'e0', 'n0') has 35 edges
Message type ('n0', 'e2', 'n1') has 11 edges
Message type ('n1', 'e1', 'n1') has 32 edges
```
1.7 Question 1.5: Dataset Splitting: How Many Nodes Are in Each Split?
DeepSNAP has a built-in dataset splitting function.
Answer code:
```python
from deepsnap.dataset import GraphDataset

def compute_dataset_split_counts(datasets):
    """
    Input: the dict produced by the dataset split
           (keys 'train'/'val'/'test', values the corresponding GraphDataset)
    Returns: a dict (keys 'train'/'val'/'test', values the number of labeled
             nodes in the corresponding split)
    """
    data_set_splits = {}
    for ds_name, ds in datasets.items():
        # print(ds_name)  # train
        # print(ds[0].node_label_index)
        # {'n0': tensor([10, 8, 3, 12, 0, 13]), 'n1': tensor([ 0, 8, 1, 15, 5, 7])}
        data_set_splits[ds_name] = (ds[0].node_label_index['n0'].shape[0]
                                    + ds[0].node_label_index['n1'].shape[0])
        # node_label_index is the recommended attribute here, though I suspect
        # node_label would work as well; node_label_index is explained below
    return data_set_splits

dataset = GraphDataset([hete], task='node')

# Splitting the dataset
dataset_train, dataset_val, dataset_test = dataset.split(transductive=True, split_ratio=[0.4, 0.3, 0.3])
datasets = {'train': dataset_train, 'val': dataset_val, 'test': dataset_test}

data_set_splits = compute_dataset_split_counts(datasets)

for dataset_name, num_nodes in data_set_splits.items():
    print("{} dataset has {} nodes".format(dataset_name, num_nodes))
```
Output:
```
train dataset has 12 nodes
val dataset has 10 nodes
test dataset has 12 nodes
```
The DeepSNAP documentation describes HeteroGraph.node_label_index as: "Slicing node label to get the corresponding split G.node_label[G.node_label_index]." (from Introduction — DeepSNAP 0.2.0 documentation)
That description is pretty cryptic. What it means is that node_label_index maps the nodes of a split back to their labels in the original graph by indexing. For example:
```python
data_train = dataset_train[0]
print(data_train.node_label)
print(data_train.node_label_index)
print(hete.node_label)
print(hete.node_label_index)
print(hete.node_label['n0'][data_train.node_label_index['n0']])
```
Output:
```
{'n0': tensor([0, 0, 0, 0, 0, 0]), 'n1': tensor([1, 1, 1, 1, 1, 1])}
{'n0': tensor([ 5, 13, 14, 9, 0, 2]), 'n1': tensor([ 6, 11, 4, 13, 9, 15])}
{'n0': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'n1': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}
{'n0': tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]), 'n1': tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])}
tensor([0, 0, 0, 0, 0, 0])
```
1.8 Visualizing the DeepSNAP Dataset Splits
```python
from deepsnap.dataset import GraphDataset

dataset = GraphDataset([hete], task='node')

# Splitting the dataset
dataset_train, dataset_val, dataset_test = dataset.split(transductive=True, split_ratio=[0.4, 0.3, 0.3])

titles = ['Train', 'Validation', 'Test']

for i, dataset in enumerate([dataset_train, dataset_val, dataset_test]):
    n0 = hete._convert_to_graph_index(dataset[0].node_label_index['n0'], 'n0').tolist()
    # e.g. [21, 5, 7, 8, 16, 11]
    # Judging from the context, this returns the original-graph indices of the
    # split's nodes with node_type n0; _convert_to_graph_index() returns a Tensor
    n1 = hete._convert_to_graph_index(dataset[0].node_label_index['n1'], 'n1').tolist()

    plt.figure(figsize=(7, 7))
    plt.title(titles[i])
    nx.draw(G_orig, pos=pos, node_color="grey", edge_color=colors, labels=labels, font_color='white')
    nx.draw_networkx_nodes(G_orig.subgraph(n0), pos=pos, node_color="blue")
    # subgraph() presumably returns the node-induced subgraph; I couldn't find
    # documentation for this exact usage
    nx.draw_networkx_nodes(G_orig.subgraph(n1), pos=pos, node_color="red")
    show()
```
2. Node Prediction on Heterogeneous Graphs
This part appears to be adapted from DeepSNAP's official heterogeneous-graph node classification example: deepsnap/node_classification_acm.py at master · snap-stanford/deepsnap
So my answers also borrow partly from someone else's colab and partly from that official example (after all, my guess is that the instructors built this question by modifying that official code).
First, suppose we have a graph $G$ with two node types $a$ and $b$, and three message types $m_1 = (a, r_1, a)$, $m_2 = (a, r_2, b)$, and $m_3 = (a, r_3, b)$.
A heterogeneous layer then contains three Heterogeneous GNN layers (HeteroGNNConv in this colab), one per message type; each HeteroGNNConv performs message passing and aggregation for its single message type.
Overall algorithm flow (the figure from the colab is omitted here):
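As a sketch, my reconstruction from the weight annotations in the HeteroGNNConv code below (lin_dst = $W_d^{(l)[m]}$, lin_src = $W_s^{(l)[m]}$, lin_update = $W^{(l)[m]}$; the aggregation is a mean over source neighbors), the per-message-type update is roughly:

```latex
h_v^{(l)[m]} = W^{(l)[m]} \cdot \mathrm{CONCAT}\left(
    W_d^{(l)[m]} h_v^{(l-1)},\;
    W_s^{(l)[m]} \cdot \frac{1}{|N_m(v)|} \sum_{u \in N_m(v)} h_u^{(l-1)}
\right)
```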
In this colab, the layer-$l$ heterogeneous GNN layer is managed by the layer-$l$ Heterogeneous GNN Wrapper layer (HeteroGNNWrapperConv in this colab), which takes the node embeddings from the previous layer, performs message passing and aggregation, and produces the node embeddings for the next layer.
Overall algorithm flow (the figure from the colab is omitted here):
2.1 Imports
```python
import copy
import torch
import deepsnap
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.nn as pyg_nn

from sklearn.metrics import f1_score
from deepsnap.hetero_gnn import forward_op
from deepsnap.hetero_graph import HeteroGraph
from torch_sparse import SparseTensor, matmul
```
2.2 Heterogeneous GNN Layer
```python
class HeteroGNNConv(pyg_nn.MessagePassing):
    def __init__(self, in_channels_src, in_channels_dst, out_channels):
        super(HeteroGNNConv, self).__init__(aggr="mean")

        self.in_channels_src = in_channels_src
        self.in_channels_dst = in_channels_dst
        self.out_channels = out_channels

        self.lin_dst = nn.Linear(in_channels_dst, out_channels)  # W_d^{(l)[m]}
        self.lin_src = nn.Linear(in_channels_src, out_channels)  # W_s^{(l)[m]}
        self.lin_update = nn.Linear(out_channels * 2, out_channels)  # W^{(l)[m]}

    def forward(
        self,
        node_feature_src,
        node_feature_dst,
        edge_index,
        size=None,
        res_n_id=None,
    ):
        return self.propagate(edge_index, size=size,
                              node_feature_src=node_feature_src,
                              node_feature_dst=node_feature_dst, res_n_id=res_n_id)

    def message_and_aggregate(self, edge_index, node_feature_src):
        # Here edge_index is a torch_sparse SparseTensor. For each destination
        # node this aggregates (here: averages) the features of its source
        # neighbors; see the sketch after this block
        out = matmul(edge_index, node_feature_src, reduce=self.aggr)
        return out

    def update(self, aggr_out, node_feature_dst, res_n_id):
        aggr_out = self.lin_src(aggr_out)
        node_feature_dst = self.lin_dst(node_feature_dst)
        concat_features = torch.cat((node_feature_dst, aggr_out), dim=-1)  # dim=-1 is dim 1 here
        aggr_out = self.lin_update(concat_features)
        return aggr_out
```
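To make the matmul line concrete, here is a tiny self-contained example (my own illustration, not from the assignment) showing that, with a SparseTensor adjacency whose rows index destination nodes, matmul(adj, x, reduce='mean') yields for each destination node the mean of its source neighbors' features:

```python
import torch
from torch_sparse import SparseTensor, matmul

row = torch.tensor([0, 0, 1])  # destination node indices
col = torch.tensor([1, 2, 2])  # source node indices
adj = SparseTensor(row=row, col=col, sparse_sizes=(3, 3))

x = torch.arange(9, dtype=torch.float).view(3, 3)  # features of 3 source nodes
out = matmul(adj, x, reduce='mean')
print(out)
# out[0] == (x[1] + x[2]) / 2   (node 0 has sources 1 and 2)
# out[1] == x[2]                (node 1 has source 2)
# out[2] == zeros               (node 2 has no sources)
```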
2.3 Heterogeneous GNN Wrapper Layer
After applying a per-message-type GNN layer (HeteroGNNConv) to each message type, we need to aggregate the per-message-type results within each layer.
This colab uses two aggregation methods.
First: mean
Let node $v$ have node type $d$, and let $M$ be the number of message types whose destination node type is $d$.
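The mean-aggregation formula was an image in the original colab; reconstructed here to be consistent with the aggregate() code below (my reading, not the official handout):

```latex
h_v^{(l)[d]} = \frac{1}{M} \sum_{m=1}^{M} h_v^{(l)[m]}
```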
Second: the semantic-level attention introduced in HAN (Wang et al. (2019))
Here $m$ ranges over the message types and $d$ is the destination node type.
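The attention formulas were likewise images; reconstructed here to match the attn branch of aggregate() below, where $W$, $b$ are the first Linear layer of attn_proj, $q$ is the final bias-free Linear layer, and $V_d$ is the set of nodes of type $d$ (again my reading, not the official handout):

```latex
e_m = \frac{1}{|V_d|} \sum_{v \in V_d} q^{\top} \tanh\left( W \, h_v^{(l)[m]} + b \right)

\alpha_m = \frac{\exp(e_m)}{\sum_{m'=1}^{M} \exp(e_{m'})}

h_v^{(l)[d]} = \sum_{m=1}^{M} \alpha_m \, h_v^{(l)[m]}
```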
```python
class HeteroGNNWrapperConv(deepsnap.hetero_gnn.HeteroConv):
    # Documentation: https://snap.stanford.edu/deepsnap/modules/hetero_gnn.html
    def __init__(self, convs, args, aggr="mean"):
        super(HeteroGNNWrapperConv, self).__init__(convs, None)
        self.aggr = aggr

        # Map the index and message type
        self.mapping = {}

        # A numpy array that stores the final attention probability
        self.alpha = None

        self.attn_proj = None

        if self.aggr == "attn":
            self.attn_proj = nn.Sequential(
                nn.Linear(args['hidden_size'], args['attn_size']),
                nn.Tanh(),
                nn.Linear(args['attn_size'], 1, bias=False),
            )

    def reset_parameters(self):
        super(HeteroGNNWrapperConv, self).reset_parameters()  # fixed: was HeteroConvWrapper
        if self.aggr == "attn":
            for layer in self.attn_proj.children():
                if hasattr(layer, 'reset_parameters'):  # nn.Tanh has no reset_parameters
                    layer.reset_parameters()

    def forward(self, node_features, edge_indices):
        # edge_indices: dict mapping each message type to its edge_index Tensor
        message_type_emb = {}
        for message_key, message_type in edge_indices.items():
            src_type, edge_type, dst_type = message_key
            node_feature_src = node_features[src_type]
            node_feature_dst = node_features[dst_type]
            edge_index = edge_indices[message_key]
            message_type_emb[message_key] = (
                self.convs[message_key](
                    node_feature_src,
                    node_feature_dst,
                    edge_index,
                )
            )
        node_emb = {dst: [] for _, _, dst in message_type_emb.keys()}
        mapping = {}
        for (src, edge_type, dst), item in message_type_emb.items():
            mapping[len(node_emb[dst])] = (src, edge_type, dst)
            node_emb[dst].append(item)
        # mapping example: {0: ('paper', 'author', 'paper'), 1: ('paper', 'subject', 'paper')}
        self.mapping = mapping
        for node_type, embs in node_emb.items():
            if len(embs) == 1:
                node_emb[node_type] = embs[0]
            else:
                node_emb[node_type] = self.aggregate(embs)
        return node_emb

    def aggregate(self, xs):
        # xs is a list of Tensors (one embedding Tensor per message type)
        if self.aggr == "mean":
            x = torch.stack(xs, dim=-1)
            return x.mean(dim=-1)

        elif self.aggr == "attn":
            N = xs[0].shape[0]  # Number of nodes for that node type
            M = len(xs)         # Number of message types for that node type

            x = torch.cat(xs, dim=0).view(M, N, -1)  # M * N * D
            z = self.attn_proj(x).view(M, N)  # M * N
            z = z.mean(1)  # (M,)
            alpha = torch.softmax(z, dim=0)  # (M,)
            # Store the attention result to self.alpha as an np array of shape (len(xs),);
            # self.alpha is not used in backprop, it only records how much
            # attention each layer puts on each message type
            self.alpha = alpha.view(-1).data.cpu().numpy()
            alpha = alpha.view(M, 1, 1)
            x = x * alpha
            return x.sum(dim=0)
```
2.4 Initializing the Heterogeneous GNN Layers
```python
def generate_convs(hetero_graph, conv, hidden_size, first_layer=False):
    """
    Args:
        hetero_graph: a DeepSNAP `HeteroGraph` object
        conv: HeteroGNNConv
        First layer: input size = feature size, output size = hidden size
        Other layers: input size = hidden size, output size = hidden size
    Returns: a dict of `HeteroGNNConv` layers keyed by message type.
    """
    convs = {}
    for message_type in hetero_graph.message_types:
        if first_layer is True:
            src_type = message_type[0]
            dst_type = message_type[2]
            src_size = hetero_graph.num_node_features(src_type)
            dst_size = hetero_graph.num_node_features(dst_type)
            convs[message_type] = conv(src_size, dst_size, hidden_size)
        else:
            convs[message_type] = conv(hidden_size, hidden_size, hidden_size)
    return convs
```
Note that the recommended method here is deepsnap.hetero_graph.HeteroGraph.num_node_features(node_type); however, in my tests, calling hete.num_node_features('n1') on the heterogeneous graph hete built in Question 1 raises AttributeError: 'list' object has no attribute 'shape'.
This is presumably because the features on hete are stored as lists rather than Tensors; I'd call this a rough edge in DeepSNAP.
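A possible workaround sketch (my own guess, untested): store the node features as Tensors instead of Python lists when building the NetworkX graph, then rebuild the HeteroGraph:

```python
import torch

# Give every node a Tensor feature instead of the list [1, 1, 1, 1, 1]
tensor_features = {n: torch.ones(5) for n in G_orig.nodes()}
nx.set_node_attributes(G_orig, tensor_features, 'node_feature')
hete = HeteroGraph(G_orig)  # num_node_features should now be able to read .shape
```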
2.5 HeteroGNN
We build a HeteroGNN model containing two HeteroGNNWrapperConv layers; the forward pass is:
self.convs1 → self.bns1 → self.relus1 → self.convs2 → self.bns2 → self.relus2 → self.post_mps
```python
class HeteroGNN(torch.nn.Module):
    def __init__(self, hetero_graph, args, aggr="mean"):
        super(HeteroGNN, self).__init__()

        self.aggr = aggr
        self.hidden_size = args['hidden_size']

        self.bns1 = nn.ModuleDict()
        self.bns2 = nn.ModuleDict()
        self.relus1 = nn.ModuleDict()
        self.relus2 = nn.ModuleDict()
        self.post_mps = nn.ModuleDict()

        convs1 = generate_convs(hetero_graph, HeteroGNNConv, self.hidden_size, first_layer=True)
        convs2 = generate_convs(hetero_graph, HeteroGNNConv, self.hidden_size)
        self.convs1 = HeteroGNNWrapperConv(convs1, args, aggr=self.aggr)
        self.convs2 = HeteroGNNWrapperConv(convs2, args, aggr=self.aggr)

        for node_type in hetero_graph.node_types:
            self.bns1[node_type] = torch.nn.BatchNorm1d(self.hidden_size, eps=1)
            self.bns2[node_type] = torch.nn.BatchNorm1d(self.hidden_size, eps=1)
            self.post_mps[node_type] = nn.Linear(self.hidden_size, hetero_graph.num_node_labels(node_type))
            self.relus1[node_type] = nn.LeakyReLU()
            self.relus2[node_type] = nn.LeakyReLU()

    def forward(self, node_feature, edge_index):
        # node_feature: dict mapping node types to feature Tensors
        # edge_index: dict mapping message types to edge_index Tensors
        x = node_feature
        x = self.convs1(x, edge_index)
        x = forward_op(x, self.bns1)  # forward_op is explained below
        x = forward_op(x, self.relus1)
        x = self.convs2(x, edge_index)
        x = forward_op(x, self.bns2)
        x = forward_op(x, self.relus2)
        x = forward_op(x, self.post_mps)
        return x

    def loss(self, preds, y, indices):
        loss = 0
        loss_func = F.cross_entropy
        for node_type in preds:
            idx = indices[node_type]
            loss += loss_func(preds[node_type][idx], y[node_type][idx])
        return loss
```
forward_op(x, module_dict, **kwargs):
Documentation: deepsnap.hetero_gnn.forward_op
Roughly: given x and module_dict in the formats shown in the code above, forward_op() matches the two by key and applies each value of module_dict to the corresponding value of x, forwarding any extra arguments.
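As a rough sketch (my own paraphrase of the documented behavior, not the library source), forward_op behaves roughly like this hypothetical helper:

```python
def forward_op_sketch(x, module_dict, **kwargs):
    # For each key of x, run the module stored under the same key,
    # e.g. x = {'paper': Tensor}, module_dict = {'paper': nn.BatchNorm1d(...)}
    return {key: module_dict[key](x[key], **kwargs) for key in x}
```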
2.6 The train() and test() Functions
```python
def train(model, optimizer, hetero_graph, train_idx):
    model.train()
    optimizer.zero_grad()
    preds = model(hetero_graph.node_feature, hetero_graph.edge_index)
    loss = model.loss(preds, hetero_graph.node_label, train_idx)
    loss.backward()
    optimizer.step()
    return loss.item()

def test(model, graph, indices, best_model=None, best_val=0):
    model.eval()
    accs = []
    for index in indices:
        preds = model(graph.node_feature, graph.edge_index)
        num_node_types = 0
        micro = 0
        macro = 0
        for node_type in preds:
            idx = index[node_type]
            pred = preds[node_type][idx]
            pred = pred.max(1)[1]
            label_np = graph.node_label[node_type][idx].cpu().numpy()
            pred_np = pred.cpu().numpy()
            micro = f1_score(label_np, pred_np, average='micro')
            macro = f1_score(label_np, pred_np, average='macro')
            num_node_types += 1
        # Note: averaging F1 scores over node types like this is not really
        # meaningful, but our dataset has only one node type, so it makes no
        # difference here
        micro /= num_node_types
        macro /= num_node_types
        accs.append((micro, macro))
    # accs[1] is the validation split
    if accs[1][0] > best_val:
        best_val = accs[1][0]
        # The deep copy matters: a plain assignment would keep a reference to
        # the live model, whose weights keep changing as training continues.
        # I have been bitten by deep vs. shallow copies before (see the toy
        # sketch after this block); I also plan to write a separate post on
        # deep copies, shallow copies and plain references
        best_model = copy.deepcopy(model)
    return accs, best_model, best_val
```
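As a quick illustration of the deep-copy point (a toy sketch of standard PyTorch behavior, not part of the assignment):

```python
import copy
import torch.nn as nn

m = nn.Linear(2, 2)
ref = m                      # just another name for the same module
snapshot = copy.deepcopy(m)  # an independent copy of the current weights

nn.init.zeros_(m.weight)     # simulate a training update to m (and thus ref)...
print(ref.weight.sum().item())       # 0.0: ref changed along with m
print(snapshot.weight.sum().item())  # unchanged: snapshot kept the old weights
```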
2.7 Hyperparameter Settings
```python
args = {
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'hidden_size': 64,
    'epochs': 100,
    'weight_decay': 1e-5,
    'lr': 0.003,
    'attn_size': 32,
}
```
2.8 Loading and Preprocessing the Dataset
From this point on we use the Tensor backend instead of the NetworkX backend.
The ACM(3025) dataset used in this colab comes from HAN (Wang et al. (2019)); the version used here was extracted from DGL's ACM.mat.
The original ACM dataset has 3 node types and 2 edge (relation) types. For simplicity, we reduce it to a single node type (paper) and 2 message types, (paper, author, paper) and (paper, subject, paper).
For how to download the dataset, see the 2021/6/21 update in the README of my GitHub project: PolarisRisingWar/cs224w-2021-winter-colab: cs224w(图机器学习)2021冬季课程的colab
print("Device: {}".format(args['device'])) # Load the data data = torch.load("acm.pkl") #data是一个字典,key是str,value是Tensor # Message types message_type_1 = ("paper", "author", "paper") message_type_2 = ("paper", "subject", "paper") # Dictionary of edge indices edge_index = {} edge_index[message_type_1] = data['pap'] edge_index[message_type_2] = data['psp'] # Dictionary of node features node_feature = {} node_feature["paper"] = data['feature'] # Dictionary of node labels node_label = {} node_label["paper"] = data['label'] # Load the train, validation and test indices train_idx = {"paper": data['train_idx'].to(args['device'])} val_idx = {"paper": data['val_idx'].to(args['device'])} test_idx = {"paper": data['test_idx'].to(args['device'])} # Construct a deepsnap tensor backend HeteroGraph hetero_graph = HeteroGraph( node_feature=node_feature, node_label=node_label, edge_index=edge_index, directed=True ) print(f"ACM heterogeneous graph: {hetero_graph.num_nodes()} nodes, {hetero_graph.num_edges()} edges") # Node feature and node label to device for key in hetero_graph.node_feature: hetero_graph.node_feature[key] = hetero_graph.node_feature[key].to(args['device']) for key in hetero_graph.node_label: hetero_graph.node_label[key] = hetero_graph.node_label[key].to(args['device']) # Edge_index to sparse tensor and to device for key in hetero_graph.edge_index: edge_index = hetero_graph.edge_index[key] adj = SparseTensor(row=edge_index[0], col=edge_index[1], sparse_sizes=(hetero_graph.num_nodes('paper'), hetero_graph.num_nodes('paper'))) hetero_graph.edge_index[key] = adj.t().to(args['device']) print(hetero_graph.edge_index[message_type_1]) print(hetero_graph.edge_index[message_type_2])
Output:
```
Device: cuda
ACM heterogeneous graph: {'paper': 3025} nodes, {('paper', 'author', 'paper'): 26256, ('paper', 'subject', 'paper'): 2207736} edges
SparseTensor(row=tensor([ 0, 0, 0, ..., 3024, 3024, 3024], device='cuda:0'),
             col=tensor([ 8, 20, 51, ..., 2948, 2983, 2991], device='cuda:0'),
             size=(3025, 3025), nnz=26256, density=0.29%)
SparseTensor(row=tensor([ 0, 0, 0, ..., 3024, 3024, 3024], device='cuda:0'),
             col=tensor([ 75, 434, 534, ..., 3020, 3021, 3022], device='cuda:0'),
             size=(3025, 3025), nnz=2207736, density=24.13%)
```
2.9 Training the Mean Aggregation
```python
best_model = None
best_val = 0

model = HeteroGNN(hetero_graph, args, aggr="mean").to(args['device'])
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])

for epoch in range(args['epochs']):
    loss = train(model, optimizer, hetero_graph, train_idx)
    accs, best_model, best_val = test(model, hetero_graph, [train_idx, val_idx, test_idx], best_model, best_val)
    print(
        f"Epoch {epoch + 1}: loss {round(loss, 5)}, "
        f"train micro {round(accs[0][0] * 100, 2)}%, train macro {round(accs[0][1] * 100, 2)}%, "
        f"valid micro {round(accs[1][0] * 100, 2)}%, valid macro {round(accs[1][1] * 100, 2)}%, "
        f"test micro {round(accs[2][0] * 100, 2)}%, test macro {round(accs[2][1] * 100, 2)}%"
    )

best_accs, _, _ = test(best_model, hetero_graph, [train_idx, val_idx, test_idx])
print(
    f"Best model: "
    f"train micro {round(best_accs[0][0] * 100, 2)}%, train macro {round(best_accs[0][1] * 100, 2)}%, "
    f"valid micro {round(best_accs[1][0] * 100, 2)}%, valid macro {round(best_accs[1][1] * 100, 2)}%, "
    f"test micro {round(best_accs[2][0] * 100, 2)}%, test macro {round(best_accs[2][1] * 100, 2)}%"
)
```
Per-epoch output omitted; output for the best model:
Best model: train micro 99.83%, train macro 99.83%, valid micro 98.33%, valid macro 98.33%, test micro 87.86%, test macro 87.78%
2.10 Training the Attention Aggregation
```python
best_model = None
best_val = 0

output_size = hetero_graph.num_node_labels('paper')
model = HeteroGNN(hetero_graph, args, aggr="attn").to(args['device'])
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])

for epoch in range(args['epochs']):
    loss = train(model, optimizer, hetero_graph, train_idx)
    accs, best_model, best_val = test(model, hetero_graph, [train_idx, val_idx, test_idx], best_model, best_val)
    print(
        f"Epoch {epoch + 1}: loss {round(loss, 5)}, "
        f"train micro {round(accs[0][0] * 100, 2)}%, train macro {round(accs[0][1] * 100, 2)}%, "
        f"valid micro {round(accs[1][0] * 100, 2)}%, valid macro {round(accs[1][1] * 100, 2)}%, "
        f"test micro {round(accs[2][0] * 100, 2)}%, test macro {round(accs[2][1] * 100, 2)}%"
    )

best_accs, _, _ = test(best_model, hetero_graph, [train_idx, val_idx, test_idx])
print(
    f"Best model: "
    f"train micro {round(best_accs[0][0] * 100, 2)}%, train macro {round(best_accs[0][1] * 100, 2)}%, "
    f"valid micro {round(best_accs[1][0] * 100, 2)}%, valid macro {round(best_accs[1][1] * 100, 2)}%, "
    f"test micro {round(best_accs[2][0] * 100, 2)}%, test macro {round(best_accs[2][1] * 100, 2)}%"
)
```
Per-epoch output omitted; output for the best model:
Best model: train micro 99.67%, train macro 99.67%, valid micro 97.67%, valid macro 97.66%, test micro 85.79%, test macro 85.27%
2.11 Attention for each Message Type
```python
if model.convs1.alpha is not None and model.convs2.alpha is not None:
    for idx, message_type in model.convs1.mapping.items():
        print(f"Layer 1 has attention {model.convs1.alpha[idx]} on message type {message_type}")
    for idx, message_type in model.convs2.mapping.items():
        print(f"Layer 2 has attention {model.convs2.alpha[idx]} on message type {message_type}")
```
Output:
```
Layer 1 has attention 0.960588812828064 on message type ('paper', 'author', 'paper')
Layer 1 has attention 0.03941113129258156 on message type ('paper', 'subject', 'paper')
Layer 2 has attention 0.30975428223609924 on message type ('paper', 'author', 'paper')
Layer 2 has attention 0.6902456879615784 on message type ('paper', 'subject', 'paper')
```