You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

weisfeilerLehmanKernel.py 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. import sys
  2. import pathlib
  3. sys.path.insert(0, "../")
  4. import networkx as nx
  5. import numpy as np
  6. import time
  7. from pygraph.kernels.spkernel import spkernel
  8. from pygraph.kernels.pathKernel import pathkernel
  9. # test of WL subtree kernel on many graphs
  10. import sys
  11. import pathlib
  12. from collections import Counter
  13. sys.path.insert(0, "../")
  14. import networkx as nx
  15. import numpy as np
  16. import time
  17. from pygraph.kernels.spkernel import spkernel
  18. from pygraph.kernels.pathKernel import pathkernel
  19. def weisfeilerlehmankernel(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'):
  20. """Calculate Weisfeiler-Lehman kernels between graphs.
  21. Parameters
  22. ----------
  23. Gn : List of NetworkX graph
  24. List of graphs between which the kernels are calculated.
  25. /
  26. G1, G2 : NetworkX graphs
  27. 2 graphs between which the kernel is calculated.
  28. node_label : string
  29. node attribute used as label. The default node label is atom.
  30. edge_label : string
  31. edge attribute used as label. The default edge label is bond_type.
  32. height : int
  33. subtree height
  34. base_kernel : string
  35. base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel.
  36. Return
  37. ------
  38. Kmatrix/Kernel : Numpy matrix/int
  39. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. / Weisfeiler-Lehman Kernel between 2 graphs.
  40. Notes
  41. -----
  42. This function now supports WL subtree kernel and WL shortest path kernel.
  43. References
  44. ----------
  45. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.
  46. """
  47. if len(args) == 1: # for a list of graphs
  48. start_time = time.time()
  49. # for WL subtree kernel
  50. if base_kernel == 'subtree':
  51. Kmatrix = _wl_subtreekernel_do(args[0], node_label, edge_label, height = height, base_kernel = 'subtree')
  52. # for WL edge kernel
  53. elif base_kernel == 'edge':
  54. print('edge')
  55. # for WL shortest path kernel
  56. elif base_kernel == 'sp':
  57. Gn = args[0]
  58. Kmatrix = np.zeros((len(Gn), len(Gn)))
  59. for i in range(0, len(Gn)):
  60. for j in range(i, len(Gn)):
  61. Kmatrix[i][j] = _weisfeilerlehmankernel_do(Gn[i], Gn[j], height = height)
  62. Kmatrix[j][i] = Kmatrix[i][j]
  63. run_time = time.time() - start_time
  64. print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), run_time))
  65. return Kmatrix, run_time
  66. else: # for only 2 graphs
  67. start_time = time.time()
  68. # for WL subtree kernel
  69. if base_kernel == 'subtree':
  70. args = [args[0], args[1]]
  71. kernel = _wl_subtreekernel_do(args, node_label, edge_label, height = height, base_kernel = 'subtree')
  72. # for WL edge kernel
  73. elif base_kernel == 'edge':
  74. print('edge')
  75. # for WL shortest path kernel
  76. elif base_kernel == 'sp':
  77. kernel = _pathkernel_do(args[0], args[1])
  78. run_time = time.time() - start_time
  79. print("\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---" % (base_kernel, run_time))
  80. return kernel, run_time
  81. def _wl_subtreekernel_do(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'):
  82. """Calculate Weisfeiler-Lehman subtree kernels between graphs.
  83. Parameters
  84. ----------
  85. Gn : List of NetworkX graph
  86. List of graphs between which the kernels are calculated.
  87. node_label : string
  88. node attribute used as label. The default node label is atom.
  89. edge_label : string
  90. edge attribute used as label. The default edge label is bond_type.
  91. height : int
  92. subtree height
  93. base_kernel : string
  94. base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel.
  95. Return
  96. ------
  97. Kmatrix/Kernel : Numpy matrix/int
  98. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  99. """
  100. height = int(height)
  101. Gn = args[0]
  102. Kmatrix = np.zeros((len(Gn), len(Gn)))
  103. all_num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  104. # initial for height = 0
  105. all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
  106. all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
  107. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  108. num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs
  109. # for each graph
  110. for G in Gn:
  111. # get the set of original labels
  112. labels_ori = list(nx.get_node_attributes(G, node_label).values())
  113. all_labels_ori.update(labels_ori)
  114. num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph
  115. all_num_of_each_label.append(num_of_each_label)
  116. num_of_labels = len(num_of_each_label) # number of all unique labels
  117. all_labels_ori.update(labels_ori)
  118. all_num_of_labels_occured += len(all_labels_ori)
  119. # calculate subtree kernel with the 0th iteration and add it to the final kernel
  120. for i in range(0, len(Gn)):
  121. for j in range(i, len(Gn)):
  122. labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
  123. vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
  124. vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
  125. Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
  126. Kmatrix[j][i] = Kmatrix[i][j]
  127. # iterate each height
  128. for h in range(1, height + 1):
  129. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  130. num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs
  131. all_labels_ori = set()
  132. all_num_of_each_label = []
  133. # for each graph
  134. for idx, G in enumerate(Gn):
  135. set_multisets = []
  136. for node in G.nodes(data = True):
  137. # Multiset-label determination.
  138. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  139. # sorting each multiset
  140. multiset.sort()
  141. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  142. set_multisets.append(multiset)
  143. # label compression
  144. set_unique = list(set(set_multisets)) # set of unique multiset labels
  145. # a dictionary mapping original labels to new ones.
  146. set_compressed = {}
  147. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  148. for value in set_unique:
  149. if value in all_set_compressed.keys():
  150. set_compressed.update({ value : all_set_compressed[value] })
  151. else:
  152. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  153. num_of_labels_occured += 1
  154. all_set_compressed.update(set_compressed)
  155. # relabel nodes
  156. for node in G.nodes(data = True):
  157. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  158. # get the set of compressed labels
  159. labels_comp = list(nx.get_node_attributes(G, node_label).values())
  160. all_labels_ori.update(labels_comp)
  161. num_of_each_label = dict(Counter(labels_comp))
  162. all_num_of_each_label.append(num_of_each_label)
  163. all_num_of_labels_occured += len(all_labels_ori)
  164. # calculate subtree kernel with h iterations and add it to the final kernel
  165. for i in range(0, len(Gn)):
  166. for j in range(i, len(Gn)):
  167. labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
  168. vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
  169. vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
  170. Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
  171. Kmatrix[j][i] = Kmatrix[i][j]
  172. return Kmatrix
  173. def _weisfeilerlehmankernel_do(G1, G2, height = 0):
  174. """Calculate Weisfeiler-Lehman kernels between 2 graphs. This kernel use shortest path kernel to calculate kernel between two graphs in each iteration.
  175. Parameters
  176. ----------
  177. G1, G2 : NetworkX graphs
  178. 2 graphs between which the kernel is calculated.
  179. Return
  180. ------
  181. Kernel : int
  182. Weisfeiler-Lehman Kernel between 2 graphs.
  183. """
  184. # init.
  185. height = int(height)
  186. kernel = 0 # init kernel
  187. num_nodes1 = G1.number_of_nodes()
  188. num_nodes2 = G2.number_of_nodes()
  189. # the first iteration.
  190. # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
  191. # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }
  192. kernel += spkernel(G1, G2) # change your base kernel here (and one more below)
  193. for h in range(0, height + 1):
  194. # if labelset1 != labelset2:
  195. # break
  196. # Weisfeiler-Lehman test of graph isomorphism.
  197. relabel(G1)
  198. relabel(G2)
  199. # calculate kernel
  200. kernel += spkernel(G1, G2) # change your base kernel here (and one more before)
  201. # get label sets of both graphs
  202. # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
  203. # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }
  204. return kernel
  205. def relabel(G):
  206. '''
  207. Relabel nodes in graph G in one iteration of the 1-dim. WL test of graph isomorphism.
  208. Parameters
  209. ----------
  210. G : NetworkX graph
  211. The graphs whose nodes are relabeled.
  212. '''
  213. # get the set of original labels
  214. labels_ori = list(nx.get_node_attributes(G, 'label').values())
  215. num_of_each_label = dict(Counter(labels_ori))
  216. num_of_labels = len(num_of_each_label)
  217. set_multisets = []
  218. for node in G.nodes(data = True):
  219. # Multiset-label determination.
  220. multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]
  221. # sorting each multiset
  222. multiset.sort()
  223. multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix
  224. set_multisets.append(multiset)
  225. # label compression
  226. # set_multisets.sort() # this is unnecessary
  227. set_unique = list(set(set_multisets)) # set of unique multiset labels
  228. set_compressed = { value : str(set_unique.index(value) + num_of_labels + 1) for value in set_unique } # assign new labels
  229. # relabel nodes
  230. # nx.relabel_nodes(G, set_compressed, copy = False)
  231. for node in G.nodes(data = True):
  232. node[1]['label'] = set_compressed[set_multisets[node[0]]]
  233. # get the set of compressed labels
  234. labels_comp = list(nx.get_node_attributes(G, 'label').values())
  235. num_of_each_label.update(dict(Counter(labels_comp)))

A Python package for graph kernels, graph edit distances and graph pre-image problem.