You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

weisfeiler_lehman.py 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Tue Apr 14 15:16:34 2020
  5. @author: ljia
  6. @references:
  7. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
  8. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research.
  9. 2011;12(Sep):2539-61.
  10. """
import warnings
from collections import Counter
from functools import partial

import networkx as nx
import numpy as np

from gklearn.kernels import GraphKernel
from gklearn.utils.parallel import parallel_gm
  17. class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge user kernel.
  18. def __init__(self, **kwargs):
  19. GraphKernel.__init__(self)
  20. self.__node_labels = kwargs.get('node_labels', [])
  21. self.__edge_labels = kwargs.get('edge_labels', [])
  22. self.__height = int(kwargs.get('height', 0))
  23. self.__base_kernel = kwargs.get('base_kernel', 'subtree')
  24. self.__ds_infos = kwargs.get('ds_infos', {})
  25. def _compute_gm_series(self):
  26. self.__add_dummy_node_labels(self._graphs)
  27. # for WL subtree kernel
  28. if self.__base_kernel == 'subtree':
  29. gram_matrix = self.__subtree_kernel_do(self._graphs)
  30. # for WL shortest path kernel
  31. elif self.__base_kernel == 'sp':
  32. gram_matrix = self.__sp_kernel_do(self._graphs)
  33. # for WL edge kernel
  34. elif self.__base_kernel == 'edge':
  35. gram_matrix = self.__edge_kernel_do(self._graphs)
  36. # for user defined base kernel
  37. else:
  38. gram_matrix = self.__user_kernel_do(self._graphs)
  39. return gram_matrix
  40. def _compute_gm_imap_unordered(self):
  41. if self._verbose >= 2:
  42. raise Warning('Only a part of the computation is parallelized due to the structure of this kernel.')
  43. return self._compute_gm_series()
  44. def _compute_kernel_list_series(self, g1, g_list): # @todo: this should be better.
  45. self.__add_dummy_node_labels(g_list + [g1])
  46. # for WL subtree kernel
  47. if self.__base_kernel == 'subtree':
  48. gram_matrix = self.__subtree_kernel_do(g_list + [g1])
  49. # for WL shortest path kernel
  50. elif self.__base_kernel == 'sp':
  51. gram_matrix = self.__sp_kernel_do(g_list + [g1])
  52. # for WL edge kernel
  53. elif self.__base_kernel == 'edge':
  54. gram_matrix = self.__edge_kernel_do(g_list + [g1])
  55. # for user defined base kernel
  56. else:
  57. gram_matrix = self.__user_kernel_do(g_list + [g1])
  58. return list(gram_matrix[-1][0:-1])
  59. def _compute_kernel_list_imap_unordered(self, g1, g_list):
  60. if self._verbose >= 2:
  61. raise Warning('Only a part of the computation is parallelized due to the structure of this kernel.')
  62. return self._compute_gm_imap_unordered()
  63. def _wrapper_kernel_list_do(self, itr):
  64. pass
  65. def _compute_single_kernel_series(self, g1, g2): # @todo: this should be better.
  66. self.__add_dummy_node_labels([g1] + [g2])
  67. # for WL subtree kernel
  68. if self.__base_kernel == 'subtree':
  69. gram_matrix = self.__subtree_kernel_do([g1] + [g2])
  70. # for WL shortest path kernel
  71. elif self.__base_kernel == 'sp':
  72. gram_matrix = self.__sp_kernel_do([g1] + [g2])
  73. # for WL edge kernel
  74. elif self.__base_kernel == 'edge':
  75. gram_matrix = self.__edge_kernel_do([g1] + [g2])
  76. # for user defined base kernel
  77. else:
  78. gram_matrix = self.__user_kernel_do([g1] + [g2])
  79. return gram_matrix[0][1]
  80. def __subtree_kernel_do(self, Gn):
  81. """Calculate Weisfeiler-Lehman kernels between graphs.
  82. Parameters
  83. ----------
  84. Gn : List of NetworkX graph
  85. List of graphs between which the kernels are calculated.
  86. Return
  87. ------
  88. gram_matrix : Numpy matrix
  89. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  90. """
  91. gram_matrix = np.zeros((len(Gn), len(Gn)))
  92. # initial for height = 0
  93. all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
  94. # for each graph
  95. for G in Gn:
  96. # set all labels into a tuple.
  97. for nd, attrs in G.nodes(data=True): # @todo: there may be a better way.
  98. G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self.__node_labels)
  99. # get the set of original labels
  100. labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values())
  101. # number of occurence of each label in G
  102. all_num_of_each_label.append(dict(Counter(labels_ori)))
  103. # calculate subtree kernel with the 0th iteration and add it to the final kernel.
  104. self.__compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn)
  105. # iterate each height
  106. for h in range(1, self.__height + 1):
  107. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  108. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  109. # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
  110. all_num_of_each_label = [] # number of occurence of each label in G
  111. # @todo: parallel this part.
  112. for idx, G in enumerate(Gn):
  113. all_multisets = []
  114. for node, attrs in G.nodes(data=True):
  115. # Multiset-label determination.
  116. multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]]
  117. # sorting each multiset
  118. multiset.sort()
  119. multiset = [attrs['label_tuple']] + multiset # add the prefix
  120. all_multisets.append(tuple(multiset))
  121. # label compression
  122. set_unique = list(set(all_multisets)) # set of unique multiset labels
  123. # a dictionary mapping original labels to new ones.
  124. set_compressed = {}
  125. # if a label occured before, assign its former compressed label,
  126. # else assign the number of labels occured + 1 as the compressed label.
  127. for value in set_unique:
  128. if value in all_set_compressed.keys():
  129. set_compressed.update({value: all_set_compressed[value]})
  130. else:
  131. set_compressed.update({value: str(num_of_labels_occured + 1)})
  132. num_of_labels_occured += 1
  133. all_set_compressed.update(set_compressed)
  134. # relabel nodes
  135. for idx, node in enumerate(G.nodes()):
  136. G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]]
  137. # get the set of compressed labels
  138. labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values())
  139. # all_labels_ori.update(labels_comp)
  140. all_num_of_each_label.append(dict(Counter(labels_comp)))
  141. # calculate subtree kernel with h iterations and add it to the final kernel
  142. self.__compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn)
  143. return gram_matrix
  144. def __compute_gram_matrix(self, gram_matrix, all_num_of_each_label, Gn):
  145. """Compute Gram matrix using the base kernel.
  146. """
  147. if self._parallel == 'imap_unordered':
  148. # compute kernels.
  149. def init_worker(alllabels_toshare):
  150. global G_alllabels
  151. G_alllabels = alllabels_toshare
  152. do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix)
  153. parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker,
  154. glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose)
  155. elif self._parallel is None:
  156. for i in range(len(gram_matrix)):
  157. for j in range(i, len(gram_matrix)):
  158. gram_matrix[i][j] = self.__compute_subtree_kernel(all_num_of_each_label[i],
  159. all_num_of_each_label[j], gram_matrix[i][j])
  160. gram_matrix[j][i] = gram_matrix[i][j]
  161. def __compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel):
  162. """Compute the subtree kernel.
  163. """
  164. labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
  165. vector1 = np.array([(num_of_each_label1[label]
  166. if (label in num_of_each_label1.keys()) else 0)
  167. for label in labels])
  168. vector2 = np.array([(num_of_each_label2[label]
  169. if (label in num_of_each_label2.keys()) else 0)
  170. for label in labels])
  171. kernel += np.dot(vector1, vector2)
  172. return kernel
  173. def _wrapper_compute_subtree_kernel(self, gram_matrix, itr):
  174. i = itr[0]
  175. j = itr[1]
  176. return i, j, self.__compute_subtree_kernel(G_alllabels[i], G_alllabels[j], gram_matrix[i][j])
  177. def _wl_spkernel_do(Gn, node_label, edge_label, height):
  178. """Calculate Weisfeiler-Lehman shortest path kernels between graphs.
  179. Parameters
  180. ----------
  181. Gn : List of NetworkX graph
  182. List of graphs between which the kernels are calculated.
  183. node_label : string
  184. node attribute used as label.
  185. edge_label : string
  186. edge attribute used as label.
  187. height : int
  188. subtree height.
  189. Return
  190. ------
  191. gram_matrix : Numpy matrix
  192. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  193. """
  194. pass
  195. from gklearn.utils.utils import getSPGraph
  196. # init.
  197. height = int(height)
  198. gram_matrix = np.zeros((len(Gn), len(Gn))) # init kernel
  199. Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn
  200. # initial for height = 0
  201. for i in range(0, len(Gn)):
  202. for j in range(i, len(Gn)):
  203. for e1 in Gn[i].edges(data = True):
  204. for e2 in Gn[j].edges(data = True):
  205. if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  206. gram_matrix[i][j] += 1
  207. gram_matrix[j][i] = gram_matrix[i][j]
  208. # iterate each height
  209. for h in range(1, height + 1):
  210. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  211. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  212. for G in Gn: # for each graph
  213. set_multisets = []
  214. for node in G.nodes(data = True):
  215. # Multiset-label determination.
  216. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  217. # sorting each multiset
  218. multiset.sort()
  219. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  220. set_multisets.append(multiset)
  221. # label compression
  222. set_unique = list(set(set_multisets)) # set of unique multiset labels
  223. # a dictionary mapping original labels to new ones.
  224. set_compressed = {}
  225. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  226. for value in set_unique:
  227. if value in all_set_compressed.keys():
  228. set_compressed.update({ value : all_set_compressed[value] })
  229. else:
  230. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  231. num_of_labels_occured += 1
  232. all_set_compressed.update(set_compressed)
  233. # relabel nodes
  234. for node in G.nodes(data = True):
  235. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  236. # calculate subtree kernel with h iterations and add it to the final kernel
  237. for i in range(0, len(Gn)):
  238. for j in range(i, len(Gn)):
  239. for e1 in Gn[i].edges(data = True):
  240. for e2 in Gn[j].edges(data = True):
  241. if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  242. gram_matrix[i][j] += 1
  243. gram_matrix[j][i] = gram_matrix[i][j]
  244. return gram_matrix
  245. def _wl_edgekernel_do(Gn, node_label, edge_label, height):
  246. """Calculate Weisfeiler-Lehman edge kernels between graphs.
  247. Parameters
  248. ----------
  249. Gn : List of NetworkX graph
  250. List of graphs between which the kernels are calculated.
  251. node_label : string
  252. node attribute used as label.
  253. edge_label : string
  254. edge attribute used as label.
  255. height : int
  256. subtree height.
  257. Return
  258. ------
  259. gram_matrix : Numpy matrix
  260. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  261. """
  262. pass
  263. # init.
  264. height = int(height)
  265. gram_matrix = np.zeros((len(Gn), len(Gn))) # init kernel
  266. # initial for height = 0
  267. for i in range(0, len(Gn)):
  268. for j in range(i, len(Gn)):
  269. for e1 in Gn[i].edges(data = True):
  270. for e2 in Gn[j].edges(data = True):
  271. if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  272. gram_matrix[i][j] += 1
  273. gram_matrix[j][i] = gram_matrix[i][j]
  274. # iterate each height
  275. for h in range(1, height + 1):
  276. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  277. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  278. for G in Gn: # for each graph
  279. set_multisets = []
  280. for node in G.nodes(data = True):
  281. # Multiset-label determination.
  282. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  283. # sorting each multiset
  284. multiset.sort()
  285. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  286. set_multisets.append(multiset)
  287. # label compression
  288. set_unique = list(set(set_multisets)) # set of unique multiset labels
  289. # a dictionary mapping original labels to new ones.
  290. set_compressed = {}
  291. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  292. for value in set_unique:
  293. if value in all_set_compressed.keys():
  294. set_compressed.update({ value : all_set_compressed[value] })
  295. else:
  296. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  297. num_of_labels_occured += 1
  298. all_set_compressed.update(set_compressed)
  299. # relabel nodes
  300. for node in G.nodes(data = True):
  301. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  302. # calculate subtree kernel with h iterations and add it to the final kernel
  303. for i in range(0, len(Gn)):
  304. for j in range(i, len(Gn)):
  305. for e1 in Gn[i].edges(data = True):
  306. for e2 in Gn[j].edges(data = True):
  307. if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  308. gram_matrix[i][j] += 1
  309. gram_matrix[j][i] = gram_matrix[i][j]
  310. return gram_matrix
  311. def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
  312. """Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs.
  313. Parameters
  314. ----------
  315. Gn : List of NetworkX graph
  316. List of graphs between which the kernels are calculated.
  317. node_label : string
  318. node attribute used as label.
  319. edge_label : string
  320. edge attribute used as label.
  321. height : int
  322. subtree height.
  323. base_kernel : string
  324. Name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs.
  325. Return
  326. ------
  327. gram_matrix : Numpy matrix
  328. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  329. """
  330. pass
  331. # init.
  332. height = int(height)
  333. gram_matrix = np.zeros((len(Gn), len(Gn))) # init kernel
  334. # initial for height = 0
  335. gram_matrix = base_kernel(Gn, node_label, edge_label)
  336. # iterate each height
  337. for h in range(1, height + 1):
  338. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  339. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  340. for G in Gn: # for each graph
  341. set_multisets = []
  342. for node in G.nodes(data = True):
  343. # Multiset-label determination.
  344. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  345. # sorting each multiset
  346. multiset.sort()
  347. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  348. set_multisets.append(multiset)
  349. # label compression
  350. set_unique = list(set(set_multisets)) # set of unique multiset labels
  351. # a dictionary mapping original labels to new ones.
  352. set_compressed = {}
  353. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  354. for value in set_unique:
  355. if value in all_set_compressed.keys():
  356. set_compressed.update({ value : all_set_compressed[value] })
  357. else:
  358. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  359. num_of_labels_occured += 1
  360. all_set_compressed.update(set_compressed)
  361. # relabel nodes
  362. for node in G.nodes(data = True):
  363. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  364. # calculate kernel with h iterations and add it to the final kernel
  365. gram_matrix += base_kernel(Gn, node_label, edge_label)
  366. return gram_matrix
  367. def __add_dummy_node_labels(self, Gn):
  368. if len(self.__node_labels) == 0:
  369. for G in Gn:
  370. nx.set_node_attributes(G, '0', 'dummy')
  371. self.__node_labels.append('dummy')

A Python package for graph kernels, graph edit distances and graph pre-image problem.