
untilHPathKernel.py

  1. """
  2. @author: linlin
  3. @references: Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre
  4. Baldi. Graph kernels for chemical informatics. Neural networks,
  5. 18(8):1093–1110, 2005.
  6. """
  7. import sys
  8. sys.path.insert(0, "../")
  9. import time
  10. from collections import Counter
  11. from itertools import chain, combinations_with_replacement
  12. from functools import partial
  13. from multiprocessing import Pool
  14. from tqdm import tqdm
  15. import networkx as nx
  16. import numpy as np
  17. from suffix_tree import Tree, ukkonen
  18. from pygraph.utils.graphdataset import get_dataset_attributes


def untilhpathkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     depth=10,
                     k_func='tanimoto',
                     compute_method='naive',
                     n_jobs=None):
    """Calculate path graph kernels up to depth/height h between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    depth : integer
        Depth of the search, i.e. the longest length of paths considered.
    k_func : string
        Kernel function applied on the path fingerprints: 'tanimoto' selects
        the Tanimoto kernel; any other value selects the MinMax kernel.
    compute_method : string
        Computation method, 'suffix_tree' or 'naive'.
    n_jobs : integer
        Number of worker processes; defaults to the number of CPU cores.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to h
        between two graphs.
    """
    # pre-process
    depth = int(depth)
    if n_jobs is None:
        # n_jobs is needed below to compute chunk sizes, so resolve the
        # default explicitly instead of relying on Pool(None).
        n_jobs = cpu_count()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled']:
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    # Extract all paths of all graphs before calculating kernels to save
    # time, though this may cost a lot of memory for large datasets.
    pool = Pool(n_jobs)
    all_paths = [[] for _ in range(len(Gn))]
    getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                            ds_attrs, node_label, edge_label)
    itr = zip(Gn, range(0, len(Gn)))
    if len(Gn) < 1000 * n_jobs:
        chunksize = int(len(Gn) / n_jobs) + 1
    else:
        chunksize = 1000
    for i, ps in tqdm(
            pool.imap_unordered(getps_partial, itr, chunksize),
            desc='getting paths', file=sys.stdout):
        all_paths[i] = ps
    pool.close()
    pool.join()
    if compute_method == 'suffix_tree':
        # The parallel version of the suffix tree method is not implemented
        # yet; see the commented-out single-core code below.
        pass
    else:
        pool = Pool(n_jobs)
        do_partial = partial(wrapper_uhpath_do_naive, k_func)
        itr = zip(combinations_with_replacement(all_paths, 2),
                  combinations_with_replacement(range(0, len(Gn)), 2))
        len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
        if len_itr < 1000 * n_jobs:
            chunksize = int(len_itr / n_jobs) + 1
        else:
            chunksize = 1000
        for i, j, kernel in tqdm(
                pool.imap_unordered(do_partial, itr, chunksize),
                desc='calculating kernels', file=sys.stdout):
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel
        pool.close()
        pool.join()

#    # ---- direct running, normally using a single CPU core. ----
#    all_paths = [
#        find_all_paths_until_length(
#            Gn[i],
#            depth,
#            ds_attrs,
#            node_label=node_label,
#            edge_label=edge_label) for i in tqdm(
#                range(0, len(Gn)), desc='getting paths', file=sys.stdout)
#    ]
#
#    if compute_method == 'suffix_tree':
#        # build generalized suffix trees of the path sets of each graph.
#        all_gstree = [paths2GSuffixTree(all_paths[i]) for i in tqdm(
#            range(0, len(Gn)), desc='getting generalized suffix trees',
#            file=sys.stdout)]
#
#        pbar = tqdm(
#            total=((len(Gn) + 1) * len(Gn) / 2),
#            desc='calculating kernels',
#            file=sys.stdout)
#        for i in range(0, len(Gn)):
#            for j in range(i, len(Gn)):
#                Kmatrix[i][j] = _untilhpathkernel_do_gst(all_gstree[i],
#                    all_gstree[j], all_paths[i], all_paths[j], k_func)
#                Kmatrix[j][i] = Kmatrix[i][j]
#                pbar.update(1)
#    else:
#        pbar = tqdm(
#            total=((len(Gn) + 1) * len(Gn) / 2),
#            desc='calculating kernels',
#            file=sys.stdout)
#        for i in range(0, len(Gn)):
#            for j in range(i, len(Gn)):
#                Kmatrix[i][j] = _untilhpathkernel_do_naive(all_paths[i],
#                    all_paths[j], k_func)
#                Kmatrix[j][i] = Kmatrix[i][j]
#                pbar.update(1)

    run_time = time.time() - start_time
    print(
        "\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
        % (depth, len(Gn), run_time))

    return Kmatrix, run_time


def _untilhpathkernel_do_gst(gst1, gst2, paths1, paths2, k_func):
    """Calculate path graph kernels up to depth d between 2 graphs using
    generalized suffix trees.

    Parameters
    ----------
    paths1, paths2 : list
        Lists of paths in the 2 graphs, where for unlabeled graphs, each path
        is represented by a list of nodes; while for labeled graphs, each
        path is represented by a string consisting of the labels of the nodes
        and/or edges on that path.
    k_func : string
        Kernel function applied on the path fingerprints, Tanimoto or MinMax.

    Return
    ------
    kernel : float
        Path kernel up to h between the 2 graphs.
    """
    all_paths = list(set(paths1 + paths2))
    if k_func == 'tanimoto':
        length_union = len(all_paths)
        kernel = (len(set(paths1)) + len(set(paths2)) -
                  length_union) / length_union
#        vector1 = [(1 if path in paths1 else 0) for path in all_paths]
#        vector2 = [(1 if path in paths2 else 0) for path in all_paths]
#        kernel_uv = np.dot(vector1, vector2)
#        kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)
    else:  # MinMax kernel
        # Counter returns 0 for missing keys, so no explicit check is needed.
        path_count1 = Counter(paths1)
        path_count2 = Counter(paths2)
        vector1 = [path_count1[key] for key in all_paths]
        vector2 = [path_count2[key] for key in all_paths]
        kernel = np.sum(np.minimum(vector1, vector2)) / \
            np.sum(np.maximum(vector1, vector2))
    return kernel


def _untilhpathkernel_do_naive(paths1, paths2, k_func):
    """Calculate path graph kernels up to depth d between 2 graphs naively.

    Parameters
    ----------
    paths1, paths2 : list
        Lists of paths in the 2 graphs, where for unlabeled graphs, each path
        is represented by a list of nodes; while for labeled graphs, each
        path is represented by a string consisting of the labels of the nodes
        and/or edges on that path.
    k_func : string
        Kernel function applied on the path fingerprints, Tanimoto or MinMax.

    Return
    ------
    kernel : float
        Path kernel up to h between the 2 graphs.
    """
    all_paths = list(set(paths1 + paths2))
    if k_func == 'tanimoto':
        length_union = len(all_paths)
        kernel = (len(set(paths1)) + len(set(paths2)) -
                  length_union) / length_union
#        vector1 = [(1 if path in paths1 else 0) for path in all_paths]
#        vector2 = [(1 if path in paths2 else 0) for path in all_paths]
#        kernel_uv = np.dot(vector1, vector2)
#        kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)
    else:  # MinMax kernel
        # Counter returns 0 for missing keys, so no explicit check is needed.
        path_count1 = Counter(paths1)
        path_count2 = Counter(paths2)
        vector1 = [path_count1[key] for key in all_paths]
        vector2 = [path_count2[key] for key in all_paths]
        kernel = np.sum(np.minimum(vector1, vector2)) / \
            np.sum(np.maximum(vector1, vector2))
    return kernel
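

# A worked toy example of the two fingerprint similarities computed above
# (illustration only; the path tuples are made up). Take
# paths1 = [('C',), ('C', 'O'), ('C', 'O')] and paths2 = [('C',), ('O',)]:
#   - Tanimoto uses unique paths: set(paths1) = {('C',), ('C', 'O')} and
#     set(paths2) = {('C',), ('O',)}, so the union has 3 elements, and
#     (2 + 2 - 3) / 3 = 1/3, i.e. |intersection| / |union|.
#   - MinMax keeps multiplicities: over the union
#     [('C',), ('C', 'O'), ('O',)] the count vectors are [1, 2, 0] and
#     [1, 0, 1], so the kernel is (1 + 0 + 0) / (1 + 2 + 1) = 1/4.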


def wrapper_uhpath_do_naive(k_func, itr_item):
    plist1 = itr_item[0][0]
    plist2 = itr_item[0][1]
    i = itr_item[1][0]
    j = itr_item[1][1]
    return i, j, _untilhpathkernel_do_naive(plist1, plist2, k_func)


# @todo: (can maybe be removed) this method finds paths repetitively; it
# could be made faster.
def find_all_paths_until_length(G,
                                length,
                                ds_attrs,
                                node_label='atom',
                                edge_label='bond_type'):
    """Find all paths no longer than a certain maximum length in a graph.
    Paths are enumerated by iteratively extending shorter paths.

    Parameters
    ----------
    G : NetworkX graph
        The graph in which paths are searched.
    length : integer
        The maximum length of paths.
    ds_attrs : dict
        Dataset attributes.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.

    Return
    ------
    path : list
        List of paths retrieved, where for unlabeled graphs, each path is
        represented by a list of nodes; while for labeled graphs, each path
        is represented by a tuple of strings consisting of the labels of the
        nodes and/or edges on that path.
    """
#    path_l = [tuple([n]) for n in G.nodes]  # paths of length l
#    all_paths = path_l[:]
#    for l in range(1, length + 1):
#        path_l_new = []
#        for path in path_l:
#            for neighbor in G[path[-1]]:
#                if len(path) < 2 or neighbor != path[-2]:
#                    tmp = path + (neighbor, )
#                    if tuple(tmp[::-1]) not in path_l_new:
#                        path_l_new.append(tuple(tmp))
#        all_paths += path_l_new
#        path_l = path_l_new[:]

    path_l = [[n] for n in G.nodes]  # paths of length l
    all_paths = path_l[:]
    for l in range(1, length + 1):
        path_lplus1 = []
        for path in path_l:
            for neighbor in G[path[-1]]:
                if neighbor not in path:
                    tmp = path + [neighbor]
#                    if tmp[::-1] not in path_lplus1:
                    path_lplus1.append(tmp)
        all_paths += path_lplus1
        path_l = path_lplus1[:]

#    for i in range(0, length + 1):
#        new_paths = find_all_paths(G, i)
#        if new_paths == []:
#            break
#        all_paths.extend(new_paths)

    # consider labels
    if ds_attrs['node_labeled']:
        if ds_attrs['edge_labeled']:
            path_strs = [
                tuple(
                    list(
                        chain.from_iterable(
                            (G.nodes[node][node_label],
                             G[node][path[idx + 1]][edge_label])
                            for idx, node in enumerate(path[:-1]))) +
                    [G.nodes[path[-1]][node_label]]) for path in all_paths
            ]
        else:
            path_strs = [
                tuple([G.nodes[node][node_label] for node in path])
                for path in all_paths
            ]
        return path_strs
    else:
        if ds_attrs['edge_labeled']:
            return [
                tuple([] if len(path) == 1 else [
                    G[node][path[idx + 1]][edge_label]
                    for idx, node in enumerate(path[:-1])
                ]) for path in all_paths
            ]
        else:
            return [tuple([len(path)]) for path in all_paths]
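

# For illustration (a hypothetical toy input): on the labeled path graph
# 0-1-2 with node labels 'C', 'O', 'C' and every edge label '1',
# find_all_paths_until_length(G, 1, ds_attrs) returns one tuple per path of
# length <= 1: ('C',), ('O',), ('C',), ('C', '1', 'O'), ('O', '1', 'C'),
# ('O', '1', 'C'), ('C', '1', 'O'). Since no deduplication is performed,
# each 2-node path appears twice, once traversed from each endpoint.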


def wrapper_find_all_paths_until_length(length, ds_attrs, node_label,
                                        edge_label, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, find_all_paths_until_length(g, length, ds_attrs,
                                          node_label=node_label,
                                          edge_label=edge_label)


def paths2GSuffixTree(paths):
    return Tree(paths, builder=ukkonen.Builder)


# def find_paths(G, source_node, length):
#     """Find all paths no longer than a certain length that start from a
#     source node. A recursive depth-first search is applied.
#
#     Parameters
#     ----------
#     G : NetworkX graph
#         The graph in which paths are searched.
#     source_node : integer
#         The number of the node from which all paths start.
#     length : integer
#         The length of paths.
#
#     Return
#     ------
#     path : list of list
#         List of paths retrieved, where each path is represented by a list
#         of nodes.
#     """
#     return [[source_node]] if length == 0 else \
#         [[source_node] + path for neighbor in G[source_node]
#          for path in find_paths(G, neighbor, length - 1)
#          if source_node not in path]


# def find_all_paths(G, length):
#     """Find all paths of a certain length in a graph. A recursive
#     depth-first search is applied.
#
#     Parameters
#     ----------
#     G : NetworkX graph
#         The graph in which paths are searched.
#     length : integer
#         The length of paths.
#
#     Return
#     ------
#     path : list of list
#         List of paths retrieved, where each path is represented by a list
#         of nodes.
#     """
#     all_paths = []
#     for node in G:
#         all_paths.extend(find_paths(G, node, length))
#
#     # The following step is not carried out according to the original
#     # article: each path is retrieved twice, once from each of its two
#     # extremities, and one of the two representations could be removed.
#     # all_paths_r = [path[::-1] for path in all_paths]
#     # for idx, path in enumerate(all_paths[:-1]):
#     #     for path2 in all_paths_r[idx + 1::]:
#     #         if path == path2:
#     #             all_paths[idx] = []
#     #             break
#     # return list(filter(lambda a: a != [], all_paths))
#     return all_paths
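

# A minimal usage sketch (an illustrative addition, assuming the pygraph
# utilities imported above are importable): build two tiny node- and
# edge-labeled graphs and compute their kernel matrix with the naive method.
if __name__ == '__main__':
    G1 = nx.Graph()
    G1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    G1.add_edge(0, 1, bond_type='1')
    G2 = nx.Graph()
    G2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
    G2.add_edge(0, 1, bond_type='1')
    Kmatrix, run_time = untilhpathkernel(
        [G1, G2], node_label='atom', edge_label='bond_type', depth=2,
        k_func='tanimoto', compute_method='naive', n_jobs=2)
    print(Kmatrix)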

A Python package for graph kernels, graph edit distances and graph pre-image problem.