| @@ -0,0 +1,450 @@ | |||||
| """ | |||||
| @author: linlin | |||||
| @references: | |||||
| [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: | |||||
| Hardness results and efficient alternatives. Learning Theory and Kernel | |||||
| Machines, pages 129–143, 2003. | |||||
| """ | |||||
| import sys | |||||
| import time | |||||
| from collections import Counter | |||||
| from functools import partial | |||||
| import networkx as nx | |||||
| import numpy as np | |||||
| from gklearn.utils.utils import direct_product | |||||
| from gklearn.utils.graphdataset import get_dataset_attributes | |||||
| from gklearn.utils.parallel import parallel_gm | |||||
| def commonwalkkernel(*args, | |||||
| node_label='atom', | |||||
| edge_label='bond_type', | |||||
| # n=None, | |||||
| weight=1, | |||||
| compute_method=None, | |||||
| n_jobs=None, | |||||
| chunksize=None, | |||||
| verbose=True): | |||||
| """Calculate common walk graph kernels between graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| Gn : List of NetworkX graph | |||||
| List of graphs between which the kernels are calculated. | |||||
| G1, G2 : NetworkX graphs | |||||
| Two graphs between which the kernel is calculated. | |||||
| node_label : string | |||||
| Node attribute used as symbolic label. The default node label is 'atom'. | |||||
| edge_label : string | |||||
| Edge attribute used as symbolic label. The default edge label is 'bond_type'. | |||||
| weight: integer | |||||
| Weight coefficient of different lengths of walks, which represents beta | |||||
| in 'exp' method and gamma in 'geo'. | |||||
| compute_method : string | |||||
| Method used to compute walk kernel. The Following choices are | |||||
| available: | |||||
| 'exp': method based on exponential serials applied on the direct | |||||
| product graph, as shown in reference [1]. The time complexity is O(n^6) | |||||
| for graphs with n vertices. | |||||
| 'geo': method based on geometric serials applied on the direct product | |||||
| graph, as shown in reference [1]. The time complexity is O(n^6) for | |||||
| graphs with n vertices. | |||||
| n_jobs : int | |||||
| Number of jobs for parallelization. | |||||
| Return | |||||
| ------ | |||||
| Kmatrix : Numpy matrix | |||||
| Kernel matrix, each element of which is a common walk kernel between 2 | |||||
| graphs. | |||||
| """ | |||||
| # n : integer | |||||
| # Longest length of walks. Only useful when applying the 'brute' method. | |||||
| # 'brute': brute force, simply search for all walks and compare them. | |||||
| compute_method = compute_method.lower() | |||||
| # arrange all graphs in a list | |||||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||||
| # remove graphs with only 1 node, as they do not have adjacency matrices | |||||
| len_gn = len(Gn) | |||||
| Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1] | |||||
| idx = [G[0] for G in Gn] | |||||
| Gn = [G[1] for G in Gn] | |||||
| if len(Gn) != len_gn: | |||||
| if verbose: | |||||
| print('\n %d graphs are removed as they have only 1 node.\n' % | |||||
| (len_gn - len(Gn))) | |||||
| ds_attrs = get_dataset_attributes( | |||||
| Gn, | |||||
| attr_names=['node_labeled', 'edge_labeled', 'is_directed'], | |||||
| node_label=node_label, edge_label=edge_label) | |||||
| if not ds_attrs['node_labeled']: | |||||
| for G in Gn: | |||||
| nx.set_node_attributes(G, '0', 'atom') | |||||
| if not ds_attrs['edge_labeled']: | |||||
| for G in Gn: | |||||
| nx.set_edge_attributes(G, '0', 'bond_type') | |||||
| if not ds_attrs['is_directed']: # convert | |||||
| Gn = [G.to_directed() for G in Gn] | |||||
| start_time = time.time() | |||||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
| # ---- use pool.imap_unordered to parallel and track progress. ---- | |||||
| def init_worker(gn_toshare): | |||||
| global G_gn | |||||
| G_gn = gn_toshare | |||||
| # direct product graph method - exponential | |||||
| if compute_method == 'exp': | |||||
| do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) | |||||
| # direct product graph method - geometric | |||||
| elif compute_method == 'geo': | |||||
| do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) | |||||
| parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
| glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) | |||||
| # pool = Pool(n_jobs) | |||||
| # itr = zip(combinations_with_replacement(Gn, 2), | |||||
| # combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
| # len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||||
| # if len_itr < 1000 * n_jobs: | |||||
| # chunksize = int(len_itr / n_jobs) + 1 | |||||
| # else: | |||||
| # chunksize = 1000 | |||||
| # | |||||
| # # direct product graph method - exponential | |||||
| # if compute_method == 'exp': | |||||
| # do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) | |||||
| # # direct product graph method - geometric | |||||
| # elif compute_method == 'geo': | |||||
| # do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) | |||||
| # | |||||
| # for i, j, kernel in tqdm( | |||||
| # pool.imap_unordered(do_partial, itr, chunksize), | |||||
| # desc='calculating kernels', | |||||
| # file=sys.stdout): | |||||
| # Kmatrix[i][j] = kernel | |||||
| # Kmatrix[j][i] = kernel | |||||
| # pool.close() | |||||
| # pool.join() | |||||
| # # ---- direct running, normally use single CPU core. ---- | |||||
| # # direct product graph method - exponential | |||||
| # itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| # if compute_method == 'exp': | |||||
| # for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): | |||||
| # Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label, | |||||
| # edge_label, weight) | |||||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||||
| # | |||||
| # # direct product graph method - geometric | |||||
| # elif compute_method == 'geo': | |||||
| # for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): | |||||
| # Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label, | |||||
| # edge_label, weight) | |||||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||||
| # # search all paths use brute force. | |||||
| # elif compute_method == 'brute': | |||||
| # n = int(n) | |||||
| # # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset. | |||||
| # all_walks = [ | |||||
| # find_all_walks_until_length(Gn[i], n, node_label, edge_label) | |||||
| # for i in range(0, len(Gn)) | |||||
| # ] | |||||
| # | |||||
| # for i in range(0, len(Gn)): | |||||
| # for j in range(i, len(Gn)): | |||||
| # Kmatrix[i][j] = _commonwalkkernel_brute( | |||||
| # all_walks[i], | |||||
| # all_walks[j], | |||||
| # node_label=node_label, | |||||
| # edge_label=edge_label) | |||||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||||
| run_time = time.time() - start_time | |||||
| if verbose: | |||||
| print("\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---" | |||||
| % (len(Gn), run_time)) | |||||
| return Kmatrix, run_time, idx | |||||
| def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta): | |||||
| """Calculate walk graph kernels up to n between 2 graphs using exponential | |||||
| series. | |||||
| Parameters | |||||
| ---------- | |||||
| Gn : List of NetworkX graph | |||||
| List of graphs between which the kernels are calculated. | |||||
| node_label : string | |||||
| Node attribute used as label. | |||||
| edge_label : string | |||||
| Edge attribute used as label. | |||||
| beta : integer | |||||
| Weight. | |||||
| ij : tuple of integer | |||||
| Index of graphs between which the kernel is computed. | |||||
| Return | |||||
| ------ | |||||
| kernel : float | |||||
| The common walk Kernel between 2 graphs. | |||||
| """ | |||||
| # get tensor product / direct product | |||||
| gp = direct_product(g1, g2, node_label, edge_label) | |||||
| # return 0 if the direct product graph have no more than 1 node. | |||||
| if nx.number_of_nodes(gp) < 2: | |||||
| return 0 | |||||
| A = nx.adjacency_matrix(gp).todense() | |||||
| # print(A) | |||||
| # from matplotlib import pyplot as plt | |||||
| # nx.draw_networkx(G1) | |||||
| # plt.show() | |||||
| # nx.draw_networkx(G2) | |||||
| # plt.show() | |||||
| # nx.draw_networkx(gp) | |||||
| # plt.show() | |||||
| # print(G1.nodes(data=True)) | |||||
| # print(G2.nodes(data=True)) | |||||
| # print(gp.nodes(data=True)) | |||||
| # print(gp.edges(data=True)) | |||||
| ew, ev = np.linalg.eig(A) | |||||
| # print('ew: ', ew) | |||||
| # print(ev) | |||||
| # T = np.matrix(ev) | |||||
| # print('T: ', T) | |||||
| # T = ev.I | |||||
| D = np.zeros((len(ew), len(ew))) | |||||
| for i in range(len(ew)): | |||||
| D[i][i] = np.exp(beta * ew[i]) | |||||
| # print('D: ', D) | |||||
| # print('hshs: ', T.I * D * T) | |||||
| # print(np.exp(-2)) | |||||
| # print(D) | |||||
| # print(np.exp(weight * D)) | |||||
| # print(ev) | |||||
| # print(np.linalg.inv(ev)) | |||||
| exp_D = ev * D * ev.T | |||||
| # print(exp_D) | |||||
| # print(np.exp(weight * A)) | |||||
| # print('-------') | |||||
| return exp_D.sum() | |||||
| def wrapper_cw_exp(node_label, edge_label, beta, itr): | |||||
| i = itr[0] | |||||
| j = itr[1] | |||||
| return i, j, _commonwalkkernel_exp(G_gn[i], G_gn[j], node_label, edge_label, beta) | |||||
| def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma): | |||||
| """Calculate common walk graph kernels up to n between 2 graphs using | |||||
| geometric series. | |||||
| Parameters | |||||
| ---------- | |||||
| Gn : List of NetworkX graph | |||||
| List of graphs between which the kernels are calculated. | |||||
| node_label : string | |||||
| Node attribute used as label. | |||||
| edge_label : string | |||||
| Edge attribute used as label. | |||||
| gamma: integer | |||||
| Weight. | |||||
| ij : tuple of integer | |||||
| Index of graphs between which the kernel is computed. | |||||
| Return | |||||
| ------ | |||||
| kernel : float | |||||
| The common walk Kernel between 2 graphs. | |||||
| """ | |||||
| # get tensor product / direct product | |||||
| gp = direct_product(g1, g2, node_label, edge_label) | |||||
| # return 0 if the direct product graph have no more than 1 node. | |||||
| if nx.number_of_nodes(gp) < 2: | |||||
| return 0 | |||||
| A = nx.adjacency_matrix(gp).todense() | |||||
| mat = np.identity(len(A)) - gamma * A | |||||
| # try: | |||||
| return mat.I.sum() | |||||
| # except np.linalg.LinAlgError: | |||||
| # return np.nan | |||||
| def wrapper_cw_geo(node_label, edge_label, gama, itr): | |||||
| i = itr[0] | |||||
| j = itr[1] | |||||
| return i, j, _commonwalkkernel_geo(G_gn[i], G_gn[j], node_label, edge_label, gama) | |||||
| def _commonwalkkernel_brute(walks1, | |||||
| walks2, | |||||
| node_label='atom', | |||||
| edge_label='bond_type', | |||||
| labeled=True): | |||||
| """Calculate walk graph kernels up to n between 2 graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| walks1, walks2 : list | |||||
| List of walks in 2 graphs, where for unlabeled graphs, each walk is | |||||
| represented by a list of nodes; while for labeled graphs, each walk is | |||||
| represented by a string consists of labels of nodes and edges on that | |||||
| walk. | |||||
| node_label : string | |||||
| node attribute used as label. The default node label is atom. | |||||
| edge_label : string | |||||
| edge attribute used as label. The default edge label is bond_type. | |||||
| labeled : boolean | |||||
| Whether the graphs are labeled. The default is True. | |||||
| Return | |||||
| ------ | |||||
| kernel : float | |||||
| Treelet Kernel between 2 graphs. | |||||
| """ | |||||
| counts_walks1 = dict(Counter(walks1)) | |||||
| counts_walks2 = dict(Counter(walks2)) | |||||
| all_walks = list(set(walks1 + walks2)) | |||||
| vector1 = [(counts_walks1[walk] if walk in walks1 else 0) | |||||
| for walk in all_walks] | |||||
| vector2 = [(counts_walks2[walk] if walk in walks2 else 0) | |||||
| for walk in all_walks] | |||||
| kernel = np.dot(vector1, vector2) | |||||
| return kernel | |||||
| # this method find walks repetively, it could be faster. | |||||
| def find_all_walks_until_length(G, | |||||
| length, | |||||
| node_label='atom', | |||||
| edge_label='bond_type', | |||||
| labeled=True): | |||||
| """Find all walks with a certain maximum length in a graph. | |||||
| A recursive depth first search is applied. | |||||
| Parameters | |||||
| ---------- | |||||
| G : NetworkX graphs | |||||
| The graph in which walks are searched. | |||||
| length : integer | |||||
| The maximum length of walks. | |||||
| node_label : string | |||||
| node attribute used as label. The default node label is atom. | |||||
| edge_label : string | |||||
| edge attribute used as label. The default edge label is bond_type. | |||||
| labeled : boolean | |||||
| Whether the graphs are labeled. The default is True. | |||||
| Return | |||||
| ------ | |||||
| walk : list | |||||
| List of walks retrieved, where for unlabeled graphs, each walk is | |||||
| represented by a list of nodes; while for labeled graphs, each walk | |||||
| is represented by a string consists of labels of nodes and edges on | |||||
| that walk. | |||||
| """ | |||||
| all_walks = [] | |||||
| # @todo: in this way, the time complexity is close to N(d^n+d^(n+1)+...+1), which could be optimized to O(Nd^n) | |||||
| for i in range(0, length + 1): | |||||
| new_walks = find_all_walks(G, i) | |||||
| if new_walks == []: | |||||
| break | |||||
| all_walks.extend(new_walks) | |||||
| if labeled == True: # convert paths to strings | |||||
| walk_strs = [] | |||||
| for walk in all_walks: | |||||
| strlist = [ | |||||
| G.node[node][node_label] + | |||||
| G[node][walk[walk.index(node) + 1]][edge_label] | |||||
| for node in walk[:-1] | |||||
| ] | |||||
| walk_strs.append(''.join(strlist) + G.node[walk[-1]][node_label]) | |||||
| return walk_strs | |||||
| return all_walks | |||||
| def find_walks(G, source_node, length): | |||||
| """Find all walks with a certain length those start from a source node. A | |||||
| recursive depth first search is applied. | |||||
| Parameters | |||||
| ---------- | |||||
| G : NetworkX graphs | |||||
| The graph in which walks are searched. | |||||
| source_node : integer | |||||
| The number of the node from where all walks start. | |||||
| length : integer | |||||
| The length of walks. | |||||
| Return | |||||
| ------ | |||||
| walk : list of list | |||||
| List of walks retrieved, where each walk is represented by a list of | |||||
| nodes. | |||||
| """ | |||||
| return [[source_node]] if length == 0 else \ | |||||
| [[source_node] + walk for neighbor in G[source_node] | |||||
| for walk in find_walks(G, neighbor, length - 1)] | |||||
| def find_all_walks(G, length): | |||||
| """Find all walks with a certain length in a graph. A recursive depth first | |||||
| search is applied. | |||||
| Parameters | |||||
| ---------- | |||||
| G : NetworkX graphs | |||||
| The graph in which walks are searched. | |||||
| length : integer | |||||
| The length of walks. | |||||
| Return | |||||
| ------ | |||||
| walk : list of list | |||||
| List of walks retrieved, where each walk is represented by a list of | |||||
| nodes. | |||||
| """ | |||||
| all_walks = [] | |||||
| for node in G: | |||||
| all_walks.extend(find_walks(G, node, length)) | |||||
| # The following process is not carried out according to the original article | |||||
| # all_paths_r = [ path[::-1] for path in all_paths ] | |||||
| # # For each path, two presentation are retrieved from its two extremities. Remove one of them. | |||||
| # for idx, path in enumerate(all_paths[:-1]): | |||||
| # for path2 in all_paths_r[idx+1::]: | |||||
| # if path == path2: | |||||
| # all_paths[idx] = [] | |||||
| # break | |||||
| # return list(filter(lambda a: a != [], all_paths)) | |||||
| return all_walks | |||||