| """ | |||
| @author: linlin | |||
| @references: | |||
| [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData | |||
| Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE. | |||
| """ | |||
import sys
import time
from itertools import product
from functools import partial
from multiprocessing import Pool, cpu_count

from tqdm import tqdm
import networkx as nx
import numpy as np

from gklearn.utils.utils import getSPGraph
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm


def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             parallel='imap_unordered',
             n_jobs=None,
             chunksize=None,
             verbose=True):
| """Calculate shortest-path kernels between graphs. | |||
| Parameters | |||
| ---------- | |||
| Gn : List of NetworkX graph | |||
| List of graphs between which the kernels are calculated. | |||
| G1, G2 : NetworkX graphs | |||
| Two graphs between which the kernel is calculated. | |||
| node_label : string | |||
| Node attribute used as label. The default node label is atom. | |||
| edge_weight : string | |||
| Edge attribute name corresponding to the edge weight. | |||
| node_kernels : dict | |||
| A dictionary of kernel functions for nodes, including 3 items: 'symb' | |||
| for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' | |||
| for both labels. The first 2 functions take two node labels as | |||
| parameters, and the 'mix' function takes 4 parameters, a symbolic and a | |||
| non-symbolic label for each the two nodes. Each label is in form of 2-D | |||
| dimension array (n_samples, n_features). Each function returns an | |||
| number as the kernel value. Ignored when nodes are unlabeled. | |||
| n_jobs : int | |||
| Number of jobs for parallelization. | |||
| Return | |||
| ------ | |||
| Kmatrix : Numpy matrix | |||
| Kernel matrix, each element of which is the sp kernel between 2 praphs. | |||
| """ | |||
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Setting all weights '
                  'to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print('\n Edge weight with name %s is not float or '
                          'integer. Setting all weights to 1.\n'
                          % edge_weight)
        except IndexError:
            if verbose:
                print('\n Edge weight with name "%s" is not found in the '
                      'edge attributes. Setting all weights to 1.\n'
                      % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no shortest path can be found in their
    # structures, so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n'
                  % (len_gn - len(Gn)))
    start_time = time.time()

    if parallel == 'imap_unordered':
        if n_jobs is None:
            n_jobs = cpu_count()
        pool = Pool(n_jobs)
        # get shortest path graphs of Gn
        getsp_partial = partial(wrapper_getSPGraph, weight)
        itr = zip(Gn, range(0, len(Gn)))
        if chunksize is None:
            # heuristic: split small datasets evenly across jobs, as
            # pool.map does by default; use a fixed chunk size otherwise.
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr,
                                                chunksize),
                            desc='getting sp graphs', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, g in iterator:
            Gn[i] = g
        pool.close()
        pool.join()
    elif parallel is None:
        # ---- direct running, normally uses a single CPU core. ----
        rng = range(len(Gn))
        if verbose:
            rng = tqdm(rng, desc='getting sp graphs', file=sys.stdout)
        for i in rng:
            Gn[i] = getSPGraph(Gn[i], edge_weight=weight)

#    # ---- use pool.map to parallelize ----
#    result_sp = pool.map(getsp_partial, range(0, len(Gn)))
#    for i in result_sp:
#        Gn[i[0]] = i[1]
#    # or
#    getsp_partial = partial(wrap_getSPGraph, Gn, weight)
#    for i, g in tqdm(
#            pool.map(getsp_partial, range(0, len(Gn))),
#            desc='getting sp graphs',
#            file=sys.stdout):
#        Gn[i] = g
#    # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) ----
#    sp_ml = [0] * len(Gn)  # shortest path matrices
#    for i in result_sp:
#        sp_ml[i[0]] = i[1]
#    edge_x_g = [[] for i in range(len(sp_ml))]
#    edge_y_g = [[] for i in range(len(sp_ml))]
#    edge_w_g = [[] for i in range(len(sp_ml))]
#    for idx, item in enumerate(sp_ml):
#        for i1 in range(len(item)):
#            for i2 in range(i1 + 1, len(item)):
#                if item[i1, i2] != np.inf:
#                    edge_x_g[idx].append(i1)
#                    edge_y_g[idx].append(i2)
#                    edge_w_g[idx].append(item[i1, i2])
#    print(len(edge_x_g[0]))
#    print(len(edge_y_g[0]))
#    print(len(edge_w_g[0]))
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize,
                verbose=verbose)
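    # parallel_gm shares the shortest-path graph list with each worker via
    # init_worker, which stores it in the module-level global G_gn read by
    # wrapper_sp_do; each task then only pickles an index pair (i, j)
    # instead of two full graphs.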
#    # ---- use pool.map to parallelize. ----
#    # result_perf = pool.map(do_partial, itr)
#    do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    for i, j, kernel in tqdm(
#            pool.map(do_partial, itr), desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()

#    # ---- use joblib.Parallel to parallelize and track progress. ----
#    result_perf = Parallel(
#        n_jobs=n_jobs, verbose=10)(
#            delayed(do_partial)(ij)
#            for ij in combinations_with_replacement(range(0, len(Gn)), 2))
#    result_perf = [
#        do_partial(ij)
#        for ij in combinations_with_replacement(range(0, len(Gn)), 2)
#    ]
#    for i in result_perf:
#        Kmatrix[i[0]][i[1]] = i[2]
#        Kmatrix[i[1]][i[0]] = i[2]

#    # ---- direct running, normally uses a single CPU core. ----
#    from itertools import combinations_with_replacement
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#        kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
    run_time = time.time() - start_time
    if verbose:
        print("\n --- shortest path kernel matrix of size %d built in %s "
              "seconds ---" % (len(Gn), run_time))

    return Kmatrix, run_time, idx
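

# Usage sketch (illustrative, not from the original module; any callables
# with the signatures documented above work as node kernels):
#
#     def dirac(x, y):                  # symbolic labels
#         return 1. if x == y else 0.
#
#     def gauss(x, y):                  # non-symbolic label vectors
#         x = np.ravel(np.array(x, dtype=float))
#         y = np.ravel(np.array(y, dtype=float))
#         return np.exp(-np.sum((x - y) ** 2) / 2.)
#
#     def mix(sl1, sl2, al1, al2):      # both label types
#         return dirac(sl1, sl2) * gauss(al1, al2)
#
#     node_kernels = {'symb': dirac, 'nsymb': gauss, 'mix': mix}
#     Kmatrix, run_time, idx = spkernel(Gn, node_label='atom',
#                                       node_kernels=node_kernels, n_jobs=4)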


def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
    """Compute the shortest-path kernel between two shortest-path graphs."""
    kernel = 0

    # compute all pairwise vertex kernels first, method borrowed from FCSP.
    vk_dict = {}  # vertex kernel dict, keyed by node pairs
    if ds_attrs['node_labeled']:
        # node symb and non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['mix']
            for n1, n2 in product(
                    g1.nodes(data=True), g2.nodes(data=True)):
                vk_dict[(n1[0], n2[0])] = kn(
                    n1[1][node_label], n2[1][node_label],
                    n1[1]['attributes'], n2[1]['attributes'])
        # node symb labeled
        else:
            kn = node_kernels['symb']
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
                                                 n2[1][node_label])
    else:
        # node non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['nsymb']
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
                                                 n2[1]['attributes'])
        # node unlabeled: simply count pairs of shortest paths of equal
        # length and return early.
        else:
            for e1, e2 in product(
                    g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    kernel += 1
            return kernel
    # compute graph kernels
    if ds_attrs['is_directed']:
        for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
            if e1[2]['cost'] == e2[2]['cost']:
                nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
                kn1 = nk11 * nk22
                kernel += kn1
    else:
        for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
            if e1[2]['cost'] == e2[2]['cost']:
                # each edge walk is counted twice, starting from both its
                # extreme nodes.
                nk11, nk12, nk21, nk22 = (vk_dict[(e1[0], e2[0])],
                                          vk_dict[(e1[0], e2[1])],
                                          vk_dict[(e1[1], e2[0])],
                                          vk_dict[(e1[1], e2[1])])
                kn1 = nk11 * nk22
                kn2 = nk12 * nk21
                kernel += kn1 + kn2
#    # ---- exact implementation of the Fast Computation of Shortest Path
#    # Kernel (FCSP), reference [2]; sadly it is slower than the current
#    # implementation.
#    # compute vertex kernels
#    try:
#        vk_mat = np.zeros((nx.number_of_nodes(g1),
#                           nx.number_of_nodes(g2)))
#        g1nl = enumerate(g1.nodes(data=True))
#        g2nl = enumerate(g2.nodes(data=True))
#        for i1, n1 in g1nl:
#            for i2, n2 in g2nl:
#                vk_mat[i1][i2] = kn(
#                    n1[1][node_label], n2[1][node_label],
#                    [n1[1]['attributes']], [n2[1]['attributes']])
#
#        range1 = range(0, len(edge_w_g[i]))
#        range2 = range(0, len(edge_w_g[j]))
#        for i1 in range1:
#            x1 = edge_x_g[i][i1]
#            y1 = edge_y_g[i][i1]
#            w1 = edge_w_g[i][i1]
#            for i2 in range2:
#                x2 = edge_x_g[j][i2]
#                y2 = edge_y_g[j][i2]
#                w2 = edge_w_g[j][i2]
#                ke = (w1 == w2)
#                if ke > 0:
#                    kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
#                    kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
#                    kernel += kn1 + kn2

    return kernel
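

# A hand-worked instance of the undirected combination rule above
# (illustrative, assuming a Dirac vertex kernel on 'atom' labels): for
# SP-graph edges e1 = (C, O) and e2 = (O, C), both of cost 1, the two
# endpoint alignments give delta(C,O)*delta(O,C) + delta(C,C)*delta(O,O)
# = 0 + 1, so the pair contributes 1; matching endpoints either way round
# is what makes the kernel independent of edge orientation.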


def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):
    # G_gn is the SP-graph list shared by init_worker in spkernel.
    i = itr[0]
    j = itr[1]
    return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label,
                             node_kernels)
#def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item):
#    g1 = itr_item[0][0]
#    g2 = itr_item[0][1]
#    i = itr_item[1][0]
#    j = itr_item[1][1]
#    return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels)


def wrapper_getSPGraph(weight, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, getSPGraph(g, edge_weight=weight)
    # return i, nx.floyd_warshall_numpy(g, weight=weight)
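

# Minimal runnable self-check (an illustrative sketch, not part of the
# original module): two tiny node-labeled graphs compared with a Dirac node
# kernel. Like the nested init_worker above, it relies on a fork-based
# multiprocessing start method (the Linux default).
if __name__ == '__main__':

    def dirac(x, y):
        # Dirac/delta kernel on symbolic labels.
        return 1. if x == y else 0.

    def mix(sl1, sl2, al1, al2):
        # only the symbolic part matters here; no 'attributes' are present.
        return dirac(sl1, sl2)

    g1 = nx.Graph()  # C-O-C path
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}),
                       (2, {'atom': 'C'})])
    g1.add_edges_from([(0, 1), (1, 2)])
    g2 = nx.Graph()  # single C-O edge
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g2.add_edge(0, 1)

    node_kernels = {'symb': dirac, 'nsymb': dirac, 'mix': mix}
    Kmatrix, run_time, idx = spkernel(g1, g2, node_label='atom',
                                      node_kernels=node_kernels,
                                      parallel=None, n_jobs=1, verbose=False)
    print(Kmatrix)  # expected: [[6., 2.], [2., 1.]]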