@@ -3,9 +3,9 @@
@references:

    [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels:
    Hardness results and efficient alternatives. Learning Theory and Kernel
    Machines, pages 129–143, 2003.
"""
import sys

@@ -22,428 +22,429 @@ from gklearn.utils.parallel import parallel_gm
def commonwalkkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
#                     n=None,
                     weight=1,
                     compute_method=None,
                     n_jobs=None,
                     chunksize=None,
                     verbose=True):
    """Calculate common walk graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as symbolic label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as symbolic label. The default edge label is 'bond_type'.
    weight : integer
        Weight coefficient for walks of different lengths; it corresponds to
        beta in the 'exp' method and gamma in the 'geo' method.
    compute_method : string
        Method used to compute the walk kernel. The following choices are
        available:

        'exp': method based on the exponential series applied to the direct
        product graph, as introduced in reference [1]. The time complexity is
        O(n^6) for graphs with n vertices.

        'geo': method based on the geometric series applied to the direct
        product graph, as introduced in reference [1]. The time complexity is
        O(n^6) for graphs with n vertices.
    n_jobs : int
        Number of jobs for parallelization.
    chunksize : int
        Chunk size for parallelization; computed automatically when None.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is a common walk kernel between 2
        graphs.
    """
#    n : integer
#        Longest length of walks. Only useful when applying the 'brute' method.
#    'brute': brute force, simply search for all walks and compare them.
    compute_method = compute_method.lower()
    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    # remove graphs with only 1 node, as they do not have adjacency matrices
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they have only 1 node.\n' %
                  (len_gn - len(Gn)))

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled']:
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')
    if not ds_attrs['is_directed']:  # convert undirected graphs to directed
        Gn = [G.to_directed() for G in Gn]

    start_time = time.time()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
    # direct product graph method - exponential
    if compute_method == 'exp':
        do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
    # direct product graph method - geometric
    elif compute_method == 'geo':
        do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
#    pool = Pool(n_jobs)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#    if len_itr < 1000 * n_jobs:
#        chunksize = int(len_itr / n_jobs) + 1
#    else:
#        chunksize = 1000
#
#    # direct product graph method - exponential
#    if compute_method == 'exp':
#        do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
#    # direct product graph method - geometric
#    elif compute_method == 'geo':
#        do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)
#
#    for i, j, kernel in tqdm(
#            pool.imap_unordered(do_partial, itr, chunksize),
#            desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()

#    # ---- direct running, normally using a single CPU core. ----
#    # direct product graph method - exponential
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    if compute_method == 'exp':
#        for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#            Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label,
#                                                  edge_label, weight)
#            Kmatrix[j][i] = Kmatrix[i][j]
#
#    # direct product graph method - geometric
#    elif compute_method == 'geo':
#        for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#            Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label,
#                                                  edge_label, weight)
#            Kmatrix[j][i] = Kmatrix[i][j]

#    # search all walks using brute force.
#    elif compute_method == 'brute':
#        n = int(n)
#        # get all walks of all graphs before calculating kernels to save
#        # time, but this may cost a lot of memory for large datasets.
#        all_walks = [
#            find_all_walks_until_length(Gn[i], n, node_label, edge_label)
#            for i in range(0, len(Gn))
#        ]
#
#        for i in range(0, len(Gn)):
#            for j in range(i, len(Gn)):
#                Kmatrix[i][j] = _commonwalkkernel_brute(
#                    all_walks[i],
#                    all_walks[j],
#                    node_label=node_label,
#                    edge_label=edge_label)
#                Kmatrix[j][i] = Kmatrix[i][j]

    run_time = time.time() - start_time
    if verbose:
        print("\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time, idx
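
# A minimal usage sketch of commonwalkkernel (illustrative graphs and labels;
# the API is as documented above, not a prescribed workflow):
#
#     import networkx as nx
#     g1, g2 = nx.cycle_graph(4), nx.path_graph(5)
#     for g in (g1, g2):
#         nx.set_node_attributes(g, '0', 'atom')
#         nx.set_edge_attributes(g, '0', 'bond_type')
#     Kmatrix, run_time, idx = commonwalkkernel(
#         [g1, g2], compute_method='geo', weight=0.1, n_jobs=1)
#     print(Kmatrix.shape)  # (2, 2)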

def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta):
    """Calculate the common walk kernel between 2 graphs using the
    exponential series.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    beta : integer
        Weight.

    Return
    ------
    kernel : float
        The common walk kernel between 2 graphs.
    """
    # get tensor product / direct product
    gp = direct_product(g1, g2, node_label, edge_label)
    # return 0 if the direct product graph has fewer than 2 nodes.
    if nx.number_of_nodes(gp) < 2:
        return 0
    A = nx.adjacency_matrix(gp).todense()

    ew, ev = np.linalg.eig(A)
    # exp(beta * A) = V diag(exp(beta * ew)) V^T, assuming A is symmetric
    # (the product graph comes from undirected inputs converted to directed).
    D = np.zeros((len(ew), len(ew)))
    for i in range(len(ew)):
        D[i][i] = np.exp(beta * ew[i])
    exp_D = ev * D * ev.T
    return exp_D.sum()
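
# Sanity-check sketch of the identity used above: for a symmetric adjacency
# matrix A, exp(beta * A) = V @ diag(exp(beta * w)) @ V.T with w, V = eigh(A).
# (Standalone and illustrative; not part of the library.)
#
#     import numpy as np
#     from math import factorial
#     A = np.array([[0., 1., 1.], [1., 0., 1.], [1., 1., 0.]])
#     beta = 0.5
#     w, V = np.linalg.eigh(A)
#     exp_betaA = V @ np.diag(np.exp(beta * w)) @ V.T
#     series = sum(np.linalg.matrix_power(beta * A, k) / factorial(k)
#                  for k in range(30))
#     assert np.allclose(exp_betaA, series)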

def wrapper_cw_exp(node_label, edge_label, beta, itr):
    i = itr[0]
    j = itr[1]
    return i, j, _commonwalkkernel_exp(G_gn[i], G_gn[j], node_label, edge_label, beta)

def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma):
    """Calculate the common walk kernel between 2 graphs using the geometric
    series.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    gamma : integer
        Weight.

    Return
    ------
    kernel : float
        The common walk kernel between 2 graphs.
    """
    # get tensor product / direct product
    gp = direct_product(g1, g2, node_label, edge_label)
    # return 0 if the direct product graph has fewer than 2 nodes.
    if nx.number_of_nodes(gp) < 2:
        return 0
    A = nx.adjacency_matrix(gp).todense()
    mat = np.identity(len(A)) - gamma * A
#    try:
    return mat.I.sum()
#    except np.linalg.LinAlgError:
#        return np.nan
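
# Sanity-check sketch of the geometric series used above: whenever
# gamma < 1 / rho(A) (rho = spectral radius), (I - gamma*A)^-1 equals
# sum_k (gamma*A)^k, which is why small weights keep 'geo' well defined.
# (Standalone and illustrative; not part of the library.)
#
#     import numpy as np
#     A = np.array([[0., 1., 0.], [1., 0., 1.], [0., 1., 0.]])  # rho = sqrt(2)
#     gamma = 0.2
#     inv_mat = np.linalg.inv(np.eye(3) - gamma * A)
#     series = sum(np.linalg.matrix_power(gamma * A, k) for k in range(200))
#     assert np.allclose(inv_mat, series)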

def wrapper_cw_geo(node_label, edge_label, gamma, itr):
    i = itr[0]
    j = itr[1]
    return i, j, _commonwalkkernel_geo(G_gn[i], G_gn[j], node_label, edge_label, gamma)

def _commonwalkkernel_brute(walks1,
                            walks2,
                            node_label='atom',
                            edge_label='bond_type',
                            labeled=True):
    """Calculate the common walk kernel between 2 graphs by brute force.

    Parameters
    ----------
    walks1, walks2 : list
        Lists of walks in the 2 graphs, where for unlabeled graphs each walk
        is represented by a list of nodes, while for labeled graphs each walk
        is represented by a string consisting of the labels of the nodes and
        edges on that walk.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    labeled : boolean
        Whether the graphs are labeled. The default is True.

    Return
    ------
    kernel : float
        The common walk kernel between 2 graphs.
    """
    counts_walks1 = dict(Counter(walks1))
    counts_walks2 = dict(Counter(walks2))
    all_walks = list(set(walks1 + walks2))

    vector1 = [(counts_walks1[walk] if walk in walks1 else 0)
               for walk in all_walks]
    vector2 = [(counts_walks2[walk] if walk in walks2 else 0)
               for walk in all_walks]

    kernel = np.dot(vector1, vector2)
    return kernel

# this method finds walks repetitively; it could be made faster.
def find_all_walks_until_length(G,
                                length,
                                node_label='atom',
                                edge_label='bond_type',
                                labeled=True):
    """Find all walks up to a certain maximum length in a graph.

    A recursive depth-first search is applied.

    Parameters
    ----------
    G : NetworkX graph
        The graph in which walks are searched.
    length : integer
        The maximum length of walks.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    labeled : boolean
        Whether the graphs are labeled. The default is True.

    Return
    ------
    walk : list
        List of walks retrieved, where for unlabeled graphs each walk is
        represented by a list of nodes, while for labeled graphs each walk is
        represented by a string consisting of the labels of the nodes and
        edges on that walk.
    """
    all_walks = []
    # @todo: in this way, the time complexity is close to
    # N(d^n+d^(n+1)+...+1), which could be optimized to O(Nd^n)
    for i in range(0, length + 1):
        new_walks = find_all_walks(G, i)
        if new_walks == []:
            break
        all_walks.extend(new_walks)

    if labeled:  # convert walks to strings
        walk_strs = []
        for walk in all_walks:
            # iterate by position: a walk may visit a node more than once,
            # in which case walk.index(node) would return the wrong position.
            strlist = [
                G.nodes[walk[i]][node_label] +
                G[walk[i]][walk[i + 1]][edge_label]
                for i in range(len(walk) - 1)
            ]
            walk_strs.append(''.join(strlist) + G.nodes[walk[-1]][node_label])
        return walk_strs

    return all_walks
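
# Illustrative call (hypothetical labels): walks of length <= 1 in a labeled
# triangle are returned as label strings.
#
#     import networkx as nx
#     G = nx.complete_graph(3)
#     nx.set_node_attributes(G, {0: 'C', 1: 'O', 2: 'N'}, 'atom')
#     nx.set_edge_attributes(G, '1', 'bond_type')
#     print(find_all_walks_until_length(G, 1))
#     # ['C', 'O', 'N', 'C1O', 'C1N', 'O1C', 'O1N', 'N1C', 'N1O']
#     # (order follows node/neighbor iteration order)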

def find_walks(G, source_node, length):
    """Find all walks with a certain length that start from a source node.

    A recursive depth-first search is applied.

    Parameters
    ----------
    G : NetworkX graph
        The graph in which walks are searched.
    source_node : integer
        The node from which all walks start.
    length : integer
        The length of walks.

    Return
    ------
    walk : list of list
        List of walks retrieved, where each walk is represented by a list of
        nodes.
    """
    return [[source_node]] if length == 0 else \
        [[source_node] + walk for neighbor in G[source_node]
         for walk in find_walks(G, neighbor, length - 1)]
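
# A minimal sanity check (illustrative): walks, unlike paths, may revisit
# nodes.
#
#     import networkx as nx
#     G = nx.path_graph(3)        # 0 - 1 - 2
#     print(find_walks(G, 1, 2))  # [[1, 0, 1], [1, 2, 1]]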

def find_all_walks(G, length):
    """Find all walks with a certain length in a graph.

    A recursive depth-first search is applied.

    Parameters
    ----------
    G : NetworkX graph
        The graph in which walks are searched.
    length : integer
        The length of walks.

    Return
    ------
    walk : list of list
        List of walks retrieved, where each walk is represented by a list of
        nodes.
    """
    all_walks = []
    for node in G:
        all_walks.extend(find_walks(G, node, length))

#    # The following process is not carried out according to the original
#    # article:
#    all_paths_r = [path[::-1] for path in all_paths]
#
#    # For each path, two representations are retrieved from its two
#    # extremities. Remove one of them.
#    for idx, path in enumerate(all_paths[:-1]):
#        for path2 in all_paths_r[idx+1::]:
#            if path == path2:
#                all_paths[idx] = []
#                break
#
#    return list(filter(lambda a: a != [], all_paths))

    return all_walks
@@ -3,14 +3,14 @@
@references:

    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
    labeled graphs. In Proceedings of the 20th International Conference on
    Machine Learning, Washington, DC, United States, 2003.

    [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
    Jean-Philippe Vert. Extensions of marginalized graph kernels. In
    Proceedings of the twenty-first international conference on Machine
    learning, page 70. ACM, 2004.
"""
import sys

@@ -31,275 +31,277 @@ from gklearn.utils.parallel import parallel_gm
def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       chunksize=None,
                       verbose=True):
    """Calculate marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as symbolic label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as symbolic label. The default edge label is 'bond_type'.
    p_quit : float
        The termination probability in the random walk generating step.
    n_iteration : integer
        Number of iterations used to calculate R_inf.
    remove_totters : boolean
        Whether to remove totterings by the method introduced in [2]. The
        default value is False.
    n_jobs : int
        Number of jobs for parallelization.
    chunksize : int
        Chunk size for parallelization; computed automatically when None.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel
        between 2 graphs.
    """
    # pre-process
    n_iteration = int(n_iteration)
    Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
    Gn = [g.copy() for g in Gn]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallelize and track progress. ----
        pool = Pool(n_jobs)
        untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label)
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        for i, g in tqdm(
                pool.imap_unordered(
                    untotter_partial, range(0, len(Gn)), chunksize),
                desc='removing tottering',
                file=sys.stdout):
            Gn[i] = g
        pool.close()
        pool.join()

#        # ---- direct running, normally using a single CPU core. ----
#        Gn = [
#            untotterTransformation(G, node_label, edge_label)
#            for G in tqdm(Gn, desc='removing tottering', file=sys.stdout)
#        ]

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)

#    # ---- direct running, normally using a single CPU core. ----
##    pbar = tqdm(
##        total=(1 + len(Gn)) * len(Gn) / 2,
##        desc='calculating kernels',
##        file=sys.stdout)
#    for i in range(0, len(Gn)):
#        for j in range(i, len(Gn)):
##            print(i, j)
#            Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
#                                                   edge_label, p_quit, n_iteration)
#            Kmatrix[j][i] = Kmatrix[i][j]
##            pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
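
# A minimal usage sketch of marginalizedkernel (illustrative graphs and
# labels; the API is as documented above):
#
#     import networkx as nx
#     g1, g2 = nx.cycle_graph(4), nx.path_graph(5)
#     for g in (g1, g2):
#         nx.set_node_attributes(g, '0', 'atom')
#         nx.set_edge_attributes(g, '0', 'bond_type')
#     Kmatrix, run_time = marginalizedkernel(
#         [g1, g2], p_quit=0.5, n_iteration=10, n_jobs=1)
#     print(Kmatrix)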

def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):
    """Calculate the marginalized graph kernel between 2 graphs.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    p_quit : float
        The termination probability in the random walk generating step.
    n_iteration : integer
        Number of iterations used to calculate R_inf.

    Return
    ------
    kernel : float
        Marginalized kernel between 2 graphs.
    """
    # init parameters
    kernel = 0
    num_nodes_G1 = nx.number_of_nodes(g1)
    num_nodes_G2 = nx.number_of_nodes(g2)
    # the initial probability distribution in the random walk generating step
    # (uniform distribution over |G|)
    p_init_G1 = 1 / num_nodes_G1
    p_init_G2 = 1 / num_nodes_G2
    q = p_quit * p_quit
    r1 = q

#    # initial R_inf
#    # matrix to save all the R_inf for all pairs of nodes
#    R_inf = np.zeros([num_nodes_G1, num_nodes_G2])
#
#    # calculate R_inf with a simple iterative method
#    for i in range(1, n_iteration):
#        R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])
#        R_inf_new.fill(r1)
#
#        # calculate R_inf for each pair of nodes
#        for node1 in g1.nodes(data=True):
#            neighbor_n1 = g1[node1[0]]
#            # the transition probability distribution in the random walk
#            # generating step (uniform distribution over the vertices
#            # adjacent to the current vertex)
#            if len(neighbor_n1) > 0:
#                p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
#                for node2 in g2.nodes(data=True):
#                    neighbor_n2 = g2[node2[0]]
#                    if len(neighbor_n2) > 0:
#                        p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
#
#                        for neighbor1 in neighbor_n1:
#                            for neighbor2 in neighbor_n2:
#                                t = p_trans_n1 * p_trans_n2 * \
#                                    deltakernel(g1.node[neighbor1][node_label],
#                                                g2.node[neighbor2][node_label]) * \
#                                    deltakernel(
#                                        neighbor_n1[neighbor1][edge_label],
#                                        neighbor_n2[neighbor2][edge_label])
#
#                                R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][
#                                    neighbor2]  # ref [1] equation (8)
#        R_inf[:] = R_inf_new
#
#    # add elements of R_inf up and calculate kernel
#    for node1 in g1.nodes(data=True):
#        for node2 in g2.nodes(data=True):
#            s = p_init_G1 * p_init_G2 * deltakernel(
#                node1[1][node_label], node2[1][node_label])
#            kernel += s * R_inf[node1[0]][node2[0]]  # ref [1] equation (6)

    R_inf = {}  # dict to save all the R_inf for all pairs of nodes
    # initial R_inf, the 1st iteration.
    for node1 in g1.nodes():
        for node2 in g2.nodes():
#            R_inf[(node1[0], node2[0])] = r1
            if len(g1[node1]) > 0:
                if len(g2[node2]) > 0:
                    R_inf[(node1, node2)] = r1
                else:
                    R_inf[(node1, node2)] = p_quit
            else:
                if len(g2[node2]) > 0:
                    R_inf[(node1, node2)] = p_quit
                else:
                    R_inf[(node1, node2)] = 1

    # compute all transition probabilities first.
    t_dict = {}
    if n_iteration > 1:
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            # the transition probability distribution in the random walk
            # generating step (uniform distribution over the vertices adjacent
            # to the current vertex)
            if len(neighbor_n1) > 0:
                p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                t_dict[(node1, node2, neighbor1, neighbor2)] = \
                                    p_trans_n1 * p_trans_n2 * \
                                    deltakernel(g1.nodes[neighbor1][node_label],
                                                g2.nodes[neighbor2][node_label]) * \
                                    deltakernel(
                                        neighbor_n1[neighbor1][edge_label],
                                        neighbor_n2[neighbor2][edge_label])

    # calculate R_inf with a simple iterative method
    for i in range(2, n_iteration + 1):
        R_inf_old = R_inf.copy()

        # calculate R_inf for each pair of nodes
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            # the transition probability distribution in the random walk
            # generating step (uniform distribution over the vertices adjacent
            # to the current vertex)
            if len(neighbor_n1) > 0:
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        R_inf[(node1, node2)] = r1
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                R_inf[(node1, node2)] += \
                                    (t_dict[(node1, node2, neighbor1, neighbor2)] *
                                     R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)

    # add elements of R_inf up and calculate kernel
    for (n1, n2), value in R_inf.items():
        s = p_init_G1 * p_init_G2 * deltakernel(
            g1.nodes[n1][node_label], g2.nodes[n2][node_label])
        kernel += s * value  # ref [1] equation (6)

    return kernel
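
# For reference, the fixed-point iteration implemented above, written out
# (notation read off the code; cf. equations (6) and (8) in [1], with the
# degenerate neighbourless cases handled separately in the initialization):
#
#     R_1(v1, v2) = p_quit^2
#     R_i(v1, v2) = p_quit^2
#                   + sum_{u1 in N(v1)} sum_{u2 in N(v2)}
#                         t(v1, v2, u1, u2) * R_{i-1}(u1, u2)
#     t(v1, v2, u1, u2) = (1 - p_quit)^2 / (|N(v1)| * |N(v2)|)
#                         * delta(l(u1), l(u2)) * delta(l(v1,u1), l(v2,u2))
#     k(G1, G2)   = (1 / (|V1| * |V2|))
#                   * sum_{v1, v2} delta(l(v1), l(v2)) * R_n(v1, v2)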

def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):
    i = itr[0]
    j = itr[1]
    return i, j, _marginalizedkernel_do(G_gn[i], G_gn[j], node_label, edge_label, p_quit, n_iteration)

def wrapper_untotter(Gn, node_label, edge_label, i):
    return i, untotterTransformation(Gn[i], node_label, edge_label)
@@ -373,8 +373,18 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None
                       for key in all_paths]
            kernel = np.sum(np.minimum(vector1, vector2)) / \
                np.sum(np.maximum(vector1, vector2))
        elif self.__k_func is None:  # no sub-kernel used; compare paths directly.
            path_count1 = Counter(paths1)
            path_count2 = Counter(paths2)
            vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0)
                       for key in all_paths]
            vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0)
                       for key in all_paths]
            kernel = np.dot(vector1, vector2)
        else:
            raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax" and None.')

        return kernel
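
# Sketch of the new k_func == None branch in isolation: with no sub-kernel,
# the kernel reduces to a dot product of path-count vectors (the paths here
# are hypothetical label strings).
#
#     from collections import Counter
#     import numpy as np
#     paths1, paths2 = ['CC', 'CO', 'CC'], ['CC', 'CN']
#     all_paths = list(set(paths1 + paths2))
#     c1, c2 = Counter(paths1), Counter(paths2)
#     v1 = [c1[p] for p in all_paths]  # Counter returns 0 for missing keys
#     v2 = [c2[p] for p in all_paths]
#     print(np.dot(v1, v2))  # 2: 'CC' occurs 2 times in paths1, 1 in paths2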
@@ -2,9 +2,9 @@
@author: linlin
@references:

    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data
    Mining, Fifth IEEE International Conference on, 2005 Nov 27 (pp. 8-pp).
    IEEE.
"""
import sys

@@ -22,303 +22,305 @@ from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm
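
# A hedged sketch of the node_kernels dict expected by spkernel below. The
# delta/Gaussian choices are illustrative (gklearn ships similar helpers);
# all names here are local to the example.
#
#     import numpy as np
#
#     def k_symb(x, y):             # symbolic node labels: delta kernel
#         return 1. if x == y else 0.
#
#     def k_nsymb(x, y):            # non-symbolic labels: Gaussian kernel
#         d = np.asarray(x) - np.asarray(y)
#         return np.exp(-np.dot(d, d) / 2)
#
#     def k_mix(sx, sy, nx1, ny1):  # both label types: product of the two
#         return k_symb(sx, sy) * k_nsymb(nx1, ny1)
#
#     node_kernels = {'symb': k_symb, 'nsymb': k_nsymb, 'mix': k_mix}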
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             parallel='imap_unordered',
             n_jobs=None,
             verbose=True):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each of the two nodes. Each label is in the
        form of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.
    parallel : string
        Parallelization strategy for building shortest-path graphs:
        'imap_unordered' or None.
    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2
        graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print(
                        '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                        % edge_weight)
        except:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    if parallel == 'imap_unordered':
        pool = Pool(n_jobs)
        # get shortest path graphs of Gn
        getsp_partial = partial(wrapper_getSPGraph, weight)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
#            # use default chunksize as pool.map when iterable is less than 100
#            chunksize, extra = divmod(len(Gn), n_jobs * 4)
#            if extra:
#                chunksize += 1
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting sp graphs', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, g in iterator:
            Gn[i] = g
        pool.close()
        pool.join()
    elif parallel is None:
        pass
#        # ---- direct running, normally using a single CPU core. ----
#        for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout):
#            i, Gn[i] = wrapper_getSPGraph(weight, (Gn[i], i))

#    # ---- use pool.map to parallelize ----
#    result_sp = pool.map(getsp_partial, range(0, len(Gn)))
#    for i in result_sp:
#        Gn[i[0]] = i[1]
#    # or
#    getsp_partial = partial(wrap_getSPGraph, Gn, weight)
#    for i, g in tqdm(
#            pool.map(getsp_partial, range(0, len(Gn))),
#            desc='getting sp graphs',
#            file=sys.stdout):
#        Gn[i] = g

#    # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) ----
#    sp_ml = [0] * len(Gn)  # shortest path matrices
#    for i in result_sp:
#        sp_ml[i[0]] = i[1]
#    edge_x_g = [[] for i in range(len(sp_ml))]
#    edge_y_g = [[] for i in range(len(sp_ml))]
#    edge_w_g = [[] for i in range(len(sp_ml))]
#    for idx, item in enumerate(sp_ml):
#        for i1 in range(len(item)):
#            for i2 in range(i1 + 1, len(item)):
#                if item[i1, i2] != np.inf:
#                    edge_x_g[idx].append(i1)
#                    edge_y_g[idx].append(i2)
#                    edge_w_g[idx].append(item[i1, i2])
#    print(len(edge_x_g[0]))
#    print(len(edge_y_g[0]))
#    print(len(edge_w_g[0]))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)

#    # ---- use pool.map to parallelize. ----
#    # result_perf = pool.map(do_partial, itr)
#    do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
| # for i, j, kernel in tqdm( | |||||
| # pool.map(do_partial, itr), desc='calculating kernels', | |||||
| # file=sys.stdout): | |||||
| # Kmatrix[i][j] = kernel | |||||
| # Kmatrix[j][i] = kernel | |||||
| # pool.close() | |||||
| # pool.join() | |||||
| # # ---- use joblib.Parallel to parallel and track progress. ---- | |||||
| # result_perf = Parallel( | |||||
| # n_jobs=n_jobs, verbose=10)( | |||||
| # delayed(do_partial)(ij) | |||||
| # for ij in combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
| # result_perf = [ | |||||
| # do_partial(ij) | |||||
| # for ij in combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| # ] | |||||
| # for i in result_perf: | |||||
| # Kmatrix[i[0]][i[1]] = i[2] | |||||
| # Kmatrix[i[1]][i[0]] = i[2] | |||||
| # # ---- direct running, normally use single CPU core. ---- | |||||
| # from itertools import combinations_with_replacement | |||||
| # itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| # for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): | |||||
| # kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels) | |||||
| # Kmatrix[i][j] = kernel | |||||
| # Kmatrix[j][i] = kernel | |||||
| run_time = time.time() - start_time | |||||
| if verbose: | |||||
| print( | |||||
| "\n --- shortest path kernel matrix of size %d built in %s seconds ---" | |||||
| % (len(Gn), run_time)) | |||||
| return Kmatrix, run_time, idx | |||||
| node_label='atom', | |||||
| edge_weight=None, | |||||
| node_kernels=None, | |||||
| parallel='imap_unordered', | |||||
| n_jobs=None, | |||||
| chunksize=None, | |||||
| verbose=True): | |||||
| """Calculate shortest-path kernels between graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| Gn : List of NetworkX graph | |||||
| List of graphs between which the kernels are calculated. | |||||
| G1, G2 : NetworkX graphs | |||||
| Two graphs between which the kernel is calculated. | |||||
| node_label : string | |||||
| Node attribute used as label. The default node label is 'atom'. | |||||
| edge_weight : string | |||||
| Edge attribute name corresponding to the edge weight. | |||||
| node_kernels : dict | |||||
| A dictionary of kernel functions for nodes, including 3 items: 'symb' | |||||
| for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' | |||||
| for both labels. The first 2 functions take two node labels as | |||||
| parameters, and the 'mix' function takes 4 parameters: a symbolic and a | |||||
| non-symbolic label for each of the two nodes. Each label is in the form | |||||
| of a 2-D array (n_samples, n_features). Each function returns a | |||||
| number as the kernel value. Ignored when nodes are unlabeled. | |||||
| n_jobs : int | |||||
| Number of jobs for parallelization. | |||||
| chunksize : int | |||||
| Chunk size for multiprocessing; if None, a suitable size is chosen | |||||
| based on the number of graphs and n_jobs. | |||||
| Return | |||||
| ------ | |||||
| Kmatrix : Numpy matrix | |||||
| Kernel matrix, each element of which is the sp kernel between 2 graphs. | |||||
| """ | |||||
| # pre-process | |||||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||||
| Gn = [g.copy() for g in Gn] | |||||
| weight = None | |||||
| if edge_weight is None: | |||||
| if verbose: | |||||
| print('\n No edge weight is specified. Set all weights to 1.\n') | |||||
| else: | |||||
| try: | |||||
| some_weight = list( | |||||
| nx.get_edge_attributes(Gn[0], edge_weight).values())[0] | |||||
| if isinstance(some_weight, (float, int)): | |||||
| weight = edge_weight | |||||
| else: | |||||
| if verbose: | |||||
| print( | |||||
| '\n Edge weight with name "%s" is not float or integer. Set all weights to 1.\n' | |||||
| % edge_weight) | |||||
| except Exception: | |||||
| if verbose: | |||||
| print( | |||||
| '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' | |||||
| % edge_weight) | |||||
| ds_attrs = get_dataset_attributes( | |||||
| Gn, | |||||
| attr_names=['node_labeled', 'node_attr_dim', 'is_directed'], | |||||
| node_label=node_label) | |||||
| # remove graphs with no edges, as no sp can be found in their structures, | |||||
| # so the kernel between such a graph and itself will be zero. | |||||
| len_gn = len(Gn) | |||||
| Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] | |||||
| idx = [G[0] for G in Gn] | |||||
| Gn = [G[1] for G in Gn] | |||||
| if len(Gn) != len_gn: | |||||
| if verbose: | |||||
| print('\n %d graphs are removed as they don\'t contain edges.\n' % | |||||
| (len_gn - len(Gn))) | |||||
| start_time = time.time() | |||||
| if parallel == 'imap_unordered': | |||||
| pool = Pool(n_jobs) | |||||
| # get shortest path graphs of Gn | |||||
| getsp_partial = partial(wrapper_getSPGraph, weight) | |||||
| itr = zip(Gn, range(0, len(Gn))) | |||||
| if chunksize is None: | |||||
| if len(Gn) < 100 * n_jobs: | |||||
| # # use default chunksize as pool.map when iterable is less than 100 | |||||
| # chunksize, extra = divmod(len(Gn), n_jobs * 4) | |||||
| # if extra: | |||||
| # chunksize += 1 | |||||
| chunksize = int(len(Gn) / n_jobs) + 1 | |||||
| else: | |||||
| chunksize = 100 | |||||
| if verbose: | |||||
| iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize), | |||||
| desc='getting sp graphs', file=sys.stdout) | |||||
| else: | |||||
| iterator = pool.imap_unordered(getsp_partial, itr, chunksize) | |||||
| for i, g in iterator: | |||||
| Gn[i] = g | |||||
| pool.close() | |||||
| pool.join() | |||||
| elif parallel is None: | |||||
| pass | |||||
| # # ---- direct running, normally use single CPU core. ---- | |||||
| # for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout): | |||||
| # i, Gn[i] = wrapper_getSPGraph(weight, (Gn[i], i)) | |||||
| # # ---- use pool.map to parallel ---- | |||||
| # result_sp = pool.map(getsp_partial, range(0, len(Gn))) | |||||
| # for i in result_sp: | |||||
| # Gn[i[0]] = i[1] | |||||
| # or | |||||
| # getsp_partial = partial(wrap_getSPGraph, Gn, weight) | |||||
| # for i, g in tqdm( | |||||
| # pool.map(getsp_partial, range(0, len(Gn))), | |||||
| # desc='getting sp graphs', | |||||
| # file=sys.stdout): | |||||
| # Gn[i] = g | |||||
| # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) | |||||
| # sp_ml = [0] * len(Gn) # shortest path matrices | |||||
| # for i in result_sp: | |||||
| # sp_ml[i[0]] = i[1] | |||||
| # edge_x_g = [[] for i in range(len(sp_ml))] | |||||
| # edge_y_g = [[] for i in range(len(sp_ml))] | |||||
| # edge_w_g = [[] for i in range(len(sp_ml))] | |||||
| # for idx, item in enumerate(sp_ml): | |||||
| # for i1 in range(len(item)): | |||||
| # for i2 in range(i1 + 1, len(item)): | |||||
| # if item[i1, i2] != np.inf: | |||||
| # edge_x_g[idx].append(i1) | |||||
| # edge_y_g[idx].append(i2) | |||||
| # edge_w_g[idx].append(item[i1, i2]) | |||||
| # print(len(edge_x_g[0])) | |||||
| # print(len(edge_y_g[0])) | |||||
| # print(len(edge_w_g[0])) | |||||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
| # ---- use pool.imap_unordered to parallel and track progress. ---- | |||||
| def init_worker(gn_toshare): | |||||
| global G_gn | |||||
| G_gn = gn_toshare | |||||
| do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels) | |||||
| parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
| glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) | |||||
| # # ---- use pool.map to parallel. ---- | |||||
| # # result_perf = pool.map(do_partial, itr) | |||||
| # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) | |||||
| # itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| # for i, j, kernel in tqdm( | |||||
| # pool.map(do_partial, itr), desc='calculating kernels', | |||||
| # file=sys.stdout): | |||||
| # Kmatrix[i][j] = kernel | |||||
| # Kmatrix[j][i] = kernel | |||||
| # pool.close() | |||||
| # pool.join() | |||||
| # # ---- use joblib.Parallel to parallel and track progress. ---- | |||||
| # result_perf = Parallel( | |||||
| # n_jobs=n_jobs, verbose=10)( | |||||
| # delayed(do_partial)(ij) | |||||
| # for ij in combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
| # result_perf = [ | |||||
| # do_partial(ij) | |||||
| # for ij in combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| # ] | |||||
| # for i in result_perf: | |||||
| # Kmatrix[i[0]][i[1]] = i[2] | |||||
| # Kmatrix[i[1]][i[0]] = i[2] | |||||
| # # ---- direct running, normally use single CPU core. ---- | |||||
| # from itertools import combinations_with_replacement | |||||
| # itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| # for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): | |||||
| # kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels) | |||||
| # Kmatrix[i][j] = kernel | |||||
| # Kmatrix[j][i] = kernel | |||||
| run_time = time.time() - start_time | |||||
| if verbose: | |||||
| print( | |||||
| "\n --- shortest path kernel matrix of size %d built in %s seconds ---" | |||||
| % (len(Gn), run_time)) | |||||
| return Kmatrix, run_time, idx | |||||
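A minimal usage sketch of the refactored signature. The node kernels below are illustrative stand-ins matching the shapes described in the docstring; they are not the helpers gklearn ships:

import numpy as np
import networkx as nx

def k_symb(l1, l2):              # Dirac kernel on symbolic labels
    return 1.0 if l1 == l2 else 0.0

def k_nsymb(a1, a2, gamma=1.0):  # Gaussian kernel on attribute vectors
    d = np.asarray(a1, dtype=float) - np.asarray(a2, dtype=float)
    return float(np.exp(-gamma * np.dot(d, d)))

def k_mix(l1, l2, a1, a2):       # product of the two, for mixed labels
    return k_symb(l1, l2) * k_nsymb(a1, a2)

node_kernels = {'symb': k_symb, 'nsymb': k_nsymb, 'mix': k_mix}

if __name__ == '__main__':       # guard needed: spkernel spawns a process pool
    G1, G2 = nx.path_graph(3), nx.cycle_graph(4)
    for G in (G1, G2):
        nx.set_node_attributes(G, 'C', 'atom')
    Kmatrix, run_time, idx = spkernel([G1, G2], node_label='atom',
                                      node_kernels=node_kernels,
                                      n_jobs=2, chunksize=None)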
| def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels): | def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels): | ||||
| kernel = 0 | |||||
| # compute vertex kernels first, method borrowed from FCSP. | |||||
| vk_dict = {} # vertex kernel values dict | |||||
| if ds_attrs['node_labeled']: | |||||
| # node symb and non-symb labeled | |||||
| if ds_attrs['node_attr_dim'] > 0: | |||||
| kn = node_kernels['mix'] | |||||
| for n1, n2 in product( | |||||
| g1.nodes(data=True), g2.nodes(data=True)): | |||||
| vk_dict[(n1[0], n2[0])] = kn( | |||||
| n1[1][node_label], n2[1][node_label], | |||||
| n1[1]['attributes'], n2[1]['attributes']) | |||||
| # node symb labeled | |||||
| else: | |||||
| kn = node_kernels['symb'] | |||||
| for n1 in g1.nodes(data=True): | |||||
| for n2 in g2.nodes(data=True): | |||||
| vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], | |||||
| n2[1][node_label]) | |||||
| else: | |||||
| # node non-symb labeled | |||||
| if ds_attrs['node_attr_dim'] > 0: | |||||
| kn = node_kernels['nsymb'] | |||||
| for n1 in g1.nodes(data=True): | |||||
| for n2 in g2.nodes(data=True): | |||||
| vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], | |||||
| n2[1]['attributes']) | |||||
| # node unlabeled | |||||
| else: | |||||
| for e1, e2 in product( | |||||
| g1.edges(data=True), g2.edges(data=True)): | |||||
| if e1[2]['cost'] == e2[2]['cost']: | |||||
| kernel += 1 | |||||
| return kernel | |||||
| # compute graph kernels | |||||
| if ds_attrs['is_directed']: | |||||
| for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||||
| if e1[2]['cost'] == e2[2]['cost']: | |||||
| nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], | |||||
| e2[1])] | |||||
| kn1 = nk11 * nk22 | |||||
| kernel += kn1 | |||||
| else: | |||||
| for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||||
| if e1[2]['cost'] == e2[2]['cost']: | |||||
| # each edge walk is counted twice, once from each of its two end nodes. | |||||
| nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[( | |||||
| e1[0], e2[1])], vk_dict[(e1[1], | |||||
| e2[0])], vk_dict[(e1[1], | |||||
| e2[1])] | |||||
| kn1 = nk11 * nk22 | |||||
| kn2 = nk12 * nk21 | |||||
| kernel += kn1 + kn2 | |||||
| # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2]; sadly it is slower than the current implementation | |||||
| # # compute vertex kernels | |||||
| # try: | |||||
| # vk_mat = np.zeros((nx.number_of_nodes(g1), | |||||
| # nx.number_of_nodes(g2))) | |||||
| # g1nl = enumerate(g1.nodes(data=True)) | |||||
| # g2nl = enumerate(g2.nodes(data=True)) | |||||
| # for i1, n1 in g1nl: | |||||
| # for i2, n2 in g2nl: | |||||
| # vk_mat[i1][i2] = kn( | |||||
| # n1[1][node_label], n2[1][node_label], | |||||
| # [n1[1]['attributes']], [n2[1]['attributes']]) | |||||
| # range1 = range(0, len(edge_w_g[i])) | |||||
| # range2 = range(0, len(edge_w_g[j])) | |||||
| # for i1 in range1: | |||||
| # x1 = edge_x_g[i][i1] | |||||
| # y1 = edge_y_g[i][i1] | |||||
| # w1 = edge_w_g[i][i1] | |||||
| # for i2 in range2: | |||||
| # x2 = edge_x_g[j][i2] | |||||
| # y2 = edge_y_g[j][i2] | |||||
| # w2 = edge_w_g[j][i2] | |||||
| # ke = (w1 == w2) | |||||
| # if ke > 0: | |||||
| # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | |||||
| # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | |||||
| # kernel += kn1 + kn2 | |||||
| return kernel | |||||
| kernel = 0 | |||||
| # compute vertex kernels first, method borrowed from FCSP. | |||||
| vk_dict = {} # vertex kernel values dict | |||||
| if ds_attrs['node_labeled']: | |||||
| # node symb and non-symb labeled | |||||
| if ds_attrs['node_attr_dim'] > 0: | |||||
| kn = node_kernels['mix'] | |||||
| for n1, n2 in product( | |||||
| g1.nodes(data=True), g2.nodes(data=True)): | |||||
| vk_dict[(n1[0], n2[0])] = kn( | |||||
| n1[1][node_label], n2[1][node_label], | |||||
| n1[1]['attributes'], n2[1]['attributes']) | |||||
| # node symb labeled | |||||
| else: | |||||
| kn = node_kernels['symb'] | |||||
| for n1 in g1.nodes(data=True): | |||||
| for n2 in g2.nodes(data=True): | |||||
| vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], | |||||
| n2[1][node_label]) | |||||
| else: | |||||
| # node non-symb labeled | |||||
| if ds_attrs['node_attr_dim'] > 0: | |||||
| kn = node_kernels['nsymb'] | |||||
| for n1 in g1.nodes(data=True): | |||||
| for n2 in g2.nodes(data=True): | |||||
| vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], | |||||
| n2[1]['attributes']) | |||||
| # node unlabeled | |||||
| else: | |||||
| for e1, e2 in product( | |||||
| g1.edges(data=True), g2.edges(data=True)): | |||||
| if e1[2]['cost'] == e2[2]['cost']: | |||||
| kernel += 1 | |||||
| return kernel | |||||
| # compute graph kernels | |||||
| if ds_attrs['is_directed']: | |||||
| for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||||
| if e1[2]['cost'] == e2[2]['cost']: | |||||
| nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], | |||||
| e2[1])] | |||||
| kn1 = nk11 * nk22 | |||||
| kernel += kn1 | |||||
| else: | |||||
| for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||||
| if e1[2]['cost'] == e2[2]['cost']: | |||||
| # each edge walk is counted twice, once from each of its two end nodes. | |||||
| nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[( | |||||
| e1[0], e2[1])], vk_dict[(e1[1], | |||||
| e2[0])], vk_dict[(e1[1], | |||||
| e2[1])] | |||||
| kn1 = nk11 * nk22 | |||||
| kn2 = nk12 * nk21 | |||||
| kernel += kn1 + kn2 | |||||
| # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2]; sadly it is slower than the current implementation | |||||
| # # compute vertex kernels | |||||
| # try: | |||||
| # vk_mat = np.zeros((nx.number_of_nodes(g1), | |||||
| # nx.number_of_nodes(g2))) | |||||
| # g1nl = enumerate(g1.nodes(data=True)) | |||||
| # g2nl = enumerate(g2.nodes(data=True)) | |||||
| # for i1, n1 in g1nl: | |||||
| # for i2, n2 in g2nl: | |||||
| # vk_mat[i1][i2] = kn( | |||||
| # n1[1][node_label], n2[1][node_label], | |||||
| # [n1[1]['attributes']], [n2[1]['attributes']]) | |||||
| # range1 = range(0, len(edge_w_g[i])) | |||||
| # range2 = range(0, len(edge_w_g[j])) | |||||
| # for i1 in range1: | |||||
| # x1 = edge_x_g[i][i1] | |||||
| # y1 = edge_y_g[i][i1] | |||||
| # w1 = edge_w_g[i][i1] | |||||
| # for i2 in range2: | |||||
| # x2 = edge_x_g[j][i2] | |||||
| # y2 = edge_y_g[j][i2] | |||||
| # w2 = edge_w_g[j][i2] | |||||
| # ke = (w1 == w2) | |||||
| # if ke > 0: | |||||
| # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | |||||
| # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | |||||
| # kernel += kn1 + kn2 | |||||
| return kernel | |||||
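The kn1 + kn2 sum in the undirected branch is the only difference from the directed branch: each matching pair of shortest paths is counted once per orientation. A toy check with hypothetical single-edge shortest-path graphs and a constant vertex kernel:

import networkx as nx
from itertools import product

s1 = nx.Graph(); s1.add_edge('a', 'b', cost=1.0)
s2 = nx.Graph(); s2.add_edge('x', 'y', cost=1.0)
kernel = 0.0
for e1, e2 in product(s1.edges(data=True), s2.edges(data=True)):
    if e1[2]['cost'] == e2[2]['cost']:
        # With all vertex-kernel values equal to 1, the matched edge pair
        # contributes kn1 + kn2 = 1 + 1, one term per orientation.
        kernel += 1.0 + 1.0
print(kernel)  # 2.0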
| def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr): | def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr): | ||||
| i = itr[0] | |||||
| j = itr[1] | |||||
| return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels) | |||||
| i = itr[0] | |||||
| j = itr[1] | |||||
| return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels) | |||||
| #def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item): | #def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item): | ||||
| # g1 = itr_item[0][0] | |||||
| # g2 = itr_item[0][1] | |||||
| # i = itr_item[1][0] | |||||
| # j = itr_item[1][1] | |||||
| # return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels) | |||||
| # g1 = itr_item[0][0] | |||||
| # g2 = itr_item[0][1] | |||||
| # i = itr_item[1][0] | |||||
| # j = itr_item[1][1] | |||||
| # return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels) | |||||
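wrapper_sp_do reads G_gn, a module-level name that init_worker binds once in every pool process, so each task message carries only an index pair rather than two pickled graphs. A self-contained sketch of this pattern (all names below are illustrative):

from multiprocessing import Pool

_shared = None

def _init(data):
    # Runs once per worker; stashes the read-only payload in a global.
    global _shared
    _shared = data

def _task(ij):
    i, j = ij
    return i, j, len(_shared[i]) * len(_shared[j])  # stand-in computation

if __name__ == '__main__':
    items = ['ab', 'cde', 'f']
    with Pool(2, initializer=_init, initargs=(items,)) as pool:
        print(sorted(pool.imap_unordered(_task, [(0, 1), (1, 2)])))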
| def wrapper_getSPGraph(weight, itr_item): | def wrapper_getSPGraph(weight, itr_item): | ||||
| g = itr_item[0] | |||||
| i = itr_item[1] | |||||
| return i, getSPGraph(g, edge_weight=weight) | |||||
| # return i, nx.floyd_warshall_numpy(g, weight=weight) | |||||
| g = itr_item[0] | |||||
| i = itr_item[1] | |||||
| return i, getSPGraph(g, edge_weight=weight) | |||||
| # return i, nx.floyd_warshall_numpy(g, weight=weight) | |||||
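getSPGraph, imported elsewhere in this module, replaces each input graph by its shortest-path graph, whose 'cost' edge attribute spkernel_do compares. It behaves roughly like this sketch (an approximation, not the library's exact implementation):

import networkx as nx

def sp_graph_sketch(g, edge_weight=None):
    # One edge per reachable vertex pair, weighted by the shortest-path
    # distance; this is the structure spkernel_do consumes.
    s = g.__class__()
    s.add_nodes_from(g.nodes(data=True))
    for u, dists in nx.shortest_path_length(g, weight=edge_weight):
        for v, d in dists.items():
            if u != v:
                s.add_edge(u, v, cost=d)
    return s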
| @@ -27,6 +27,7 @@ def treeletkernel(*args, | |||||
| edge_label='bond_type', | edge_label='bond_type', | ||||
| parallel='imap_unordered', | parallel='imap_unordered', | ||||
| n_jobs=None, | n_jobs=None, | ||||
| chunksize=None, | |||||
| verbose=True): | verbose=True): | ||||
| """Calculate treelet graph kernels between graphs. | """Calculate treelet graph kernels between graphs. | ||||
| @@ -92,10 +93,11 @@ def treeletkernel(*args, | |||||
| # time, but this may cost a lot of memory for large dataset. | # time, but this may cost a lot of memory for large dataset. | ||||
| pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
| itr = zip(Gn, range(0, len(Gn))) | itr = zip(Gn, range(0, len(Gn))) | ||||
| if len(Gn) < 100 * n_jobs: | |||||
| chunksize = int(len(Gn) / n_jobs) + 1 | |||||
| else: | |||||
| chunksize = 100 | |||||
| if chunksize is None: | |||||
| if len(Gn) < 100 * n_jobs: | |||||
| chunksize = int(len(Gn) / n_jobs) + 1 | |||||
| else: | |||||
| chunksize = 100 | |||||
| canonkeys = [[] for _ in range(len(Gn))] | canonkeys = [[] for _ in range(len(Gn))] | ||||
| get_partial = partial(wrapper_get_canonkeys, node_label, edge_label, | get_partial = partial(wrapper_get_canonkeys, node_label, edge_label, | ||||
| labeled, ds_attrs['is_directed']) | labeled, ds_attrs['is_directed']) | ||||
| @@ -115,7 +117,7 @@ def treeletkernel(*args, | |||||
| G_canonkeys = canonkeys_toshare | G_canonkeys = canonkeys_toshare | ||||
| do_partial = partial(wrapper_treeletkernel_do, sub_kernel) | do_partial = partial(wrapper_treeletkernel_do, sub_kernel) | ||||
| parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | ||||
| glbv=(canonkeys,), n_jobs=n_jobs, verbose=verbose) | |||||
| glbv=(canonkeys,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) | |||||
| # ---- do not use parallelization. ---- | # ---- do not use parallelization. ---- | ||||
| elif parallel == None: | elif parallel == None: | ||||
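The guard added above repeats the heuristic spkernel already uses; factored out, it is simply the following (a refactoring sketch, not code in the library):

def default_chunksize(len_itr, n_jobs):
    # Fewer than 100 items per worker: split the iterable evenly so each
    # worker gets roughly one chunk; otherwise cap chunks at 100 items to
    # bound per-task memory while keeping the workers' queues full.
    if len_itr < 100 * n_jobs:
        return len_itr // n_jobs + 1
    return 100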
| @@ -30,6 +30,7 @@ def weisfeilerlehmankernel(*args, | |||||
| base_kernel='subtree', | base_kernel='subtree', | ||||
| parallel=None, | parallel=None, | ||||
| n_jobs=None, | n_jobs=None, | ||||
| chunksize=None, | |||||
| verbose=True): | verbose=True): | ||||
| """Calculate Weisfeiler-Lehman kernels between graphs. | """Calculate Weisfeiler-Lehman kernels between graphs. | ||||
| @@ -91,7 +92,7 @@ def weisfeilerlehmankernel(*args, | |||||
| # for WL subtree kernel | # for WL subtree kernel | ||||
| if base_kernel == 'subtree': | if base_kernel == 'subtree': | ||||
| Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose) | |||||
| Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose) | |||||
| # for WL shortest path kernel | # for WL shortest path kernel | ||||
| elif base_kernel == 'sp': | elif base_kernel == 'sp': | ||||
| @@ -113,7 +114,7 @@ def weisfeilerlehmankernel(*args, | |||||
| return Kmatrix, run_time | return Kmatrix, run_time | ||||
| def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose): | |||||
| def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose): | |||||
| """Calculate Weisfeiler-Lehman kernels between graphs. | """Calculate Weisfeiler-Lehman kernels between graphs. | ||||
| Parameters | Parameters | ||||
| @@ -146,7 +147,7 @@ def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose) | |||||
| all_num_of_each_label.append(dict(Counter(labels_ori))) | all_num_of_each_label.append(dict(Counter(labels_ori))) | ||||
| # calculate subtree kernel with the 0th iteration and add it to the final kernel | # calculate subtree kernel with the 0th iteration and add it to the final kernel | ||||
| compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False) | |||||
| compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, False) | |||||
| # iterate each height | # iterate each height | ||||
| for h in range(1, height + 1): | for h in range(1, height + 1): | ||||
| @@ -304,7 +305,7 @@ def wrapper_wl_iteration(node_label, itr_item): | |||||
| return i, all_multisets | return i, all_multisets | ||||
| def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose): | |||||
| def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, verbose): | |||||
| """Compute kernel matrix using the base kernel. | """Compute kernel matrix using the base kernel. | ||||
| """ | """ | ||||
| if parallel == 'imap_unordered': | if parallel == 'imap_unordered': | ||||
| @@ -314,7 +315,7 @@ def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, | |||||
| G_alllabels = alllabels_toshare | G_alllabels = alllabels_toshare | ||||
| do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix) | do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix) | ||||
| parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | ||||
| glbv=(all_num_of_each_label,), n_jobs=n_jobs, verbose=verbose) | |||||
| glbv=(all_num_of_each_label,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) | |||||
| elif parallel == None: | elif parallel == None: | ||||
| for i in range(len(Kmatrix)): | for i in range(len(Kmatrix)): | ||||
| for j in range(i, len(Kmatrix)): | for j in range(i, len(Kmatrix)): | ||||
| @@ -24,7 +24,7 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker | |||||
| n_jobs = multiprocessing.cpu_count() | n_jobs = multiprocessing.cpu_count() | ||||
| with Pool(processes=n_jobs, initializer=init_worker, | with Pool(processes=n_jobs, initializer=init_worker, | ||||
| initargs=glbv) as pool: | initargs=glbv) as pool: | ||||
| if chunksize == None: | |||||
| if chunksize is None: | |||||
| if len_itr < 100 * n_jobs: | if len_itr < 100 * n_jobs: | ||||
| chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
| else: | else: | ||||
| @@ -39,7 +39,7 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker | |||||
| if n_jobs == None: | if n_jobs == None: | ||||
| n_jobs = multiprocessing.cpu_count() | n_jobs = multiprocessing.cpu_count() | ||||
| with Pool(processes=n_jobs) as pool: | with Pool(processes=n_jobs) as pool: | ||||
| if chunksize == None: | |||||
| if chunksize is None: | |||||
| if len_itr < 100 * n_jobs: | if len_itr < 100 * n_jobs: | ||||
| chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
| else: | else: | ||||
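The switch to 'is None' is not purely stylistic: '== None' dispatches to the operand's __eq__, which for array-like chunksize values does not return a plain bool. A quick illustration:

import numpy as np

chunksize = np.array([1, 2])
print(chunksize is None)   # False: identity test, always a plain bool
print(chunksize == None)   # array([False, False]): elementwise comparison
# 'if chunksize == None:' would raise "truth value of an array is ambiguous".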