| @@ -79,3 +79,9 @@ outputs/ | |||
| # pyCharm. | |||
| .idea/ | |||
| # tests. | |||
| gklearn/tests/datasets/ | |||
| # Experiments. | |||
| gklearn/experiments/datasets/ | |||
| @@ -1,5 +1,5 @@ | |||
| # graphkit-learn | |||
| [](https://travis-ci.org/jajupmochi/graphkit-learn) | |||
| [](https://travis-ci.com/jajupmochi/graphkit-learn) | |||
| [](https://ci.appveyor.com/project/jajupmochi/graphkit-learn) | |||
| [](https://codecov.io/gh/jajupmochi/graphkit-learn) | |||
| [](https://graphkit-learn.readthedocs.io/en/master/?badge=master) | |||
| @@ -68,7 +68,7 @@ The docs of the library can be found [here](https://graphkit-learn.readthedocs.i | |||
| * [The common walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/common_walk.py) [1] | |||
| * Exponential | |||
| * Geometric | |||
| * [The marginalized kenrel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py) | |||
| * [The marginalized kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py) | |||
| * With tottering [2] | |||
| * Without tottering [7] | |||
| * [The generalized random walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/random_walk.py) [3] | |||
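The kernels listed above share one interface. As a minimal sketch (not the library's documented example), assuming the `Marginalized` class and the `compute()` method shown later in this diff, with toy graphs and constructor parameter names that are assumptions:

import networkx as nx
from gklearn.kernels import Marginalized

# Toy labeled graphs; the 'atom' node label is a made-up example.
g1 = nx.path_graph(4)
g2 = nx.cycle_graph(4)
nx.set_node_attributes(g1, '0', 'atom')
nx.set_node_attributes(g2, '0', 'atom')

# Parameter names below are assumptions about the Marginalized constructor.
kernel = Marginalized(node_labels=['atom'], edge_labels=[], p_quit=0.3,
                      n_iteration=5, remove_totters=False)
gram_matrix, run_time = kernel.compute([g1, g2], parallel=None, verbose=0)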
| @@ -40,6 +40,7 @@ class Dataset(object): | |||
| self._edge_attr_dim = None | |||
| self._class_number = None | |||
| self._ds_name = None | |||
| self._task_type = None | |||
| if inputs is None: | |||
| self._graphs = None | |||
| @@ -117,11 +118,16 @@ class Dataset(object): | |||
| ds_file = [os.path.join(path, fn) for fn in load_files[0]] | |||
| fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None | |||
| # Get extra_params. | |||
| if 'extra_params' in DATASET_META[ds_name]: | |||
| kwargs = DATASET_META[ds_name]['extra_params'] | |||
| else: | |||
| kwargs = {} | |||
| # Get the task type that is associated with the dataset. If it is classification, get the number of classes. | |||
| self._get_task_type(ds_name) | |||
| self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets, **kwargs).data | |||
| self._node_labels = label_names['node_labels'] | |||
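The lookup above assumes each `DATASET_META` entry may carry optional `extra_params`, `task_type`, and `class_number` fields. A hypothetical entry consistent with this code (real entries in gklearn may contain more or different fields):

# Hypothetical DATASET_META entry, illustrating only the keys this code reads.
DATASET_META_EXAMPLE = {
    'MUTAG': {
        'task_type': 'classification',   # read by _get_task_type()
        'class_number': 2,               # copied when task_type is classification
        'extra_params': {},              # forwarded to DataLoader as **kwargs
    },
}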
| @@ -276,7 +282,8 @@ class Dataset(object): | |||
| 'edge_attr_dim', | |||
| 'class_number', | |||
| 'all_degree_entropy', | |||
| 'ave_degree_entropy' | |||
| 'ave_degree_entropy', | |||
| 'class_type' | |||
| ] | |||
| # dataset size | |||
| @@ -408,7 +415,7 @@ class Dataset(object): | |||
| if 'class_number' in keys: | |||
| if self._class_number is None: | |||
| self._class_number = self._get_class_number() | |||
| self._class_number = self._get_class_num() | |||
| infos['class_number'] = self._class_number | |||
| if 'node_attr_dim' in keys: | |||
| @@ -437,6 +444,11 @@ class Dataset(object): | |||
| base = None | |||
| infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base)) | |||
| if 'task_type' in keys: | |||
| if self._task_type is None: | |||
| self._task_type = self._get_task_type() | |||
| infos['task_type'] = self._task_type | |||
| return infos | |||
| @@ -790,6 +802,13 @@ class Dataset(object): | |||
| return degree_entropy | |||
| def _get_task_type(self, ds_name): | |||
| if 'task_type' in DATASET_META[ds_name]: | |||
| self._task_type = DATASET_META[ds_name]['task_type'] | |||
| if self._task_type == 'classification' and self._class_number is None and 'class_number' in DATASET_META[ds_name]: | |||
| self._class_number = DATASET_META[ds_name]['class_number'] | |||
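With an entry like the one sketched earlier, loading a named dataset would populate the new attributes, which can then be reported alongside the other dataset statistics. A usage sketch, assuming the `Dataset` constructor accepts a dataset name and that the info method and key names behave as the surrounding hunks suggest:

# Sketch only; constructor/method behavior is assumed from the hunks above.
from gklearn.dataset import Dataset

ds = Dataset('MUTAG')
infos = ds.get_dataset_infos(keys=['task_type', 'class_number'])
print(infos)  # e.g. {'task_type': 'classification', 'class_number': 2}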
| @property | |||
| def graphs(self): | |||
| return self._graphs | |||
| @@ -13,7 +13,7 @@ import pickle | |||
| import logging | |||
| from gklearn.ged.util import compute_geds | |||
| import time | |||
| from utils import get_dataset, set_edit_cost_consts | |||
| from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation | |||
| import sys | |||
| from group_results import group_trials, check_group_existence, update_group_marker | |||
| @@ -37,7 +37,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
| # the distance between non-symbolic node/edge labels is computed by euclidean distance. | |||
| 'attr_distance': 'euclidean', | |||
| 'ratio_runs_from_initial_solutions': 0.25, | |||
| # parallel threads. Do not work if mpg_options['parallel'] = False. | |||
| # parallel threads. Set to 1 automatically if parallel=True in compute_geds(). | |||
| 'threads': multiprocessing.cpu_count(), | |||
| 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
| } | |||
| @@ -98,7 +98,7 @@ def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||
| ged_mats.append(ged_mat) | |||
| runtimes.append(runtime) | |||
| # Group trials and Remove single files. | |||
| # Group trials and remove single files. | |||
| # @todo: if the program stops between the following lines, then there may be errors. | |||
| name_prefix = 'ged_matrix' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) | |||
| @@ -111,21 +111,25 @@ def results_for_a_dataset(ds_name): | |||
| """**1. Get dataset.**""" | |||
| dataset = get_dataset(ds_name) | |||
| for ratio in ratio_list: | |||
| for params in list(param_grid): | |||
| print() | |||
| print('Ratio:', ratio) | |||
| for num_solutions in num_solutions_list: | |||
| print() | |||
| print('# of solutions:', num_solutions) | |||
| save_trials_as_group(dataset, ds_name, num_solutions, ratio) | |||
| print(params) | |||
| save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio']) | |||
| def get_param_lists(ds_name, test=False): | |||
| if test: | |||
| num_solutions_list = [1, 10, 20, 30, 40, 50] | |||
| def get_param_lists(ds_name, mode='test'): | |||
| if mode == 'test': | |||
| num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] | |||
| ratio_list = [10] | |||
| return num_solutions_list, ratio_list | |||
| elif mode == 'simple': | |||
| from sklearn.model_selection import ParameterGrid | |||
| param_grid = ParameterGrid([ | |||
| {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]}, | |||
| {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]) | |||
| # print(list(param_grid)) | |||
| if ds_name == 'AIDS_symb': | |||
| num_solutions_list = [1, 20, 40, 60, 80, 100] | |||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
| @@ -133,7 +137,7 @@ def get_param_lists(ds_name, test=False): | |||
| num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100] | |||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1] | |||
| return num_solutions_list, ratio_list | |||
| return param_grid | |||
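`ParameterGrid` expands the two sub-grids above into a flat sequence of `{'num_solutions': ..., 'ratio': ...}` dicts, which `results_for_a_dataset()` then iterates. A standalone illustration with shortened value lists:

from sklearn.model_selection import ParameterGrid

param_grid = ParameterGrid([
    {'num_solutions': [1, 2], 'ratio': [10]},
    {'num_solutions': [10], 'ratio': [0.1, 0.3]},
])
for params in param_grid:
    print(params['num_solutions'], params['ratio'])
# Yields the four combinations (1, 10), (2, 10), (10, 0.1), (10, 0.3);
# the iteration order may differ.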
| if __name__ == '__main__': | |||
| @@ -141,7 +145,7 @@ if __name__ == '__main__': | |||
| ds_name_list = sys.argv[1:] | |||
| else: | |||
| ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] | |||
| # ds_name_list = ['Acyclic'] # 'Alkane_unlabeled'] | |||
| # ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] | |||
| # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
| save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' | |||
| @@ -151,5 +155,5 @@ if __name__ == '__main__': | |||
| for ds_name in ds_name_list: | |||
| print() | |||
| print('Dataset:', ds_name) | |||
| num_solutions_list, ratio_list = get_param_lists(ds_name, test=False) | |||
| param_grid = get_param_lists(ds_name, mode='simple') | |||
| results_for_a_dataset(ds_name) | |||
| @@ -16,12 +16,12 @@ from gklearn.experiments import DATASET_ROOT | |||
| def get_dataset(ds_name): | |||
| # The node/edge labels that will not be used in the computation. | |||
| # if ds_name == 'MAO': | |||
| # irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||
| # if ds_name == 'Monoterpenoides': | |||
| # irrelevant_labels = {'edge_labels': ['valence']} | |||
| # elif ds_name == 'MUTAG': | |||
| # irrelevant_labels = {'edge_labels': ['label_0']} | |||
| # if ds_name == 'MAO': | |||
| # irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||
| # if ds_name == 'Monoterpenoides': | |||
| # irrelevant_labels = {'edge_labels': ['valence']} | |||
| # elif ds_name == 'MUTAG': | |||
| # irrelevant_labels = {'edge_labels': ['label_0']} | |||
| if ds_name == 'AIDS_symb': | |||
| irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']} | |||
| ds_name = 'AIDS' | |||
| @@ -49,34 +49,36 @@ def set_edit_cost_consts(ratio, node_labeled=True, edge_labeled=True, mode='unif | |||
| def nested_keys_exists(element, *keys): | |||
| ''' | |||
| Check if *keys (nested) exists in `element` (dict). | |||
| ''' | |||
| if not isinstance(element, dict): | |||
| raise AttributeError('keys_exists() expects dict as first argument.') | |||
| if len(keys) == 0: | |||
| raise AttributeError('keys_exists() expects at least two arguments, one given.') | |||
| _element = element | |||
| for key in keys: | |||
| try: | |||
| _element = _element[key] | |||
| except KeyError: | |||
| return False | |||
| return True | |||
| ''' | |||
| Check if *keys (nested) exists in `element` (dict). | |||
| ''' | |||
| if not isinstance(element, dict): | |||
| raise AttributeError('keys_exists() expects dict as first argument.') | |||
| if len(keys) == 0: | |||
| raise AttributeError('keys_exists() expects at least two arguments, one given.') | |||
| _element = element | |||
| for key in keys: | |||
| try: | |||
| _element = _element[key] | |||
| except KeyError: | |||
| return False | |||
| return True | |||
| # Check average relative error along elements in two ged matrices. | |||
| def matrices_ave_relative_error(m1, m2): | |||
| error = 0 | |||
| base = 0 | |||
| for i in range(m1.shape[0]): | |||
| for j in range(m1.shape[1]): | |||
| error += np.abs(m1[i, j] - m2[i, j]) | |||
| base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2 | |||
| error = 0 | |||
| base = 0 | |||
| for i in range(m1.shape[0]): | |||
| for j in range(m1.shape[1]): | |||
| error += np.abs(m1[i, j] - m2[i, j]) | |||
| # base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) | |||
| base += (m1[i, j] + m2[i, j]) # Requires only about 25% of the time of "base += (np.abs(m1[i, j]) + np.abs(m2[i, j]))". | |||
| return error / base | |||
| base = base / 2 | |||
| return error / base | |||
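The function sums the absolute element-wise differences and divides by half the sum of all entries of both matrices; dropping `np.abs()` on the base term is safe only because GED matrices are non-negative. A standalone numeric check of the same formula:

import numpy as np

def ave_relative_error(m1, m2):
    # Same formula as matrices_ave_relative_error, vectorized for clarity.
    error = np.sum(np.abs(m1 - m2))
    base = np.sum(m1 + m2) / 2
    return error / base

m1 = np.array([[0., 2.], [2., 0.]])
m2 = np.array([[0., 3.], [1., 0.]])
print(ave_relative_error(m1, m2))  # 2.0 / 4.0 = 0.5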
| def compute_relative_error(ged_mats): | |||
| @@ -92,9 +94,9 @@ def compute_relative_error(ged_mats): | |||
| errors = [] | |||
| for i, mat in enumerate(ged_mats): | |||
| err = matrices_ave_relative_error(mat, ged_mat_s) | |||
| # if not per_correct: | |||
| # print('matrix # ', str(i)) | |||
| # pass | |||
| # if not per_correct: | |||
| # print('matrix # ', str(i)) | |||
| # pass | |||
| errors.append(err) | |||
| else: | |||
| errors = [0] | |||
| @@ -107,11 +109,11 @@ def parse_group_file_name(fn): | |||
| key1 = splits_all[1] | |||
| pos2 = splits_all[2].rfind('_') | |||
| # key2 = splits_all[2][:pos2] | |||
| # key2 = splits_all[2][:pos2] | |||
| val2 = splits_all[2][pos2+1:] | |||
| pos3 = splits_all[3].rfind('_') | |||
| # key3 = splits_all[3][:pos3] | |||
| # key3 = splits_all[3][:pos3] | |||
| val3 = splits_all[3][pos3+1:] + '.' + splits_all[4] | |||
| return key1, val2, val3 | |||
| @@ -232,7 +234,7 @@ def set_axis_style(ax): | |||
| ax.tick_params(labelsize=8, color='w', pad=1, grid_color='w') | |||
| ax.tick_params(axis='x', pad=-2) | |||
| ax.tick_params(axis='y', labelrotation=-40, pad=-2) | |||
| # ax.zaxis._axinfo['juggled'] = (1, 2, 0) | |||
| # ax.zaxis._axinfo['juggled'] = (1, 2, 0) | |||
| ax.set_xlabel(ax.get_xlabel(), fontsize=10, labelpad=-3) | |||
| ax.set_ylabel(ax.get_ylabel(), fontsize=10, labelpad=-2, rotation=50) | |||
| ax.set_zlabel(ax.get_zlabel(), fontsize=10, labelpad=-2) | |||
| @@ -240,16 +242,99 @@ def set_axis_style(ax): | |||
| return | |||
| def dichotomous_permutation(arr, layer=0): | |||
| import math | |||
| # def seperate_arr(arr, new_arr): | |||
| # if (length % 2) == 0: | |||
| # half = int(length / 2) | |||
| # new_arr += [arr[half - 1], arr[half]] | |||
| # subarr1 = [arr[i] for i in range(1, half - 1)] | |||
| # else: | |||
| # half = math.floor(length / 2) | |||
| # new_arr.append(arr[half]) | |||
| # subarr1 = [arr[i] for i in range(1, half)] | |||
| # subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
| # subarrs = [subarr1, subarr2] | |||
| # return subarrs | |||
| if layer == 0: | |||
| length = len(arr) | |||
| if length <= 2: | |||
| return arr | |||
| new_arr = [arr[0], arr[-1]] | |||
| if (length % 2) == 0: | |||
| half = int(length / 2) | |||
| new_arr += [arr[half - 1], arr[half]] | |||
| subarr1 = [arr[i] for i in range(1, half - 1)] | |||
| else: | |||
| half = math.floor(length / 2) | |||
| new_arr.append(arr[half]) | |||
| subarr1 = [arr[i] for i in range(1, half)] | |||
| subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
| subarrs = [subarr1, subarr2] | |||
| # subarrs = seperate_arr(arr, new_arr) | |||
| new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||
| else: | |||
| new_arr = [] | |||
| subarrs = [] | |||
| for a in arr: | |||
| length = len(a) | |||
| if length <= 2: | |||
| new_arr += a | |||
| else: | |||
| # subarrs += seperate_arr(a, new_arr) | |||
| if (length % 2) == 0: | |||
| half = int(length / 2) | |||
| new_arr += [a[half - 1], a[half]] | |||
| subarr1 = [a[i] for i in range(0, half - 1)] | |||
| else: | |||
| half = math.floor(length / 2) | |||
| new_arr.append(a[half]) | |||
| subarr1 = [a[i] for i in range(0, half)] | |||
| subarr2 = [a[i] for i in range(half + 1, length)] | |||
| subarrs += [subarr1, subarr2] | |||
| if len(subarrs) > 0: | |||
| new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||
| return new_arr | |||
| # length = len(arr) | |||
| # if length <= 2: | |||
| # return arr | |||
| # new_arr = [arr[0], arr[-1]] | |||
| # if (length % 2) == 0: | |||
| # half = int(length / 2) | |||
| # new_arr += [arr[half - 1], arr[half]] | |||
| # subarr1 = [arr[i] for i in range(1, half - 1)] | |||
| # else: | |||
| # half = math.floor(length / 2) | |||
| # new_arr.append(arr[half]) | |||
| # subarr1 = [arr[i] for i in range(1, half)] | |||
| # subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
| # if len(subarr1) > 0: | |||
| # new_arr += dichotomous_permutation(subarr1) | |||
| # if len(subarr2) > 0: | |||
| # new_arr += dichotomous_permutation(subarr2) | |||
| # return new_arr | |||
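`dichotomous_permutation` reorders a parameter list so the endpoints come first, followed by the midpoints of successively halved sub-ranges; long sweeps therefore probe the extremes and the middle of the range early. A small usage sketch (output traced by hand for this input):

# Endpoints first, then mid-points of the remaining halves.
print(dichotomous_permutation([1, 2, 3, 4, 5]))
# -> [1, 5, 3, 2, 4]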
| if __name__ == '__main__': | |||
| root_dir = 'outputs/CRIANN/' | |||
| # for dir_ in sorted(os.listdir(root_dir)): | |||
| # if os.path.isdir(root_dir): | |||
| # full_dir = os.path.join(root_dir, dir_) | |||
| # print('---', full_dir,':') | |||
| # save_dir = os.path.join(full_dir, 'groups/') | |||
| # if os.path.exists(save_dir): | |||
| # try: | |||
| # get_relative_errors(save_dir) | |||
| # except Exception as exp: | |||
| # print('An exception occured when running this experiment:') | |||
| # print(repr(exp)) | |||
| # for dir_ in sorted(os.listdir(root_dir)): | |||
| # if os.path.isdir(root_dir): | |||
| # full_dir = os.path.join(root_dir, dir_) | |||
| # print('---', full_dir,':') | |||
| # save_dir = os.path.join(full_dir, 'groups/') | |||
| # if os.path.exists(save_dir): | |||
| # try: | |||
| # get_relative_errors(save_dir) | |||
| # except Exception as exp: | |||
| # print('An exception occurred when running this experiment:') | |||
| # print(repr(exp)) | |||
| @@ -0,0 +1,29 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Tue Jan 26 09:53:33 2021 | |||
| @author: ljia | |||
| """ | |||
| if __name__ == '__main__': | |||
| tasks = [ | |||
| {'path': 'thesis/graph_kernels/fcsp', | |||
| 'file': 'run_jobs_compare_fcsp.py' | |||
| }, | |||
| {'path': 'thesis/graph_kernels/fcsp', | |||
| 'file': 'run_jobs_compare_fcsp_space.py' | |||
| }, | |||
| {'path': 'ged/stability', | |||
| 'file': 'run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py' | |||
| }, | |||
| ] | |||
| import os | |||
| for t in tasks: | |||
| print(t['file']) | |||
| command = '' | |||
| command += 'cd ' + t['path'] + '\n' | |||
| command += 'python3 ' + t['file'] + '\n' | |||
| # command += 'cd ' + '/'.join(['..'] * len(t['path'].split('/'))) + '\n' | |||
| os.system(command) | |||
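`os.system` here relies on the shell executing the `cd` line and the `python3` line in one subshell. An alternative sketch using `subprocess.run` with `cwd`, over the same `tasks` list defined above, avoids building shell strings and exposes the return code:

import subprocess

for t in tasks:
    print(t['file'])
    # Run the job script from its own directory.
    result = subprocess.run(['python3', t['file']], cwd=t['path'])
    if result.returncode != 0:
        print('Job failed with return code', result.returncode)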
| @@ -19,7 +19,15 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), | |||
| ('StructuralSP', 'NCI1', 'False'), | |||
| ('ShortestPath', 'NCI109', 'False'), | |||
| ('StructuralSP', 'NCI109', 'True'), | |||
| ('ShortestPath', 'NCI-H23', 'True'), | |||
| ('ShortestPath', 'NCI-H23', 'False'), | |||
| ('StructuralSP', 'NCI-H23', 'True'), | |||
| ('StructuralSP', 'NCI-H23', 'False'), | |||
| ('StructuralSP', 'NCI109', 'False'), | |||
| ('ShortestPath', 'NCI-H23H', 'True'), | |||
| ('ShortestPath', 'NCI-H23H', 'False'), | |||
| ('StructuralSP', 'NCI-H23H', 'True'), | |||
| ('StructuralSP', 'NCI-H23H', 'False'), | |||
| ('ShortestPath', 'DD', 'True'), | |||
| ('ShortestPath', 'DD', 'False'), | |||
| ('StructuralSP', 'BZR', 'False'), | |||
| @@ -27,9 +35,37 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), | |||
| ('StructuralSP', 'COX2', 'False'), | |||
| ('ShortestPath', 'DHFR', 'False'), | |||
| ('StructuralSP', 'DHFR', 'False'), | |||
| ('ShortestPath', 'MCF-7', 'True'), | |||
| ('ShortestPath', 'MCF-7', 'False'), | |||
| ('StructuralSP', 'MCF-7', 'True'), | |||
| ('StructuralSP', 'MCF-7', 'False'), | |||
| ('ShortestPath', 'MCF-7H', 'True'), | |||
| ('ShortestPath', 'MCF-7H', 'False'), | |||
| ('StructuralSP', 'MCF-7H', 'True'), | |||
| ('StructuralSP', 'MCF-7H', 'False'), | |||
| ('ShortestPath', 'MOLT-4', 'True'), | |||
| ('ShortestPath', 'MOLT-4', 'False'), | |||
| ('StructuralSP', 'MOLT-4', 'True'), | |||
| ('StructuralSP', 'MOLT-4', 'False'), | |||
| ('ShortestPath', 'MOLT-4H', 'True'), | |||
| ('ShortestPath', 'MOLT-4H', 'False'), | |||
| ('StructuralSP', 'MOLT-4H', 'True'), | |||
| ('StructuralSP', 'MOLT-4H', 'False'), | |||
| ('StructuralSP', 'OHSU', 'True'), | |||
| ('StructuralSP', 'OHSU', 'False'), | |||
| ('StructuralSP', 'SYNTHETIC', 'False'), | |||
| ('ShortestPath', 'OVCAR-8', 'True'), | |||
| ('ShortestPath', 'OVCAR-8', 'False'), | |||
| ('StructuralSP', 'OVCAR-8', 'True'), | |||
| ('StructuralSP', 'OVCAR-8', 'False'), | |||
| ('ShortestPath', 'OVCAR-8H', 'True'), | |||
| ('ShortestPath', 'OVCAR-8H', 'False'), | |||
| ('StructuralSP', 'OVCAR-8H', 'True'), | |||
| ('StructuralSP', 'OVCAR-8H', 'False'), | |||
| ('ShortestPath', 'P388', 'False'), | |||
| ('ShortestPath', 'P388', 'True'), | |||
| ('StructuralSP', 'P388', 'True'), | |||
| ('StructuralSP', 'Steroid', 'False'), | |||
| ('ShortestPath', 'SYNTHETIC', 'False'), | |||
| ('StructuralSP', 'SYNTHETIC', 'True'), | |||
| ('StructuralSP', 'SYNTHETIC', 'False'), | |||
| ('ShortestPath', 'SYNTHETICnew', 'False'), | |||
| @@ -47,6 +83,9 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), | |||
| ('StructuralSP', 'Mutagenicity', 'False'), | |||
| ('StructuralSP', 'REDDIT-BINARY', 'True'), | |||
| ('StructuralSP', 'REDDIT-BINARY', 'False'), | |||
| ('StructuralSP', 'Vitamin_D', 'False'), | |||
| ('ShortestPath', 'Web', 'True'), | |||
| ('ShortestPath', 'Web', 'False'), | |||
| }) | |||
| OUT_MEM_LIST = set({('StructuralSP', 'DD', 'True'), | |||
| @@ -17,6 +17,7 @@ OUT_TIME_LIST = [] | |||
| OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | |||
| ('ShortestPath', 'REDDIT-BINARY', 'False'), | |||
| ('StructuralSP', 'ENZYMES', 'False'), | |||
| ('StructuralSP', 'AIDS', 'False'), | |||
| ('ShortestPath', 'DD', 'True'), | |||
| ('ShortestPath', 'DD', 'False'), | |||
| ('StructuralSP', 'DD', 'True'), | |||
| @@ -55,6 +56,7 @@ OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | |||
| ('ShortestPath', 'P388H', 'False'), | |||
| ('StructuralSP', 'P388H', 'True'), | |||
| ('StructuralSP', 'P388H', 'False'), | |||
| ('StructuralSP', 'NCI1', 'False'), | |||
| ('ShortestPath', 'NCI-H23', 'True'), | |||
| ('ShortestPath', 'NCI-H23', 'False'), | |||
| ('StructuralSP', 'NCI-H23', 'True'), | |||
| @@ -63,6 +65,7 @@ OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | |||
| ('ShortestPath', 'NCI-H23H', 'False'), | |||
| ('StructuralSP', 'NCI-H23H', 'True'), | |||
| ('StructuralSP', 'NCI-H23H', 'False'), | |||
| ('StructuralSP', 'OHSU', 'False'), | |||
| ('ShortestPath', 'OVCAR-8', 'True'), | |||
| ('ShortestPath', 'OVCAR-8', 'False'), | |||
| ('StructuralSP', 'OVCAR-8', 'True'), | |||
| @@ -208,11 +211,12 @@ def check_task_status(save_dir, *params): | |||
| # Check if the task is already computed. | |||
| file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl') | |||
| if os.path.isfile(file_name): | |||
| with open(file_name, 'rb') as f: | |||
| data = pickle.load(f) | |||
| if data['completed']: | |||
| return True | |||
| if os.path.getsize(file_name) > 0: | |||
| if os.path.isfile(file_name): | |||
| with open(file_name, 'rb') as f: | |||
| data = pickle.load(f) | |||
| if data['completed']: | |||
| return True | |||
| return False | |||
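The added guard skips unpickling zero-byte files left behind by interrupted runs. A more defensive variant (a sketch, not the project's code) would also treat truncated or unreadable pickles as missing results:

import os
import pickle

def is_task_completed(file_name):
    # Treat missing, empty, or unreadable pickle files as "not completed".
    if not os.path.isfile(file_name) or os.path.getsize(file_name) == 0:
        return False
    try:
        with open(file_name, 'rb') as f:
            data = pickle.load(f)
    except (EOFError, pickle.UnpicklingError):
        return False
    return bool(data.get('completed', False))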
| @@ -7,7 +7,6 @@ __version__ = "0.1" | |||
| __author__ = "Linlin Jia" | |||
| __date__ = "November 2018" | |||
| from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels | |||
| from gklearn.kernels.graph_kernel import GraphKernel | |||
| from gklearn.kernels.common_walk import CommonWalk | |||
| @@ -24,6 +23,8 @@ from gklearn.kernels.path_up_to_h import PathUpToH | |||
| from gklearn.kernels.treelet import Treelet | |||
| from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman, WLSubtree | |||
| from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels | |||
| # old version. | |||
| from gklearn.kernels.commonWalkKernel import commonwalkkernel | |||
| from gklearn.kernels.marginalizedKernel import marginalizedkernel | |||
| @@ -32,4 +33,4 @@ from gklearn.kernels.spKernel import spkernel | |||
| from gklearn.kernels.structuralspKernel import structuralspkernel | |||
| from gklearn.kernels.untilHPathKernel import untilhpathkernel | |||
| from gklearn.kernels.treeletKernel import treeletkernel | |||
| from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||
| from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||
| @@ -47,7 +47,7 @@ class CommonWalk(GraphKernel): | |||
| itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
| length=len_itr, verbose=(self._verbose >= 2)) | |||
| length=len_itr, verbose=(self.verbose >= 2)) | |||
| # direct product graph method - exponential | |||
| if self._compute_method == 'exp': | |||
| @@ -86,7 +86,7 @@ class CommonWalk(GraphKernel): | |||
| do_fun = self._wrapper_kernel_do_geo | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm, | |||
| glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| return gram_matrix | |||
| @@ -100,9 +100,9 @@ class CommonWalk(GraphKernel): | |||
| # compute kernel list. | |||
| kernel_list = [None] * len(g_list) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', | |||
| file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
| file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
| else: | |||
| iterator = range(len(g_list)) | |||
| @@ -148,7 +148,7 @@ class CommonWalk(GraphKernel): | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered', | |||
| n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| return kernel_list | |||
| @@ -35,7 +35,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| def _compute_gm_series(self): | |||
| self._check_edge_weight(self._graphs, self._verbose) | |||
| self._check_edge_weight(self._graphs, self.verbose) | |||
| self._check_graphs(self._graphs) | |||
| lmda = self._weight | |||
| @@ -44,7 +44,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
| # Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
| iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
| if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
| @@ -52,7 +52,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| from itertools import combinations_with_replacement | |||
| itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
| for i, j in iterator: | |||
| kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) | |||
| @@ -66,7 +66,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| def _compute_gm_imap_unordered(self): | |||
| self._check_edge_weight(self._graphs, self._verbose) | |||
| self._check_edge_weight(self._graphs, self.verbose) | |||
| self._check_graphs(self._graphs) | |||
| # Compute Gram matrix. | |||
| @@ -74,7 +74,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| # @todo: parallel this. | |||
| # Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
| iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
| if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
| @@ -86,7 +86,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| do_fun = self._wrapper_kernel_do | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
| glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| else: # @todo | |||
| pass | |||
| @@ -95,7 +95,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| def _compute_kernel_list_series(self, g1, g_list): | |||
| self._check_edge_weight(g_list + [g1], self._verbose) | |||
| self._check_edge_weight(g_list + [g1], self.verbose) | |||
| self._check_graphs(g_list + [g1]) | |||
| lmda = self._weight | |||
| @@ -105,11 +105,11 @@ class ConjugateGradient(RandomWalkMeta): | |||
| # Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
| g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | |||
| iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
| if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i in iterator: | |||
| kernel = self._kernel_do(g1, g_list[i], lmda) | |||
| @@ -122,7 +122,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||
| self._check_edge_weight(g_list + [g1], self._verbose) | |||
| self._check_edge_weight(g_list + [g1], self.verbose) | |||
| self._check_graphs(g_list + [g1]) | |||
| # compute kernel list. | |||
| @@ -131,7 +131,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| # Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
| g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | |||
| # @todo: parallel this. | |||
| iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
| if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
| @@ -149,7 +149,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | |||
| n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| else: # @todo | |||
| pass | |||
| @@ -162,7 +162,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
| def _compute_single_kernel_series(self, g1, g2): | |||
| self._check_edge_weight([g1] + [g2], self._verbose) | |||
| self._check_edge_weight([g1] + [g2], self.verbose) | |||
| self._check_graphs([g1] + [g2]) | |||
| lmda = self._weight | |||
| @@ -35,7 +35,7 @@ class FixedPoint(RandomWalkMeta): | |||
| def _compute_gm_series(self): | |||
| self._check_edge_weight(self._graphs, self._verbose) | |||
| self._check_edge_weight(self._graphs, self.verbose) | |||
| self._check_graphs(self._graphs) | |||
| lmda = self._weight | |||
| @@ -44,7 +44,7 @@ class FixedPoint(RandomWalkMeta): | |||
| gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
| # Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
| iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self.verbose >= 2)) | |||
| self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
| if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
| @@ -52,7 +52,7 @@ class FixedPoint(RandomWalkMeta): | |||
| from itertools import combinations_with_replacement | |||
| itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
| for i, j in iterator: | |||
| kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) | |||
| @@ -66,7 +66,7 @@ class FixedPoint(RandomWalkMeta): | |||
| def _compute_gm_imap_unordered(self): | |||
| self._check_edge_weight(self._graphs, self._verbose) | |||
| self._check_edge_weight(self._graphs, self.verbose) | |||
| self._check_graphs(self._graphs) | |||
| # Compute Gram matrix. | |||
| @@ -74,7 +74,7 @@ class FixedPoint(RandomWalkMeta): | |||
| # @todo: parallel this. | |||
| # Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
| iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
| if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
| @@ -86,7 +86,7 @@ class FixedPoint(RandomWalkMeta): | |||
| do_fun = self._wrapper_kernel_do | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
| glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| else: # @todo | |||
| pass | |||
| @@ -95,7 +95,7 @@ class FixedPoint(RandomWalkMeta): | |||
| def _compute_kernel_list_series(self, g1, g_list): | |||
| self._check_edge_weight(g_list + [g1], self._verbose) | |||
| self._check_edge_weight(g_list + [g1], self.verbose) | |||
| self._check_graphs(g_list + [g1]) | |||
| lmda = self._weight | |||
| @@ -105,12 +105,12 @@ class FixedPoint(RandomWalkMeta): | |||
| # Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
| g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | |||
| iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
| if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i in iterator: | |||
| kernel = self._kernel_do(g1, g_list[i], lmda) | |||
| @@ -123,7 +123,7 @@ class FixedPoint(RandomWalkMeta): | |||
| def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||
| self._check_edge_weight(g_list + [g1], self._verbose) | |||
| self._check_edge_weight(g_list + [g1], self.verbose) | |||
| self._check_graphs(g_list + [g1]) | |||
| # compute kernel list. | |||
| @@ -132,7 +132,7 @@ class FixedPoint(RandomWalkMeta): | |||
| # Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
| g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | |||
| # @todo: parallel this. | |||
| iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
| if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
| @@ -150,7 +150,7 @@ class FixedPoint(RandomWalkMeta): | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | |||
| n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| else: # @todo | |||
| pass | |||
| @@ -163,7 +163,7 @@ class FixedPoint(RandomWalkMeta): | |||
| def _compute_single_kernel_series(self, g1, g2): | |||
| self._check_edge_weight([g1] + [g2], self._verbose) | |||
| self._check_edge_weight([g1] + [g2], self.verbose) | |||
| self._check_graphs([g1] + [g2]) | |||
| lmda = self._weight | |||
| @@ -9,55 +9,433 @@ import numpy as np | |||
| import networkx as nx | |||
| import multiprocessing | |||
| import time | |||
| # from abc import ABC, abstractmethod | |||
| from sklearn.base import BaseEstimator # , TransformerMixin | |||
| from sklearn.utils.validation import check_is_fitted # check_X_y, check_array, | |||
| from sklearn.exceptions import NotFittedError | |||
| from gklearn.utils import normalize_gram_matrix | |||
| class GraphKernel(object): | |||
| class GraphKernel(BaseEstimator): #, ABC): | |||
| """The basic graph kernel class. | |||
| def __init__(self): | |||
| self._graphs = None | |||
| self._parallel = '' | |||
| self._n_jobs = 0 | |||
| self._verbose = None | |||
| self._normalize = True | |||
| self._run_time = 0 | |||
| self._gram_matrix = None | |||
| self._gram_matrix_unnorm = None | |||
| Attributes | |||
| ---------- | |||
| _graphs : list | |||
| Stores the input graphs on fit input data. | |||
| Default format of the list objects is `NetworkX` graphs. | |||
| **We don't guarantee that the input graphs remain unchanged during the | |||
| computation.** | |||
| References | |||
| ---------- | |||
| https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. | |||
| """ | |||
| def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2): | |||
| """`__init__` for `GraphKernel` object.""" | |||
| # @todo: the default settings of the parameters are different from those in the self.compute method. | |||
| # self._graphs = None | |||
| self.parallel = parallel | |||
| self.n_jobs = n_jobs | |||
| self.chunksize = chunksize | |||
| self.normalize = normalize | |||
| self.verbose = verbose | |||
| # self._run_time = 0 | |||
| # self._gram_matrix = None | |||
| # self._gram_matrix_unnorm = None | |||
| def compute(self, *graphs, **kwargs): | |||
| self._parallel = kwargs.get('parallel', 'imap_unordered') | |||
| self._n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) | |||
| self._normalize = kwargs.get('normalize', True) | |||
| self._verbose = kwargs.get('verbose', 2) | |||
| ########################################################################## | |||
| # The following is the 1st paradigm to compute kernel matrix, which is | |||
| # compatible with `scikit-learn`. | |||
| # ------------------------------------------------------------------- | |||
| # Special thanks to the "GraKeL" library for providing an excellent template! | |||
| ########################################################################## | |||
| def fit(self, X, y=None): | |||
| """Fit a graph dataset for a transformer. | |||
| Parameters | |||
| ---------- | |||
| X : iterable | |||
| The input graphs to fit. | |||
| y : None, optional | |||
| There is no need for a target in a transformer, yet the `scikit-learn` | |||
| pipeline API requires this parameter. | |||
| Returns | |||
| ------- | |||
| object | |||
| Returns self. | |||
| """ | |||
| # self._is_tranformed = False | |||
| # Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used; | |||
| self.clear_attributes() | |||
| # Validate parameters for the transformer. | |||
| self.validate_parameters() | |||
| # Validate the input. | |||
| self._graphs = self.validate_input(X) | |||
| # self._X = X | |||
| # self._kernel = self._get_kernel_instance() | |||
| # Return the transformer. | |||
| return self | |||
| def transform(self, X): | |||
| """Compute the graph kernel matrix between given and fitted data. | |||
| Parameters | |||
| ---------- | |||
| X : list of graphs | |||
| The target graphs whose kernel values against the fitted graphs are computed. | |||
| Raises | |||
| ------ | |||
| ValueError | |||
| If the input graphs are invalid. | |||
| Returns | |||
| ------- | |||
| kernel_matrix : numpy array, shape = [len(X), len(self._graphs)] | |||
| The computed kernel matrix between X and the fitted graphs. | |||
| """ | |||
| # Check if method "fit" had been called. | |||
| check_is_fitted(self, '_graphs') | |||
| # Validate the input. | |||
| Y = self.validate_input(X) | |||
| # Transform: compute the graph kernel matrix. | |||
| kernel_matrix = self.compute_kernel_matrix(Y) | |||
| self._Y = Y | |||
| # The transformed flag must be set before the diagonals() call used for normalization. | |||
| self._is_transformed = True | |||
| if self.normalize: | |||
| X_diag, Y_diag = self.diagonals() | |||
| old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||
| try: | |||
| kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) | |||
| except: | |||
| raise | |||
| finally: | |||
| np.seterr(**old_settings) | |||
| return kernel_matrix | |||
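The normalization above divides each cross kernel value by the geometric mean of the two self-kernels, k_norm(x, y) = k(x, y) / sqrt(k(x, x) * k(y, y)). A standalone numeric illustration of that step (not the class code):

import numpy as np

kernel_matrix = np.array([[6., 2.]])  # k(y, x) for one target and two fitted graphs
Y_diag = np.array([9.])               # k(y, y)
X_diag = np.array([4., 16.])          # k(x, x)
kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag))
print(kernel_matrix)                  # [[1.         0.16666667]]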
| def fit_transform(self, X): | |||
| """Fit and transform: compute Gram matrix on the same data. | |||
| Parameters | |||
| ---------- | |||
| X : list of graphs | |||
| Input graphs. | |||
| Returns | |||
| ------- | |||
| gram_matrix : numpy array, shape = [len(X), len(X)] | |||
| The Gram matrix of X. | |||
| """ | |||
| self.fit(X) | |||
| # Transform: compute Gram matrix. | |||
| gram_matrix = self.compute_kernel_matrix() | |||
| # Normalize. | |||
| if self.normalize: | |||
| self._X_diag = np.diagonal(gram_matrix).copy() | |||
| old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||
| try: | |||
| gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) | |||
| except: | |||
| raise | |||
| finally: | |||
| np.seterr(**old_settings) | |||
| return gram_matrix | |||
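Together, `fit`, `transform`, and `fit_transform` give the scikit-learn-style paradigm. A usage sketch, assuming a concrete subclass such as `ShortestPath` whose constructor forwards these keyword arguments and which implements `pairwise_kernel` (neither is guaranteed at this point of the diff):

import networkx as nx
from gklearn.kernels import ShortestPath

train_graphs = [nx.path_graph(4), nx.cycle_graph(5)]  # toy graphs
test_graphs = [nx.star_graph(3)]
# Constructor arguments are assumptions about the concrete subclass.
kernel = ShortestPath(node_labels=[], node_attrs=[], parallel=None,
                      normalize=True, verbose=0)
gram_train = kernel.fit_transform(train_graphs)  # Gram matrix on the fitted data
k_test = kernel.transform(test_graphs)           # kernel between new and fitted data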
| def get_params(self): | |||
| pass | |||
| def set_params(self): | |||
| pass | |||
| def clear_attributes(self): | |||
| if hasattr(self, '_X_diag'): | |||
| delattr(self, '_X_diag') | |||
| if hasattr(self, '_graphs'): | |||
| delattr(self, '_graphs') | |||
| if hasattr(self, '_Y'): | |||
| delattr(self, '_Y') | |||
| if hasattr(self, '_run_time'): | |||
| delattr(self, '_run_time') | |||
| def validate_parameters(self): | |||
| """Validate all parameters for the transformer. | |||
| Returns | |||
| ------- | |||
| None. | |||
| """ | |||
| if self.parallel is not None and self.parallel != 'imap_unordered': | |||
| raise ValueError('Parallel mode is not set correctly.') | |||
| if self.parallel == 'imap_unordered' and self.n_jobs is None: | |||
| self.n_jobs = multiprocessing.cpu_count() | |||
| def validate_input(self, X): | |||
| """Validate the given input and raise errors if it is invalid. | |||
| Parameters | |||
| ---------- | |||
| X : list | |||
| The input to check. Should be a list of graphs. | |||
| Raises | |||
| ------ | |||
| ValueError | |||
| Raise if the input is not correct. | |||
| Returns | |||
| ------- | |||
| X : list | |||
| The input, a list of graphs. | |||
| """ | |||
| if X is None: | |||
| raise ValueError('Please add graphs before computing.') | |||
| elif not isinstance(X, list): | |||
| raise ValueError('Cannot detect graphs.') | |||
| elif len(X) == 0: | |||
| raise ValueError('The graph list given is empty. No computation will be performed.') | |||
| return X | |||
| def compute_kernel_matrix(self, Y=None): | |||
| """Compute the kernel matrix between a given target graphs (Y) and | |||
| the fitted graphs (X / self._graphs) or the Gram matrix for the fitted | |||
| graphs (X / self._graphs). | |||
| Parameters | |||
| ---------- | |||
| Y : list of graphs, optional | |||
| The target graphs. The default is None. If None, the kernel is computed | |||
| between X and itself. | |||
| Returns | |||
| ------- | |||
| kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||
| The computed kernel matrix. | |||
| """ | |||
| if Y is None: | |||
| # Compute Gram matrix for self._graphs (X). | |||
| kernel_matrix = self._compute_gram_matrix() | |||
| # self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||
| else: | |||
| # Compute kernel matrix between Y and self._graphs (X). | |||
| start_time = time.time() | |||
| if self.parallel == 'imap_unordered': | |||
| kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y) | |||
| elif self.parallel is None: | |||
| kernel_matrix = self._compute_kernel_matrix_series(Y) | |||
| self._run_time = time.time() - start_time | |||
| if self.verbose: | |||
| print('Kernel matrix of size (%d, %d) built in %s seconds.' | |||
| % (len(Y), len(self._graphs), self._run_time)) | |||
| return kernel_matrix | |||
| def _compute_kernel_matrix_series(self, Y): | |||
| """Compute the kernel matrix between a given target graphs (Y) and | |||
| the fitted graphs (X / self._graphs) without parallelization. | |||
| Parameters | |||
| ---------- | |||
| Y : list of graphs, optional | |||
| The target graphs. | |||
| Returns | |||
| ------- | |||
| kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||
| The computed kernel matrix. | |||
| """ | |||
| kernel_matrix = np.zeros((len(Y), len(self._graphs))) | |||
| for i_y, g_y in enumerate(Y): | |||
| for i_x, g_x in enumerate(self._graphs): | |||
| kernel_matrix[i_y, i_x] = self.pairwise_kernel(g_y, g_x) | |||
| return kernel_matrix | |||
| def _compute_kernel_matrix_imap_unordered(self, Y): | |||
| """Compute the kernel matrix between a given target graphs (Y) and | |||
| the fitted graphs (X / self._graphs) using imap unordered parallelization. | |||
| Parameters | |||
| ---------- | |||
| Y : list of graphs, optional | |||
| The target graphs. | |||
| Returns | |||
| ------- | |||
| kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||
| The computed kernel matrix. | |||
| """ | |||
| raise Exception('Parallelization for kernel matrix is not implemented.') | |||
| def diagonals(self): | |||
| """Compute the kernel matrix diagonals of the fit/transformed data. | |||
| Returns | |||
| ------- | |||
| X_diag : numpy array | |||
| The diagonal of the kernel matrix between the fitted data. | |||
| This consists of each element calculated with itself. | |||
| Y_diag : numpy array | |||
| The diagonal of the kernel matrix between the transformed data. | |||
| This consists of each element calculated with itself. | |||
| """ | |||
| # Check if method "fit" had been called. | |||
| check_is_fitted(self, ['_graphs']) | |||
| # Check if the diagonals of X exist. | |||
| try: | |||
| check_is_fitted(self, ['_X_diag']) | |||
| except NotFittedError: | |||
| # Compute diagonals of X. | |||
| self._X_diag = np.empty(shape=(len(self._graphs),)) | |||
| for i, x in enumerate(self._graphs): | |||
| self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? | |||
| try: | |||
| # If transform has happened, return both diagonals. | |||
| check_is_fitted(self, ['_Y']) | |||
| self._Y_diag = np.empty(shape=(len(self._Y),)) | |||
| for (i, y) in enumerate(self._Y): | |||
| self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? | |||
| return self._X_diag, self._Y_diag | |||
| except NotFittedError: | |||
| # Else just return X_diag. | |||
| return self._X_diag | |||
| # @abstractmethod | |||
| def pairwise_kernel(self, x, y): | |||
| """Compute pairwise kernel between two graphs. | |||
| Parameters | |||
| ---------- | |||
| x, y : NetworkX Graph. | |||
| Graphs between which the kernel is computed. | |||
| Returns | |||
| ------- | |||
| kernel: float | |||
| The computed kernel. | |||
| # Notes | |||
| # ----- | |||
| # This method is abstract and must be implemented by a subclass. | |||
| """ | |||
| raise NotImplementedError('Pairwise kernel computation is not implemented!') | |||
| ########################################################################## | |||
| # The following is the 2nd paradigm to compute kernel matrix. It is | |||
| # simplified and not compatible with `scikit-learn`. | |||
| ########################################################################## | |||
| def compute(self, *graphs, **kwargs): | |||
| self.parallel = kwargs.get('parallel', 'imap_unordered') | |||
| self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) | |||
| self.normalize = kwargs.get('normalize', True) | |||
| self.verbose = kwargs.get('verbose', 2) | |||
| self.copy_graphs = kwargs.get('copy_graphs', True) | |||
| self.save_unnormed = kwargs.get('save_unnormed', True) | |||
| self.validate_parameters() | |||
| # If the input is a list of graphs. | |||
| if len(graphs) == 1: | |||
| if not isinstance(graphs[0], list): | |||
| raise Exception('Cannot detect graphs.') | |||
| elif len(graphs[0]) == 0: | |||
| raise Exception('The graph list given is empty. No computation was performed.') | |||
| else: | |||
| self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. | |||
| if self.copy_graphs: | |||
| self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. | |||
| else: | |||
| self._graphs = graphs | |||
| self._gram_matrix = self._compute_gram_matrix() | |||
| self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||
| if self._normalize: | |||
| if self.save_unnormed: | |||
| self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||
| if self.normalize: | |||
| self._gram_matrix = normalize_gram_matrix(self._gram_matrix) | |||
| return self._gram_matrix, self._run_time | |||
| elif len(graphs) == 2: | |||
| # If the inputs are two graphs. | |||
| if self.is_graph(graphs[0]) and self.is_graph(graphs[1]): | |||
| kernel = self._compute_single_kernel(graphs[0].copy(), graphs[1].copy()) | |||
| if self.copy_graphs: | |||
| G0, G1 = graphs[0].copy(), graphs[1].copy() | |||
| else: | |||
| G0, G1 = graphs[0], graphs[1] | |||
| kernel = self._compute_single_kernel(G0, G1) | |||
| return kernel, self._run_time | |||
| # If the inputs are a graph and a list of graphs. | |||
| elif self.is_graph(graphs[0]) and isinstance(graphs[1], list): | |||
| g1 = graphs[0].copy() | |||
| g_list = [g.copy() for g in graphs[1]] | |||
| kernel_list = self._compute_kernel_list(g1, g_list) | |||
| if self.copy_graphs: | |||
| g1 = graphs[0].copy() | |||
| g_list = [g.copy() for g in graphs[1]] | |||
| kernel_list = self._compute_kernel_list(g1, g_list) | |||
| else: | |||
| kernel_list = self._compute_kernel_list(graphs[0], graphs[1]) | |||
| return kernel_list, self._run_time | |||
| elif isinstance(graphs[0], list) and self.is_graph(graphs[1]): | |||
| g1 = graphs[1].copy() | |||
| g_list = [g.copy() for g in graphs[0]] | |||
| kernel_list = self._compute_kernel_list(g1, g_list) | |||
| if self.copy_graphs: | |||
| g1 = graphs[1].copy() | |||
| g_list = [g.copy() for g in graphs[0]] | |||
| kernel_list = self._compute_kernel_list(g1, g_list) | |||
| else: | |||
| kernel_list = self._compute_kernel_list(graphs[1], graphs[0]) | |||
| return kernel_list, self._run_time | |||
| else: | |||
| raise Exception('Cannot detect graphs.') | |||
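The second paradigm dispatches on its inputs: a single list of graphs yields a Gram matrix, two graphs yield one kernel value, and a graph plus a list yields a kernel list. A sketch of the three call forms, where `kernel` is any instantiated kernel object and the graph variables are placeholders:

# 1) Gram matrix over a list of graphs.
gram_matrix, run_time = kernel.compute(graph_list, parallel=None, verbose=0)
# 2) A single kernel value between two graphs.
k, run_time = kernel.compute(g1, g2, parallel=None, verbose=0)
# 3) Kernels between one graph and a list of graphs.
kernel_list, run_time = kernel.compute(g1, graph_list, parallel=None, verbose=0)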
| @@ -103,15 +481,15 @@ class GraphKernel(object): | |||
| def _compute_gram_matrix(self): | |||
| start_time = time.time() | |||
| if self._parallel == 'imap_unordered': | |||
| if self.parallel == 'imap_unordered': | |||
| gram_matrix = self._compute_gm_imap_unordered() | |||
| elif self._parallel is None: | |||
| elif self.parallel is None: | |||
| gram_matrix = self._compute_gm_series() | |||
| else: | |||
| raise Exception('Parallel mode is not set correctly.') | |||
| self._run_time = time.time() - start_time | |||
| if self._verbose: | |||
| if self.verbose: | |||
| print('Gram matrix of size %d built in %s seconds.' | |||
| % (len(self._graphs), self._run_time)) | |||
| @@ -129,15 +507,15 @@ class GraphKernel(object): | |||
| def _compute_kernel_list(self, g1, g_list): | |||
| start_time = time.time() | |||
| if self._parallel == 'imap_unordered': | |||
| if self.parallel == 'imap_unordered': | |||
| kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) | |||
| elif self._parallel is None: | |||
| elif self.parallel is None: | |||
| kernel_list = self._compute_kernel_list_series(g1, g_list) | |||
| else: | |||
| raise Exception('Parallel mode is not set correctly.') | |||
| self._run_time = time.time() - start_time | |||
| if self._verbose: | |||
| if self.verbose: | |||
| print('Graph kernel between a graph and a list of %d graphs built in %s seconds.' | |||
| % (len(g_list), self._run_time)) | |||
| @@ -158,7 +536,7 @@ class GraphKernel(object): | |||
| kernel = self._compute_single_kernel_series(g1, g2) | |||
| self._run_time = time.time() - start_time | |||
| if self._verbose: | |||
| if self.verbose: | |||
| print('Graph kernel between two graphs built in %s seconds.' % (self._run_time)) | |||
| return kernel | |||
| @@ -185,24 +563,24 @@ class GraphKernel(object): | |||
| return self._graphs | |||
| @property | |||
| def parallel(self): | |||
| return self._parallel | |||
| # @property | |||
| # def parallel(self): | |||
| # return self.parallel | |||
| @property | |||
| def n_jobs(self): | |||
| return self._n_jobs | |||
| # @property | |||
| # def n_jobs(self): | |||
| # return self.n_jobs | |||
| @property | |||
| def verbose(self): | |||
| return self._verbose | |||
| # @property | |||
| # def verbose(self): | |||
| # return self.verbose | |||
| @property | |||
| def normalize(self): | |||
| return self._normalize | |||
| # @property | |||
| # def normalize(self): | |||
| # return self.normalize | |||
| @property | |||
| @@ -46,7 +46,7 @@ class Marginalized(GraphKernel): | |||
| self._add_dummy_labels(self._graphs) | |||
| if self._remove_totters: | |||
| iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| # @todo: this may not work. | |||
| self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] | |||
| @@ -57,7 +57,7 @@ class Marginalized(GraphKernel): | |||
| itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
| length=len_itr, verbose=(self._verbose >= 2)) | |||
| length=len_itr, verbose=(self.verbose >= 2)) | |||
| for i, j in iterator: | |||
| kernel = self._kernel_do(self._graphs[i], self._graphs[j]) | |||
| gram_matrix[i][j] = kernel | |||
| @@ -70,16 +70,16 @@ class Marginalized(GraphKernel): | |||
| self._add_dummy_labels(self._graphs) | |||
| if self._remove_totters: | |||
| pool = Pool(self._n_jobs) | |||
| pool = Pool(self.n_jobs) | |||
| itr = range(0, len(self._graphs)) | |||
| if len(self._graphs) < 100 * self._n_jobs: | |||
| chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
| if len(self._graphs) < 100 * self.n_jobs: | |||
| chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
| remove_fun = self._wrapper_untotter | |||
| iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), | |||
| desc='removing tottering', file=sys.stdout, | |||
| length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
| length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
| for i, g in iterator: | |||
| self._graphs[i] = g | |||
| pool.close() | |||
| @@ -93,7 +93,7 @@ class Marginalized(GraphKernel): | |||
| G_gn = gn_toshare | |||
| do_fun = self._wrapper_kernel_do | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
| glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| return gram_matrix | |||
| @@ -103,13 +103,13 @@ class Marginalized(GraphKernel): | |||
| if self._remove_totters: | |||
| g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. | |||
| iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| # @todo: this may not work. | |||
| g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] | |||
| # compute kernel list. | |||
| kernel_list = [None] * len(g_list) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i in iterator: | |||
| kernel = self._kernel_do(g1, g_list[i]) | |||
| kernel_list[i] = kernel | |||
| @@ -122,16 +122,16 @@ class Marginalized(GraphKernel): | |||
| if self._remove_totters: | |||
| g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. | |||
| pool = Pool(self._n_jobs) | |||
| pool = Pool(self.n_jobs) | |||
| itr = range(0, len(g_list)) | |||
| if len(g_list) < 100 * self._n_jobs: | |||
| chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
| if len(g_list) < 100 * self.n_jobs: | |||
| chunksize = int(len(g_list) / self.n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
| remove_fun = self._wrapper_untotter | |||
| iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), | |||
| desc='removing tottering', file=sys.stdout, | |||
| length=len(g_list), verbose=(self._verbose >= 2)) | |||
| length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i, g in iterator: | |||
| g_list[i] = g | |||
| pool.close() | |||
| @@ -151,7 +151,7 @@ class Marginalized(GraphKernel): | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | |||
| n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| return kernel_list | |||
| @@ -5,23 +5,35 @@ Created on Fri Nov 6 10:11:08 2020 | |||
| @author: ljia | |||
| """ | |||
| from gklearn.kernels.common_walk import CommonWalk | |||
| from gklearn.kernels.marginalized import Marginalized | |||
| from gklearn.kernels.sylvester_equation import SylvesterEquation | |||
| from gklearn.kernels.conjugate_gradient import ConjugateGradient | |||
| from gklearn.kernels.fixed_point import FixedPoint | |||
| from gklearn.kernels.spectral_decomposition import SpectralDecomposition | |||
| from gklearn.kernels.shortest_path import ShortestPath | |||
| from gklearn.kernels.structural_sp import StructuralSP | |||
| from gklearn.kernels.path_up_to_h import PathUpToH | |||
| from gklearn.kernels.treelet import Treelet | |||
| from gklearn.kernels.weisfeiler_lehman import WLSubtree | |||
| # The metadata of all graph kernels. | |||
| GRAPH_KERNELS = { | |||
| ### based on walks. | |||
| 'common walk': '', | |||
| 'marginalized': '', | |||
| 'sylvester equation': '', | |||
| 'fixed point': '', | |||
| 'conjugate gradient': '', | |||
| 'spectral decomposition': '', | |||
| 'common walk': CommonWalk, | |||
| 'marginalized': Marginalized, | |||
| 'sylvester equation': SylvesterEquation, | |||
| 'fixed point': FixedPoint, | |||
| 'conjugate gradient': ConjugateGradient, | |||
| 'spectral decomposition': SpectralDecomposition, | |||
| ### based on paths. | |||
| 'shortest path': '', | |||
| 'structural shortest path': '', | |||
| 'path up to length h': '', | |||
| 'shortest path': ShortestPath, | |||
| 'structural shortest path': StructuralSP, | |||
| 'path up to length h': PathUpToH, | |||
| ### based on non-linear patterns. | |||
| 'weisfeiler-lehman subtree': '', | |||
| 'treelet': '', | |||
| 'weisfeiler-lehman subtree': WLSubtree, | |||
| 'treelet': Treelet, | |||
| } | |||
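| # A hedged, standalone helper sketch showing how this registry might be used to | |||
| # resolve a kernel class by its metadata key. The helper name and the error | |||
| # message are illustrative assumptions, not part of the library's public API. | |||
| def get_graph_kernel_class(name): | |||
|     """Return the kernel class registered under `name` in GRAPH_KERNELS.""" | |||
|     try: | |||
|         return GRAPH_KERNELS[name] | |||
|     except KeyError: | |||
|         raise ValueError('Unknown graph kernel "%s". Available: %s.' | |||
|                          % (name, ', '.join(GRAPH_KERNELS.keys()))) | |||
| # e.g., get_graph_kernel_class('marginalized') returns the Marginalized class. | |||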
| @@ -41,10 +41,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
| from itertools import combinations_with_replacement | |||
| itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
| iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
| iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
| iterator_kernel = get_iters(itr_kernel, desc='Computing kernels', | |||
| file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||
| file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
| gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
| @@ -69,10 +69,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
| # get all paths of all graphs before computing kernels to save time, | |||
| # but this may cost a lot of memory for large datasets. | |||
| pool = Pool(self._n_jobs) | |||
| pool = Pool(self.n_jobs) | |||
| itr = zip(self._graphs, range(0, len(self._graphs))) | |||
| if len(self._graphs) < 100 * self._n_jobs: | |||
| chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
| if len(self._graphs) < 100 * self.n_jobs: | |||
| chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
| all_paths = [[] for _ in range(len(self._graphs))] | |||
| @@ -84,7 +84,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
| get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) | |||
| iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), | |||
| desc='getting paths', file=sys.stdout, | |||
| length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
| length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
| for i, ps in iterator: | |||
| all_paths[i] = ps | |||
| pool.close() | |||
| @@ -109,7 +109,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
| G_plist = plist_toshare | |||
| do_fun = self._wrapper_kernel_do_kernelless # @todo: what is this? | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
| glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(all_paths,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| return gram_matrix | |||
| @@ -117,8 +117,8 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
| def _compute_kernel_list_series(self, g1, g_list): | |||
| self._add_dummy_labels(g_list + [g1]) | |||
| iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
| iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
| kernel_list = [None] * len(g_list) | |||
| @@ -143,10 +143,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
| # get all paths of all graphs before computing kernels to save time, | |||
| # but this may cost a lot of memory for large datasets. | |||
| pool = Pool(self._n_jobs) | |||
| pool = Pool(self.n_jobs) | |||
| itr = zip(g_list, range(0, len(g_list))) | |||
| if len(g_list) < 100 * self._n_jobs: | |||
| chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
| if len(g_list) < 100 * self.n_jobs: | |||
| chunksize = int(len(g_list) / self.n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
| paths_g_list = [[] for _ in range(len(g_list))] | |||
| @@ -161,7 +161,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
| get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) | |||
| iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), | |||
| desc='getting paths', file=sys.stdout, | |||
| length=len(g_list), verbose=(self._verbose >= 2)) | |||
| length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i, ps in iterator: | |||
| paths_g_list[i] = ps | |||
| pool.close() | |||
| @@ -180,7 +180,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
| itr = range(len(g_list)) | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| return kernel_list | |||
| @@ -38,7 +38,7 @@ class ShortestPath(GraphKernel): | |||
| def _compute_gm_series(self): | |||
| self._all_graphs_have_edges(self._graphs) | |||
| # get shortest path graph of each graph. | |||
| iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] | |||
| # compute Gram matrix. | |||
| @@ -48,7 +48,7 @@ class ShortestPath(GraphKernel): | |||
| itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
| iterator = get_iters(itr, desc='Computing kernels', | |||
| length=len_itr, file=sys.stdout,verbose=(self._verbose >= 2)) | |||
| length=len_itr, file=sys.stdout,verbose=(self.verbose >= 2)) | |||
| for i, j in iterator: | |||
| kernel = self._sp_do(self._graphs[i], self._graphs[j]) | |||
| gram_matrix[i][j] = kernel | |||
| @@ -60,16 +60,16 @@ class ShortestPath(GraphKernel): | |||
| def _compute_gm_imap_unordered(self): | |||
| self._all_graphs_have_edges(self._graphs) | |||
| # get shortest path graph of each graph. | |||
| pool = Pool(self._n_jobs) | |||
| pool = Pool(self.n_jobs) | |||
| get_sp_graphs_fun = self._wrapper_get_sp_graphs | |||
| itr = zip(self._graphs, range(0, len(self._graphs))) | |||
| if len(self._graphs) < 100 * self._n_jobs: | |||
| chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
| if len(self._graphs) < 100 * self.n_jobs: | |||
| chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
| iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), | |||
| desc='getting sp graphs', file=sys.stdout, | |||
| length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
| length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
| for i, g in iterator: | |||
| self._graphs[i] = g | |||
| pool.close() | |||
| @@ -83,7 +83,7 @@ class ShortestPath(GraphKernel): | |||
| G_gs = gs_toshare | |||
| do_fun = self._wrapper_sp_do | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
| glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| return gram_matrix | |||
| @@ -92,12 +92,12 @@ class ShortestPath(GraphKernel): | |||
| self._all_graphs_have_edges([g1] + g_list) | |||
| # get shortest path graphs of g1 and each graph in g_list. | |||
| g1 = getSPGraph(g1, edge_weight=self._edge_weight) | |||
| iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] | |||
| # compute kernel list. | |||
| kernel_list = [None] * len(g_list) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i in iterator: | |||
| kernel = self._sp_do(g1, g_list[i]) | |||
| kernel_list[i] = kernel | |||
| @@ -109,16 +109,16 @@ class ShortestPath(GraphKernel): | |||
| self._all_graphs_have_edges([g1] + g_list) | |||
| # get shortest path graphs of g1 and each graph in g_list. | |||
| g1 = getSPGraph(g1, edge_weight=self._edge_weight) | |||
| pool = Pool(self._n_jobs) | |||
| pool = Pool(self.n_jobs) | |||
| get_sp_graphs_fun = self._wrapper_get_sp_graphs | |||
| itr = zip(g_list, range(0, len(g_list))) | |||
| if len(g_list) < 100 * self._n_jobs: | |||
| chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
| if len(g_list) < 100 * self.n_jobs: | |||
| chunksize = int(len(g_list) / self.n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
| iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), | |||
| desc='getting sp graphs', file=sys.stdout, | |||
| length=len(g_list), verbose=(self._verbose >= 2)) | |||
| length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i, g in iterator: | |||
| g_list[i] = g | |||
| pool.close() | |||
| @@ -137,7 +137,7 @@ class ShortestPath(GraphKernel): | |||
| itr = range(len(g_list)) | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| return kernel_list | |||
| @@ -28,9 +28,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| def _compute_gm_series(self): | |||
| self._check_edge_weight(self._graphs, self._verbose) | |||
| self._check_edge_weight(self._graphs, self.verbose) | |||
| self._check_graphs(self._graphs) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||
| @@ -41,7 +41,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| # precompute the spectral decomposition of each graph. | |||
| P_list = [] | |||
| D_list = [] | |||
| iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| for G in iterator: | |||
| # don't normalize adjacency matrices if q is a uniform vector. Note | |||
| # A actually is the transpose of the adjacency matrix. | |||
| @@ -58,7 +58,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| from itertools import combinations_with_replacement | |||
| itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
| for i, j in iterator: | |||
| kernel = self._kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel) | |||
| @@ -74,9 +74,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| def _compute_gm_imap_unordered(self): | |||
| self._check_edge_weight(self._graphs, self._verbose) | |||
| self._check_edge_weight(self._graphs, self.verbose) | |||
| self._check_graphs(self._graphs) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||
| @@ -87,7 +87,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| # precompute the spectral decomposition of each graph. | |||
| P_list = [] | |||
| D_list = [] | |||
| iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| for G in iterator: | |||
| # don't normalize adjacency matrices if q is a uniform vector. Note | |||
| # A actually is the transpose of the adjacency matrix. | |||
| @@ -107,7 +107,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| do_fun = self._wrapper_kernel_do | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
| glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(q_T_list, P_list, D_list), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| else: # @todo | |||
| pass | |||
| @@ -118,9 +118,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| def _compute_kernel_list_series(self, g1, g_list): | |||
| self._check_edge_weight(g_list + [g1], self._verbose) | |||
| self._check_edge_weight(g_list + [g1], self.verbose) | |||
| self._check_graphs(g_list + [g1]) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||
| @@ -133,7 +133,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| D1, P1 = np.linalg.eig(A1) | |||
| P_list = [] | |||
| D_list = [] | |||
| iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| for G in iterator: | |||
| # don't normalize adjacency matrices if q is a uniform vector. Note | |||
| # A actually is the transpose of the adjacency matrix. | |||
| @@ -145,7 +145,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| if self._p is None: # p is uniform distribution as default. | |||
| q_T1 = 1 / nx.number_of_nodes(g1) | |||
| q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i in iterator: | |||
| kernel = self._kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel) | |||
| @@ -160,9 +160,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||
| self._check_edge_weight(g_list + [g1], self._verbose) | |||
| self._check_edge_weight(g_list + [g1], self.verbose) | |||
| self._check_graphs(g_list + [g1]) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||
| @@ -175,8 +175,8 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| D1, P1 = np.linalg.eig(A1) | |||
| P_list = [] | |||
| D_list = [] | |||
| if self._verbose >= 2: | |||
| iterator = tqdm(g_list, desc='spectral decompose', file=sys.stdout) | |||
| if self.verbose >= 2: | |||
| iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout) | |||
| else: | |||
| iterator = g_list | |||
| for G in iterator: | |||
| @@ -207,7 +207,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| itr = range(len(g_list)) | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| else: # @todo | |||
| pass | |||
| @@ -222,9 +222,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||
| def _compute_single_kernel_series(self, g1, g2): | |||
| self._check_edge_weight([g1] + [g2], self._verbose) | |||
| self._check_edge_weight([g1] + [g2], self.verbose) | |||
| self._check_graphs([g1] + [g2]) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||
| @@ -41,7 +41,7 @@ class StructuralSP(GraphKernel): | |||
| def _compute_gm_series(self): | |||
| # get shortest paths of each graph in the graphs. | |||
| splist = [] | |||
| iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| if self._compute_method == 'trie': | |||
| for g in iterator: | |||
| splist.append(self._get_sps_as_trie(g)) | |||
| @@ -56,7 +56,7 @@ class StructuralSP(GraphKernel): | |||
| itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
| length=len_itr, verbose=(self._verbose >= 2)) | |||
| length=len_itr, verbose=(self.verbose >= 2)) | |||
| if self._compute_method == 'trie': | |||
| for i, j in iterator: | |||
| kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) | |||
| @@ -76,10 +76,10 @@ class StructuralSP(GraphKernel): | |||
| def _compute_gm_imap_unordered(self): | |||
| # get shortest paths of each graph in the graphs. | |||
| splist = [None] * len(self._graphs) | |||
| pool = Pool(self._n_jobs) | |||
| pool = Pool(self.n_jobs) | |||
| itr = zip(self._graphs, range(0, len(self._graphs))) | |||
| if len(self._graphs) < 100 * self._n_jobs: | |||
| chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
| if len(self._graphs) < 100 * self.n_jobs: | |||
| chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
| # get shortest path graphs of self._graphs | |||
| @@ -89,7 +89,7 @@ class StructuralSP(GraphKernel): | |||
| get_sps_fun = self._wrapper_get_sps_naive | |||
| iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), | |||
| desc='getting shortest paths', file=sys.stdout, | |||
| length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
| length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
| for i, sp in iterator: | |||
| splist[i] = sp | |||
| pool.close() | |||
| @@ -107,7 +107,7 @@ class StructuralSP(GraphKernel): | |||
| else: | |||
| do_fun = self._wrapper_ssp_do_naive | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
| glbv=(splist, self._graphs), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(splist, self._graphs), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| return gram_matrix | |||
| @@ -117,7 +117,7 @@ class StructuralSP(GraphKernel): | |||
| sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) | |||
| splist = [] | |||
| iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, | |||
| verbose=(self._verbose >= 2)) | |||
| verbose=(self.verbose >= 2)) | |||
| if self._compute_method == 'trie': | |||
| for g in iterator: | |||
| splist.append(self._get_sps_as_trie(g)) | |||
| @@ -128,7 +128,7 @@ class StructuralSP(GraphKernel): | |||
| # compute kernel list. | |||
| kernel_list = [None] * len(g_list) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', | |||
| file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
| file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
| if self._compute_method == 'trie': | |||
| for i in iterator: | |||
| kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i]) | |||
| @@ -145,10 +145,10 @@ class StructuralSP(GraphKernel): | |||
| # get shortest paths of g1 and each graph in g_list. | |||
| sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) | |||
| splist = [None] * len(g_list) | |||
| pool = Pool(self._n_jobs) | |||
| pool = Pool(self.n_jobs) | |||
| itr = zip(g_list, range(0, len(g_list))) | |||
| if len(g_list) < 100 * self._n_jobs: | |||
| chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
| if len(g_list) < 100 * self.n_jobs: | |||
| chunksize = int(len(g_list) / self.n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
| # get shortest path graphs of g_list | |||
| @@ -158,7 +158,7 @@ class StructuralSP(GraphKernel): | |||
| get_sps_fun = self._wrapper_get_sps_naive | |||
| iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), | |||
| desc='getting shortest paths', file=sys.stdout, | |||
| length=len(g_list), verbose=(self._verbose >= 2)) | |||
| length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i, sp in iterator: | |||
| splist[i] = sp | |||
| pool.close() | |||
| @@ -182,7 +182,7 @@ class StructuralSP(GraphKernel): | |||
| itr = range(len(g_list)) | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| return kernel_list | |||
| @@ -14,6 +14,7 @@ import sys | |||
| from gklearn.utils import get_iters | |||
| import numpy as np | |||
| import networkx as nx | |||
| from control import dlyap | |||
| from gklearn.utils.parallel import parallel_gm, parallel_me | |||
| from gklearn.kernels import RandomWalkMeta | |||
| @@ -22,14 +23,13 @@ class SylvesterEquation(RandomWalkMeta): | |||
| def __init__(self, **kwargs): | |||
| from control import dlyap | |||
| super().__init__(**kwargs) | |||
| def _compute_gm_series(self): | |||
| self._check_edge_weight(self._graphs, self._verbose) | |||
| self._check_edge_weight(self._graphs, self.verbose) | |||
| self._check_graphs(self._graphs) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('All labels are ignored.') | |||
| @@ -41,7 +41,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
| if self._q is None: | |||
| # don't normalize adjacency matrices if q is a uniform vector. Note | |||
| # A_wave_list actually contains the transposes of the adjacency matrices. | |||
| iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] | |||
| # # normalized adjacency matrices | |||
| # A_wave_list = [] | |||
| @@ -55,7 +55,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
| from itertools import combinations_with_replacement | |||
| itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
| for i, j in iterator: | |||
| kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda) | |||
| @@ -71,9 +71,9 @@ class SylvesterEquation(RandomWalkMeta): | |||
| def _compute_gm_imap_unordered(self): | |||
| self._check_edge_weight(self._graphs, self._verbose) | |||
| self._check_edge_weight(self._graphs, self.verbose) | |||
| self._check_graphs(self._graphs) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('All labels are ignored.') | |||
| @@ -83,7 +83,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
| if self._q is None: | |||
| # don't normalize adjacency matrices if q is a uniform vector. Note | |||
| # A_wave_list actually contains the transposes of the adjacency matrices. | |||
| iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? | |||
| if self._p is None: # p is uniform distribution as default. | |||
| @@ -94,7 +94,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
| do_fun = self._wrapper_kernel_do | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
| glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(A_wave_list,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| else: # @todo | |||
| pass | |||
| @@ -105,9 +105,9 @@ class SylvesterEquation(RandomWalkMeta): | |||
| def _compute_kernel_list_series(self, g1, g_list): | |||
| self._check_edge_weight(g_list + [g1], self._verbose) | |||
| self._check_edge_weight(g_list + [g1], self.verbose) | |||
| self._check_graphs(g_list + [g1]) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('All labels are ignored.') | |||
| @@ -120,11 +120,11 @@ class SylvesterEquation(RandomWalkMeta): | |||
| # don't normalize adjacency matrices if q is a uniform vector. Note | |||
| # A_wave_list actually contains the transposes of the adjacency matrices. | |||
| A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | |||
| iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] | |||
| if self._p is None: # p is uniform distribution as default. | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i in iterator: | |||
| kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda) | |||
| @@ -139,9 +139,9 @@ class SylvesterEquation(RandomWalkMeta): | |||
| def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||
| self._check_edge_weight(g_list + [g1], self._verbose) | |||
| self._check_edge_weight(g_list + [g1], self.verbose) | |||
| self._check_graphs(g_list + [g1]) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('All labels are ignored.') | |||
| @@ -152,7 +152,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
| # don't normalize adjacency matrices if q is a uniform vector. Note | |||
| # A_wave_list actually contains the transposes of the adjacency matrices. | |||
| A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | |||
| iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? | |||
| if self._p is None: # p is uniform distribution as default. | |||
| @@ -169,7 +169,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', | |||
| n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| else: # @todo | |||
| pass | |||
| @@ -184,9 +184,9 @@ class SylvesterEquation(RandomWalkMeta): | |||
| def _compute_single_kernel_series(self, g1, g2): | |||
| self._check_edge_weight([g1] + [g2], self._verbose) | |||
| self._check_edge_weight([g1] + [g2], self.verbose) | |||
| self._check_graphs([g1] + [g2]) | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('All labels are ignored.') | |||
| @@ -18,6 +18,8 @@ import numpy as np | |||
| import networkx as nx | |||
| from collections import Counter | |||
| from itertools import chain | |||
| from sklearn.utils.validation import check_is_fitted | |||
| from sklearn.exceptions import NotFittedError | |||
| from gklearn.utils import SpecialLabel | |||
| from gklearn.utils.parallel import parallel_gm, parallel_me | |||
| from gklearn.utils.utils import find_all_paths, get_mlti_dim_node_attrs | |||
| @@ -26,14 +28,211 @@ from gklearn.kernels import GraphKernel | |||
| class Treelet(GraphKernel): | |||
| def __init__(self, **kwargs): | |||
| GraphKernel.__init__(self) | |||
| self._node_labels = kwargs.get('node_labels', []) | |||
| self._edge_labels = kwargs.get('edge_labels', []) | |||
| self._sub_kernel = kwargs.get('sub_kernel', None) | |||
| self._ds_infos = kwargs.get('ds_infos', {}) | |||
| if self._sub_kernel is None: | |||
| raise Exception('Sub kernel not set.') | |||
| def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs): | |||
| """Initialise a treelet kernel. | |||
| """ | |||
| super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose) | |||
| self.node_labels = kwargs.get('node_labels', []) | |||
| self.edge_labels = kwargs.get('edge_labels', []) | |||
| self.sub_kernel = kwargs.get('sub_kernel', None) | |||
| self.ds_infos = kwargs.get('ds_infos', {}) | |||
| self.precompute_canonkeys = precompute_canonkeys | |||
| self.save_canonkeys = save_canonkeys | |||
| ########################################################################## | |||
| # The following is the 1st paradigm to compute kernel matrix, which is | |||
| # compatible with `scikit-learn`. | |||
| # ------------------------------------------------------------------- | |||
| # Special thanks to the "GraKeL" library for providing an excellent template! | |||
| ########################################################################## | |||
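| # A hedged usage sketch of this scikit-learn-style paradigm, kept as a comment | |||
| # so the class body stays valid. It assumes the `fit`/`transform` interface | |||
| # provided by the parent `GraphKernel` class in this pull request; the toy | |||
| # graphs, the dot-product sub-kernel and the `ds_infos` value are placeholder | |||
| # assumptions, not prescribed defaults: | |||
| # def dot_sub_kernel(x, y):  # sub-kernel on treelet count vectors | |||
| #     return float(np.dot(x, y)) | |||
| # g_a, g_b = nx.path_graph(3), nx.cycle_graph(4) | |||
| # tk = Treelet(sub_kernel=dot_sub_kernel, ds_infos={'directed': False}, verbose=0) | |||
| # tk.fit([g_a, g_b])       # the fitted graphs end up in self._graphs | |||
| # K = tk.transform([g_a])  # kernel matrix of shape (n_targets, n_inputs) = (1, 2) | |||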
| def clear_attributes(self): | |||
| super().clear_attributes() | |||
| if hasattr(self, '_canonkeys'): | |||
| delattr(self, '_canonkeys') | |||
| if hasattr(self, '_Y_canonkeys'): | |||
| delattr(self, '_Y_canonkeys') | |||
| if hasattr(self, '_dummy_labels_considered'): | |||
| delattr(self, '_dummy_labels_considered') | |||
| def validate_parameters(self): | |||
| """Validate all parameters for the transformer. | |||
| Returns | |||
| ------- | |||
| None. | |||
| """ | |||
| super().validate_parameters() | |||
| if self.sub_kernel is None: | |||
| raise ValueError('Sub-kernel not set.') | |||
| def _compute_kernel_matrix_series(self, Y): | |||
| """Compute the kernel matrix between a given target graphs (Y) and | |||
| the fitted graphs (X / self._graphs) without parallelization. | |||
| Parameters | |||
| ---------- | |||
| Y : list of graphs | |||
| The target graphs. | |||
| Returns | |||
| ------- | |||
| kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||
| The computed kernel matrix. | |||
| """ | |||
| # self._add_dummy_labels will modify the input in place. | |||
| self._add_dummy_labels() # For self._graphs | |||
| # Y = [g.copy() for g in Y] # @todo: ? | |||
| self._add_dummy_labels(Y) | |||
| # get all canonical keys of all graphs before computing kernels to save | |||
| # time, but this may cost a lot of memory for large datasets. | |||
| # Canonical keys for self._graphs. | |||
| try: | |||
| check_is_fitted(self, ['_canonkeys']) | |||
| canonkeys_list1 = self._canonkeys | |||
| except NotFittedError: | |||
| canonkeys_list1 = [] | |||
| iterator = get_iters(self._graphs, desc='getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| for g in iterator: | |||
| canonkeys_list1.append(self._get_canonkeys(g)) | |||
| if self.save_canonkeys: | |||
| self._canonkeys = canonkeys_list1 | |||
| # Canonical keys for Y. | |||
| canonkeys_list2 = [] | |||
| iterator = get_iters(Y, desc='getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| for g in iterator: | |||
| canonkeys_list2.append(self._get_canonkeys(g)) | |||
| if self.save_canonkeys: | |||
| self._Y_canonkeys = canonkeys_list2 | |||
| # compute kernel matrix. | |||
| kernel_matrix = np.zeros((len(Y), len(canonkeys_list1))) | |||
| from itertools import product | |||
| itr = product(range(len(Y)), range(len(canonkeys_list1))) | |||
| len_itr = int(len(Y) * len(canonkeys_list1)) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
| length=len_itr, verbose=(self.verbose >= 2)) | |||
| for i_y, i_x in iterator: | |||
| kernel = self._kernel_do(canonkeys_list2[i_y], canonkeys_list1[i_x]) | |||
| kernel_matrix[i_y][i_x] = kernel | |||
| return kernel_matrix | |||
| def _compute_kernel_matrix_imap_unordered(self, Y): | |||
| """Compute the kernel matrix between a given target graphs (Y) and | |||
| the fitted graphs (X / self._graphs) using imap unordered parallelization. | |||
| Parameters | |||
| ---------- | |||
| Y : list of graphs | |||
| The target graphs. | |||
| Returns | |||
| ------- | |||
| kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||
| The computed kernel matrix. | |||
| """ | |||
| raise Exception('Parallelization for kernel matrix is not implemented.') | |||
| def pairwise_kernel(self, x, y, are_keys=False): | |||
| """Compute pairwise kernel between two graphs. | |||
| Parameters | |||
| ---------- | |||
| x, y : NetworkX Graph. | |||
| Graphs between which the kernel is computed. | |||
| are_keys : boolean, optional | |||
| If `True`, `x` and `y` are canonical keys, otherwise are graphs. | |||
| The default is False. | |||
| Returns | |||
| ------- | |||
| kernel: float | |||
| The computed kernel. | |||
| """ | |||
| if are_keys: | |||
| # x, y are canonical keys. | |||
| kernel = self._kernel_do(x, y) | |||
| else: | |||
| # x, y are graphs. | |||
| kernel = self._compute_single_kernel_series(x, y) | |||
| return kernel | |||
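| # Continuing the placeholder names from the sketch above (`tk`, `g_a`, `g_b`), | |||
| # the two call modes should agree; the second merely skips re-extracting the | |||
| # canonical keys: | |||
| # tk.pairwise_kernel(g_a, g_b) | |||
| # tk.pairwise_kernel(tk._get_canonkeys(g_a), tk._get_canonkeys(g_b), are_keys=True) | |||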
| def diagonals(self): | |||
| """Compute the kernel matrix diagonals of the fit/transformed data. | |||
| Returns | |||
| ------- | |||
| X_diag : numpy array | |||
| The diagonal of the kernel matrix between the fitted data. | |||
| This consists of each element calculated with itself. | |||
| Y_diag : numpy array | |||
| The diagonal of the kernel matrix of the transformed (target) data. | |||
| This consists of each element calculated with itself. | |||
| """ | |||
| # Check if method "fit" has been called. | |||
| check_is_fitted(self, ['_graphs']) | |||
| # Check if the diagonals of X exist. | |||
| try: | |||
| check_is_fitted(self, ['_X_diag']) | |||
| except NotFittedError: | |||
| # Compute diagonals of X. | |||
| self._X_diag = np.empty(shape=(len(self._graphs),)) | |||
| try: | |||
| check_is_fitted(self, ['_canonkeys']) | |||
| for i, x in enumerate(self._canonkeys): | |||
| self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=True) # @todo: parallel? | |||
| except NotFittedError: | |||
| for i, x in enumerate(self._graphs): | |||
| self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=False) # @todo: parallel? | |||
| try: | |||
| # If transform has happened, return both diagonals. | |||
| check_is_fitted(self, ['_Y']) | |||
| self._Y_diag = np.empty(shape=(len(self._Y),)) | |||
| try: | |||
| check_is_fitted(self, ['_Y_canonkeys']) | |||
| for (i, y) in enumerate(self._Y_canonkeys): | |||
| self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=True) # @todo: parallel? | |||
| except NotFittedError: | |||
| for (i, y) in enumerate(self._Y): | |||
| self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=False) # @todo: parallel? | |||
| return self._X_diag, self._Y_diag | |||
| except NotFittedError: | |||
| # Else just return X_diag. | |||
| return self._X_diag | |||
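| # A hedged note on how such diagonals are commonly consumed: the standard | |||
| # cosine normalisation of a (n_targets, n_inputs) kernel matrix K is | |||
| # K_norm = K / np.sqrt(np.outer(Y_diag, X_diag)); whether the class's own | |||
| # `normalize` option does exactly this is an assumption. | |||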
| ########################################################################## | |||
| # The following is the 2nd paradigm to compute kernel matrix. It is | |||
| # simplified and not compatible with `scikit-learn`. | |||
| ########################################################################## | |||
| def _compute_gm_series(self): | |||
| @@ -43,10 +242,13 @@ class Treelet(GraphKernel): | |||
| # time, but this may cost a lot of memory for large dataset. | |||
| canonkeys = [] | |||
| iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout, | |||
| verbose=(self._verbose >= 2)) | |||
| verbose=(self.verbose >= 2)) | |||
| for g in iterator: | |||
| canonkeys.append(self._get_canonkeys(g)) | |||
| if self.save_canonkeys: | |||
| self._canonkeys = canonkeys | |||
| # compute Gram matrix. | |||
| gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
| @@ -54,7 +256,7 @@ class Treelet(GraphKernel): | |||
| itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
| length=len_itr, verbose=(self._verbose >= 2)) | |||
| length=len_itr, verbose=(self.verbose >= 2)) | |||
| for i, j in iterator: | |||
| kernel = self._kernel_do(canonkeys[i], canonkeys[j]) | |||
| gram_matrix[i][j] = kernel | |||
| @@ -68,22 +270,25 @@ class Treelet(GraphKernel): | |||
| # get all canonical keys of all graphs before computing kernels to save | |||
| # time, but this may cost a lot of memory for large dataset. | |||
| pool = Pool(self._n_jobs) | |||
| pool = Pool(self.n_jobs) | |||
| itr = zip(self._graphs, range(0, len(self._graphs))) | |||
| if len(self._graphs) < 100 * self._n_jobs: | |||
| chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
| if len(self._graphs) < 100 * self.n_jobs: | |||
| chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
| canonkeys = [[] for _ in range(len(self._graphs))] | |||
| get_fun = self._wrapper_get_canonkeys | |||
| iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), | |||
| desc='getting canonkeys', file=sys.stdout, | |||
| length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
| length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
| for i, ck in iterator: | |||
| canonkeys[i] = ck | |||
| pool.close() | |||
| pool.join() | |||
| if self.save_canonkeys: | |||
| self._canonkeys = canonkeys | |||
| # compute Gram matrix. | |||
| gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
| @@ -92,25 +297,25 @@ class Treelet(GraphKernel): | |||
| G_canonkeys = canonkeys_toshare | |||
| do_fun = self._wrapper_kernel_do | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
| glbv=(canonkeys,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(canonkeys,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| return gram_matrix | |||
| def _compute_kernel_list_series(self, g1, g_list): | |||
| self._add_dummy_labels(g_list + [g1]) | |||
| # self._add_dummy_labels(g_list + [g1]) | |||
| # get all canonical keys of all graphs before computing kernels to save | |||
| # time, but this may cost a lot of memory for large dataset. | |||
| canonkeys_1 = self._get_canonkeys(g1) | |||
| canonkeys_list = [] | |||
| iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
| for g in iterator: | |||
| canonkeys_list.append(self._get_canonkeys(g)) | |||
| # compute kernel list. | |||
| kernel_list = [None] * len(g_list) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
| iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i in iterator: | |||
| kernel = self._kernel_do(canonkeys_1, canonkeys_list[i]) | |||
| kernel_list[i] = kernel | |||
| @@ -125,16 +330,16 @@ class Treelet(GraphKernel): | |||
| # time, but this may cost a lot of memory for large dataset. | |||
| canonkeys_1 = self._get_canonkeys(g1) | |||
| canonkeys_list = [[] for _ in range(len(g_list))] | |||
| pool = Pool(self._n_jobs) | |||
| pool = Pool(self.n_jobs) | |||
| itr = zip(g_list, range(0, len(g_list))) | |||
| if len(g_list) < 100 * self._n_jobs: | |||
| chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
| if len(g_list) < 100 * self.n_jobs: | |||
| chunksize = int(len(g_list) / self.n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
| get_fun = self._wrapper_get_canonkeys | |||
| iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), | |||
| desc='getting canonkeys', file=sys.stdout, | |||
| length=len(g_list), verbose=(self._verbose >= 2)) | |||
| length=len(g_list), verbose=(self.verbose >= 2)) | |||
| for i, ck in iterator: | |||
| canonkeys_list[i] = ck | |||
| pool.close() | |||
| @@ -154,7 +359,7 @@ class Treelet(GraphKernel): | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered', | |||
| n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| return kernel_list | |||
| @@ -164,13 +369,13 @@ class Treelet(GraphKernel): | |||
| def _compute_single_kernel_series(self, g1, g2): | |||
| self._add_dummy_labels([g1] + [g2]) | |||
| # self._add_dummy_labels([g1] + [g2]) | |||
| canonkeys_1 = self._get_canonkeys(g1) | |||
| canonkeys_2 = self._get_canonkeys(g2) | |||
| kernel = self._kernel_do(canonkeys_1, canonkeys_2) | |||
| return kernel | |||
| # @profile | |||
| def _kernel_do(self, canonkey1, canonkey2): | |||
| """Compute treelet graph kernel between 2 graphs. | |||
| @@ -187,7 +392,24 @@ class Treelet(GraphKernel): | |||
| keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | |||
| vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) | |||
| vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) | |||
| kernel = self._sub_kernel(vector1, vector2) | |||
| # vector1, vector2 = [], [] | |||
| # keys1, keys2 = canonkey1, canonkey2 | |||
| # keys_searched = {} | |||
| # for k, v in canonkey1.items(): | |||
| # if k in keys2: | |||
| # vector1.append(v) | |||
| # vector2.append(canonkey2[k]) | |||
| # keys_searched[k] = v | |||
| # for k, v in canonkey2.items(): | |||
| # if k in keys1 and k not in keys_searched: | |||
| # vector1.append(canonkey1[k]) | |||
| # vector2.append(v) | |||
| # vector1, vector2 = np.array(vector1), np.array(vector2) | |||
| kernel = self.sub_kernel(vector1, vector2) | |||
| return kernel | |||
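| # A hedged, standalone example of a callable that could be supplied as the | |||
| # `sub_kernel` above: an RBF kernel on the common-treelet count vectors. The | |||
| # function name and the default `gamma` are illustrative assumptions; gklearn | |||
| # may provide its own sub-kernel utilities. | |||
| import numpy as np | |||
| def rbf_sub_kernel(x, y, gamma=1.0): | |||
|     """RBF kernel between two 1-D treelet-count vectors.""" | |||
|     d = np.asarray(x, dtype=float) - np.asarray(y, dtype=float) | |||
|     return float(np.exp(-gamma * np.dot(d, d))) | |||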
| @@ -223,7 +445,7 @@ class Treelet(GraphKernel): | |||
| patterns['0'] = list(G.nodes()) | |||
| canonkey['0'] = nx.number_of_nodes(G) | |||
| for i in range(1, 6): # for i in range(1, 6): | |||
| patterns[str(i)] = find_all_paths(G, i, self._ds_infos['directed']) | |||
| patterns[str(i)] = find_all_paths(G, i, self.ds_infos['directed']) | |||
| canonkey[str(i)] = len(patterns[str(i)]) | |||
| # n-star patterns | |||
| @@ -317,11 +539,11 @@ class Treelet(GraphKernel): | |||
| ### pattern obtained in the structural analysis section above, which is a | |||
| ### string corresponding to a unique treelet. A dictionary is built to keep | |||
| ### track of the amount of every treelet. | |||
| if len(self._node_labels) > 0 or len(self._edge_labels) > 0: | |||
| if len(self.node_labels) > 0 or len(self.edge_labels) > 0: | |||
| canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet. | |||
| # linear patterns | |||
| canonkey_t = Counter(get_mlti_dim_node_attrs(G, self._node_labels)) | |||
| canonkey_t = Counter(get_mlti_dim_node_attrs(G, self.node_labels)) | |||
| for key in canonkey_t: | |||
| canonkey_l[('0', key)] = canonkey_t[key] | |||
| @@ -330,9 +552,9 @@ class Treelet(GraphKernel): | |||
| for pattern in patterns[str(i)]: | |||
| canonlist = [] | |||
| for idx, node in enumerate(pattern[:-1]): | |||
| canonlist.append(tuple(G.nodes[node][nl] for nl in self._node_labels)) | |||
| canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self._edge_labels)) | |||
| canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self._node_labels)) | |||
| canonlist.append(tuple(G.nodes[node][nl] for nl in self.node_labels)) | |||
| canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self.edge_labels)) | |||
| canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self.node_labels)) | |||
| canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1] | |||
| treelet.append(tuple([str(i)] + canonkey_t)) | |||
| canonkey_l.update(Counter(treelet)) | |||
| @@ -343,13 +565,13 @@ class Treelet(GraphKernel): | |||
| for pattern in patterns[str(i) + 'star']: | |||
| canonlist = [] | |||
| for leaf in pattern[1:]: | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
| elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
| elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||
| canonlist.append(tuple((nlabels, elabels))) | |||
| canonlist.sort() | |||
| canonlist = list(chain.from_iterable(canonlist)) | |||
| canonkey_t = tuple(['d' if i == 5 else str(i * 2)] + | |||
| [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||
| [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||
| + canonlist) | |||
| treelet.append(canonkey_t) | |||
| canonkey_l.update(Counter(treelet)) | |||
| @@ -359,17 +581,17 @@ class Treelet(GraphKernel): | |||
| for pattern in patterns['7']: | |||
| canonlist = [] | |||
| for leaf in pattern[1:3]: | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
| elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
| elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||
| canonlist.append(tuple((nlabels, elabels))) | |||
| canonlist.sort() | |||
| canonlist = list(chain.from_iterable(canonlist)) | |||
| canonkey_t = tuple(['7'] | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist | |||
| + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] | |||
| + [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] | |||
| + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] | |||
| + [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)]) | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist | |||
| + [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] | |||
| + [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] | |||
| + [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] | |||
| + [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)]) | |||
| treelet.append(canonkey_t) | |||
| canonkey_l.update(Counter(treelet)) | |||
| @@ -378,38 +600,38 @@ class Treelet(GraphKernel): | |||
| for pattern in patterns['11']: | |||
| canonlist = [] | |||
| for leaf in pattern[1:4]: | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
| elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
| elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||
| canonlist.append(tuple((nlabels, elabels))) | |||
| canonlist.sort() | |||
| canonlist = list(chain.from_iterable(canonlist)) | |||
| canonkey_t = tuple(['b'] | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist | |||
| + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] | |||
| + [tuple(G[pattern[4]][pattern[0]][el] for el in self._edge_labels)] | |||
| + [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels)] | |||
| + [tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)]) | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist | |||
| + [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] | |||
| + [tuple(G[pattern[4]][pattern[0]][el] for el in self.edge_labels)] | |||
| + [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels)] | |||
| + [tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)]) | |||
| treelet.append(canonkey_t) | |||
| canonkey_l.update(Counter(treelet)) | |||
| # pattern 10 | |||
| treelet = [] | |||
| for pattern in patterns['10']: | |||
| canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels), | |||
| tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)] | |||
| canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels), | |||
| tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)] | |||
| canonlist = [] | |||
| for leaf in pattern[1:3]: | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
| elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
| elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||
| canonlist.append(tuple((nlabels, elabels))) | |||
| canonlist.sort() | |||
| canonkey0 = list(chain.from_iterable(canonlist)) | |||
| canonkey_t = tuple(['a'] | |||
| + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] | |||
| + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] | |||
| + [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)] | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||
| + [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] | |||
| + [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] | |||
| + [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] | |||
| + [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)] | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||
| + [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)] | |||
| + canonkey4 + canonkey0) | |||
| treelet.append(canonkey_t) | |||
| canonkey_l.update(Counter(treelet)) | |||
| @@ -419,15 +641,15 @@ class Treelet(GraphKernel): | |||
| for pattern in patterns['12']: | |||
| canonlist0 = [] | |||
| for leaf in pattern[1:3]: | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
| elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
| elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||
| canonlist0.append(tuple((nlabels, elabels))) | |||
| canonlist0.sort() | |||
| canonlist0 = list(chain.from_iterable(canonlist0)) | |||
| canonlist3 = [] | |||
| for leaf in pattern[4:6]: | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
| elabels = tuple(G[leaf][pattern[3]][el] for el in self._edge_labels) | |||
| nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
| elabels = tuple(G[leaf][pattern[3]][el] for el in self.edge_labels) | |||
| canonlist3.append(tuple((nlabels, elabels))) | |||
| canonlist3.sort() | |||
| canonlist3 = list(chain.from_iterable(canonlist3)) | |||
| @@ -435,14 +657,14 @@ class Treelet(GraphKernel): | |||
| # 2 possible key can be generated from 2 nodes with extended label 3, | |||
| # select the one with lower lexicographic order. | |||
| canonkey_t1 = tuple(['c'] | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist0 | |||
| + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] | |||
| + [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist0 | |||
| + [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] | |||
| + [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] | |||
| + canonlist3) | |||
| canonkey_t2 = tuple(['c'] | |||
| + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonlist3 | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||
| + [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] | |||
| + [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] + canonlist3 | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||
| + [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)] | |||
| + canonlist0) | |||
| treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) | |||
| canonkey_l.update(Counter(treelet)) | |||
| @@ -450,24 +672,24 @@ class Treelet(GraphKernel): | |||
| # pattern 9 | |||
| treelet = [] | |||
| for pattern in patterns['9']: | |||
| canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels), | |||
| tuple(G[pattern[4]][pattern[2]][el] for el in self._edge_labels)] | |||
| canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels), | |||
| tuple(G[pattern[5]][pattern[3]][el] for el in self._edge_labels)] | |||
| prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self._node_labels), | |||
| tuple(G[pattern[2]][pattern[0]][el] for el in self._edge_labels)] | |||
| prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels), | |||
| tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] | |||
| canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels), | |||
| tuple(G[pattern[4]][pattern[2]][el] for el in self.edge_labels)] | |||
| canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels), | |||
| tuple(G[pattern[5]][pattern[3]][el] for el in self.edge_labels)] | |||
| prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self.node_labels), | |||
| tuple(G[pattern[2]][pattern[0]][el] for el in self.edge_labels)] | |||
| prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels), | |||
| tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] | |||
| if prekey2 + canonkey2 < prekey3 + canonkey3: | |||
| canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \ | |||
| + [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \ | |||
| canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \ | |||
| + [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \ | |||
| + prekey2 + prekey3 + canonkey2 + canonkey3 | |||
| else: | |||
| canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \ | |||
| + [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \ | |||
| canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \ | |||
| + [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \ | |||
| + prekey3 + prekey2 + canonkey3 + canonkey2 | |||
| treelet.append(tuple(['9'] | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||
| + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||
| + canonkey_t)) | |||
| canonkey_l.update(Counter(treelet)) | |||
| @@ -482,12 +704,33 @@ class Treelet(GraphKernel): | |||
| return i, self._get_canonkeys(g) | |||
| def _add_dummy_labels(self, Gn): | |||
| if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): | |||
| for i in range(len(Gn)): | |||
| nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
| self._node_labels = [SpecialLabel.DUMMY] | |||
| if len(self._edge_labels) == 0 or (len(self._edge_labels) == 1 and self._edge_labels[0] == SpecialLabel.DUMMY): | |||
| for i in range(len(Gn)): | |||
| nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
| self._edge_labels = [SpecialLabel.DUMMY] | |||
| def _add_dummy_labels(self, Gn=None): | |||
| def _add_dummy(Gn): | |||
| if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY): | |||
| for i in range(len(Gn)): | |||
| nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
| self.node_labels = [SpecialLabel.DUMMY] | |||
| if len(self.edge_labels) == 0 or (len(self.edge_labels) == 1 and self.edge_labels[0] == SpecialLabel.DUMMY): | |||
| for i in range(len(Gn)): | |||
| nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
| self.edge_labels = [SpecialLabel.DUMMY] | |||
| if Gn is None or Gn is self._graphs: | |||
| # Add dummy labels for the copy of self._graphs. | |||
| try: | |||
| check_is_fitted(self, ['_dummy_labels_considered']) | |||
| if not self._dummy_labels_considered: | |||
| Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs] | |||
| _add_dummy(Gn) | |||
| self._graphs = Gn | |||
| self._dummy_labels_considered = True | |||
| except NotFittedError: | |||
| Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs] | |||
| _add_dummy(Gn) | |||
| self._graphs = Gn | |||
| self._dummy_labels_considered = True | |||
| else: | |||
| # Add dummy labels for the input. | |||
| _add_dummy(Gn) | |||
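As an illustrative aside (not part of the patch): for graphs that carry no node or edge labels at all, the fallback above simply attaches the same dummy attribute to every node and edge. A minimal sketch, using a made-up toy graph:

import networkx as nx
from gklearn.utils import SpecialLabel

G = nx.path_graph(3)                                # toy, completely unlabeled graph
nx.set_node_attributes(G, '0', SpecialLabel.DUMMY)  # every node gets the same dummy value '0'
nx.set_edge_attributes(G, '0', SpecialLabel.DUMMY)  # likewise for every edge
node_labels = [SpecialLabel.DUMMY]                  # the kernel then uses DUMMY as the only label name
edge_labels = [SpecialLabel.DUMMY]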
| @@ -14,30 +14,48 @@ Created on Tue Apr 14 15:16:34 2020 | |||
| import numpy as np | |||
| import networkx as nx | |||
| import sys | |||
| from collections import Counter | |||
| # from functools import partial | |||
| from itertools import combinations_with_replacement | |||
| from gklearn.utils import SpecialLabel | |||
| from gklearn.utils.parallel import parallel_gm, parallel_me | |||
| from gklearn.kernels import GraphKernel | |||
| from gklearn.utils.iters import get_iters | |||
| class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| def __init__(self, **kwargs): | |||
| GraphKernel.__init__(self) | |||
| self._node_labels = kwargs.get('node_labels', []) | |||
| self._edge_labels = kwargs.get('edge_labels', []) | |||
| self._height = int(kwargs.get('height', 0)) | |||
| self.node_labels = kwargs.get('node_labels', []) | |||
| self.edge_labels = kwargs.get('edge_labels', []) | |||
| self.height = int(kwargs.get('height', 0)) | |||
| self._base_kernel = kwargs.get('base_kernel', 'subtree') | |||
| self._ds_infos = kwargs.get('ds_infos', {}) | |||
| ########################################################################## | |||
| # The following is the 1st paradigm to compute kernel matrix, which is | |||
| # compatible with `scikit-learn`. | |||
| # ------------------------------------------------------------------- | |||
| # Special thanks to the "GraKeL" library for providing an excellent template! | |||
| ########################################################################## | |||
| ########################################################################## | |||
| # The following is the 2nd paradigm to compute kernel matrix. It is | |||
| # simplified and not compatible with `scikit-learn`. | |||
| ########################################################################## | |||
| def _compute_gm_series(self): | |||
| # if self._verbose >= 2: | |||
| # if self.verbose >= 2: | |||
| # import warnings | |||
| # warnings.warn('A part of the computation is parallelized.') | |||
| self._add_dummy_node_labels(self._graphs) | |||
| # self._add_dummy_node_labels(self._graphs) | |||
| # for WL subtree kernel | |||
| if self._base_kernel == 'subtree': | |||
| @@ -59,7 +77,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| def _compute_gm_imap_unordered(self): | |||
| self._add_dummy_node_labels(self._graphs) | |||
| # self._add_dummy_node_labels(self._graphs) | |||
| if self._base_kernel == 'subtree': | |||
| gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
| @@ -74,17 +92,17 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| G_gn = gn_toshare | |||
| do_fun = self._wrapper_pairwise | |||
| parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
| glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| return gram_matrix | |||
| else: | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('This base kernel is not parallelized. The serial computation is used instead.') | |||
| return self._compute_gm_series() | |||
| def _compute_kernel_list_series(self, g1, g_list): # @todo: this should be better. | |||
| # if self._verbose >= 2: | |||
| # if self.verbose >= 2: | |||
| # import warnings | |||
| # warnings.warn('A part of the computation is parallelized.') | |||
| @@ -126,10 +144,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| len_itr = len(g_list) | |||
| parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
| init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | |||
| n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
| n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
| return kernel_list | |||
| else: | |||
| if self._verbose >= 2: | |||
| if self.verbose >= 2: | |||
| import warnings | |||
| warnings.warn('This base kernel is not parallelized. The serial computation is used instead.') | |||
| return self._compute_kernel_list_series(g1, g_list) | |||
| @@ -160,6 +178,30 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| return gram_matrix[0][1] | |||
| ########################################################################## | |||
| # The following are the methods used by both diagrams. | |||
| ########################################################################## | |||
| def validate_parameters(self): | |||
| """Validate all parameters for the transformer. | |||
| Returns | |||
| ------- | |||
| None. | |||
| """ | |||
| super().validate_parameters() | |||
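| # The dispatch below keys off which label lists are non-empty: | |||
| # node labels only -> _subtree_kernel_do_nl; edge labels only -> _subtree_kernel_do_el; | |||
| # both -> _subtree_kernel_do_labeled; neither -> _subtree_kernel_do_unlabeled. | |||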
| if len(self.node_labels) == 0: | |||
| if len(self.edge_labels) == 0: | |||
| self._subtree_kernel_do = self._subtree_kernel_do_unlabeled | |||
| else: | |||
| self._subtree_kernel_do = self._subtree_kernel_do_el | |||
| else: | |||
| if len(self.edge_labels) == 0: | |||
| self._subtree_kernel_do = self._subtree_kernel_do_nl | |||
| else: | |||
| self._subtree_kernel_do = self._subtree_kernel_do_labeled | |||
| def pairwise_kernel(self, g1, g2): | |||
| Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster! | |||
| @@ -172,9 +214,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| for G in Gn: | |||
| # set all labels into a tuple. | |||
| for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | |||
| G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels) | |||
| G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) | |||
| # get the set of original labels | |||
| labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||
| labels_ori = list(nx.get_node_attributes(G, 'lt').values()) | |||
| # number of occurence of each label in G | |||
| all_num_of_each_label.append(dict(Counter(labels_ori))) | |||
| @@ -182,22 +224,22 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| kernel = self._compute_kernel_itr(kernel, all_num_of_each_label) | |||
| # iterate each height | |||
| for h in range(1, self._height + 1): | |||
| for h in range(1, self.height + 1): | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||
| # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration | |||
| all_num_of_each_label = [] # number of occurence of each label in G | |||
| # @todo: parallel this part. | |||
| for idx, G in enumerate(Gn): | |||
| for G in Gn: | |||
| all_multisets = [] | |||
| for node, attrs in G.nodes(data=True): | |||
| # Multiset-label determination. | |||
| multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]] | |||
| multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]] | |||
| # sorting each multiset | |||
| multiset.sort() | |||
| multiset = [attrs['label_tuple']] + multiset # add the prefix | |||
| multiset = [attrs['lt']] + multiset # add the prefix | |||
| all_multisets.append(tuple(multiset)) | |||
| # label compression | |||
| @@ -208,19 +250,19 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| # else assign the number of labels occured + 1 as the compressed label. | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): | |||
| set_compressed.update({value: all_set_compressed[value]}) | |||
| set_compressed[value] = all_set_compressed[value] | |||
| else: | |||
| set_compressed.update({value: str(num_of_labels_occured + 1)}) | |||
| set_compressed[value] = str(num_of_labels_occured + 1) | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| # relabel nodes | |||
| for idx, node in enumerate(G.nodes()): | |||
| G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]] | |||
| G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||
| # get the set of compressed labels | |||
| labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||
| labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||
| # all_labels_ori.update(labels_comp) | |||
| all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
| @@ -249,8 +291,8 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| return kernel | |||
| def _subtree_kernel_do(self, Gn): | |||
| """Compute Weisfeiler-Lehman kernels between graphs. | |||
| def _subtree_kernel_do_nl(self, Gn): | |||
| """Compute Weisfeiler-Lehman kernels between graphs with node labels. | |||
| Parameters | |||
| ---------- | |||
| @@ -268,12 +310,16 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||
| # for each graph | |||
| for G in Gn: | |||
| # set all labels into a tuple. | |||
| if self.verbose >= 2: | |||
| iterator = get_iters(Gn, desc='Setting all labels into a tuple') | |||
| else: | |||
| iterator = Gn | |||
| for G in iterator: | |||
| # set all labels into a tuple. # @todo: remove this original labels or not? | |||
| for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | |||
| G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels) | |||
| G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) | |||
| # get the set of original labels | |||
| labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||
| labels_ori = list(nx.get_node_attributes(G, 'lt').values()) | |||
| # number of occurence of each label in G | |||
| all_num_of_each_label.append(dict(Counter(labels_ori))) | |||
| @@ -281,74 +327,398 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
| # iterate each height | |||
| for h in range(1, self._height + 1): | |||
| for h in range(1, self.height + 1): | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||
| # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration | |||
| all_num_of_each_label = [] # number of occurence of each label in G | |||
| # @todo: parallel this part. | |||
| for idx, G in enumerate(Gn): | |||
| # if self.verbose >= 2: | |||
| # iterator = get_iters(enumerate(Gn), desc='Going through iteration ' + str(h), length=len(Gn)) | |||
| # else: | |||
| # iterator = enumerate(Gn) | |||
| for G in Gn: | |||
| num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
| all_multisets = [] | |||
| for node, attrs in G.nodes(data=True): | |||
| # Multiset-label determination. | |||
| multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]] | |||
| # sorting each multiset | |||
| multiset.sort() | |||
| multiset = [attrs['label_tuple']] + multiset # add the prefix | |||
| all_multisets.append(tuple(multiset)) | |||
| # Compute subtree kernel with h iterations and add it to the final kernel | |||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
| # label compression | |||
| set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
| # a dictionary mapping original labels to new ones. | |||
| set_compressed = {} | |||
| # if a label occured before, assign its former compressed label, | |||
| # else assign the number of labels occured + 1 as the compressed label. | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): | |||
| set_compressed.update({value: all_set_compressed[value]}) | |||
| else: | |||
| set_compressed.update({value: str(num_of_labels_occured + 1)}) | |||
| num_of_labels_occured += 1 | |||
| return gram_matrix | |||
| all_set_compressed.update(set_compressed) | |||
| # relabel nodes | |||
| for idx, node in enumerate(G.nodes()): | |||
| G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]] | |||
| def _subtree_kernel_do_el(self, Gn): | |||
| """Compute Weisfeiler-Lehman kernels between graphs with edge labels. | |||
| # get the set of compressed labels | |||
| labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||
| # all_labels_ori.update(labels_comp) | |||
| all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
| Parameters | |||
| ---------- | |||
| Gn : List of NetworkX graph | |||
| List of graphs between which the kernels are computed. | |||
| # Compute subtree kernel with h iterations and add it to the final kernel | |||
| Return | |||
| ------ | |||
| gram_matrix : Numpy matrix | |||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs. | |||
| """ | |||
| gram_matrix = np.zeros((len(Gn), len(Gn))) | |||
| # initial for height = 0 | |||
| all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||
| # Compute subtree kernel with the 0th iteration and add it to the final kernel. | |||
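| # (With no node labels at iteration 0, every node shares the same label, so the label-histogram dot product is simply the product of the two node counts.) | |||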
| iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) | |||
| for i, j in iterator: | |||
| gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) | |||
| gram_matrix[j][i] = gram_matrix[i][j] | |||
| # if h >= 1. | |||
| if self.height > 0: | |||
| # Set all edge labels into a tuple. # @todo: remove this original labels or not? | |||
| if self.verbose >= 2: | |||
| iterator = get_iters(Gn, desc='Setting all labels into a tuple') | |||
| else: | |||
| iterator = Gn | |||
| for G in iterator: | |||
| for n1, n2, attrs in G.edges(data=True): # @todo: there may be a better way. | |||
| G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels) | |||
| # When h == 1, compute the kernel. | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||
| all_num_of_each_label = [] # number of occurence of each label in G | |||
| # @todo: parallel this part. | |||
| for G in Gn: | |||
| num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
| # Compute subtree kernel with h iterations and add it to the final kernel. | |||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
| # Iterate along heights (>= 2). | |||
| for h in range(2, self.height + 1): | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||
| all_num_of_each_label = [] # number of occurence of each label in G | |||
| # @todo: parallel this part. | |||
| for G in Gn: | |||
| num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
| # Compute subtree kernel with h iterations and add it to the final kernel. | |||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
| return gram_matrix | |||
| def _subtree_kernel_do_labeled(self, Gn): | |||
| """Compute Weisfeiler-Lehman kernels between graphs with both node and | |||
| edge labels. | |||
| Parameters | |||
| ---------- | |||
| Gn : List of NetworkX graph | |||
| List of graphs between which the kernels are computed. | |||
| Return | |||
| ------ | |||
| gram_matrix : Numpy matrix | |||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs. | |||
| """ | |||
| gram_matrix = np.zeros((len(Gn), len(Gn))) | |||
| # initial for height = 0 | |||
| all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||
| # Set all node labels into a tuple and get # of occurence of each label. | |||
| if self.verbose >= 2: | |||
| iterator = get_iters(Gn, desc='Setting all node labels into a tuple') | |||
| else: | |||
| iterator = Gn | |||
| for G in iterator: | |||
| # Set all node labels into a tuple. # @todo: remove this original labels or not? | |||
| for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | |||
| G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) | |||
| # Get the set of original labels. | |||
| labels_ori = list(nx.get_node_attributes(G, 'lt').values()) | |||
| # number of occurence of each label in G | |||
| all_num_of_each_label.append(dict(Counter(labels_ori))) | |||
| # Compute subtree kernel with the 0th iteration and add it to the final kernel. | |||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
| # if h >= 1. | |||
| if self.height > 0: | |||
| # Set all edge labels into a tuple. # @todo: remove this original labels or not? | |||
| if self.verbose >= 2: | |||
| iterator = get_iters(Gn, desc='Setting all edge labels into a tuple') | |||
| else: | |||
| iterator = Gn | |||
| for G in iterator: | |||
| for n1, n2, attrs in G.edges(data=True): # @todo: there may be a better way. | |||
| G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels) | |||
| # When h == 1, compute the kernel. | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||
| all_num_of_each_label = [] # number of occurence of each label in G | |||
| # @todo: parallel this part. | |||
| for G in Gn: | |||
| num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
| # Compute subtree kernel with h iterations and add it to the final kernel. | |||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
| # Iterate along heights. | |||
| for h in range(2, self.height + 1): | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||
| all_num_of_each_label = [] # number of occurence of each label in G | |||
| # @todo: parallel this part. | |||
| for G in Gn: | |||
| num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
| # Compute subtree kernel with h iterations and add it to the final kernel. | |||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
| return gram_matrix | |||
| def _subtree_kernel_do_unlabeled(self, Gn): | |||
| """Compute Weisfeiler-Lehman kernels between graphs without labels. | |||
| Parameters | |||
| ---------- | |||
| Gn : List of NetworkX graph | |||
| List of graphs between which the kernels are computed. | |||
| Return | |||
| ------ | |||
| gram_matrix : Numpy matrix | |||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs. | |||
| """ | |||
| gram_matrix = np.zeros((len(Gn), len(Gn))) | |||
| # initial for height = 0 | |||
| all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||
| # Compute subtree kernel with the 0th iteration and add it to the final kernel. | |||
| iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) | |||
| for i, j in iterator: | |||
| gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) | |||
| gram_matrix[j][i] = gram_matrix[i][j] | |||
| # if h >= 1. | |||
| if self.height > 0: | |||
| # When h == 1, compute the kernel. | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||
| all_num_of_each_label = [] # number of occurence of each label in G | |||
| # @todo: parallel this part. | |||
| for G in Gn: | |||
| num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
| # Compute subtree kernel with h iterations and add it to the final kernel. | |||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
| # Iterate along heights (>= 2). | |||
| for h in range(2, self.height + 1): | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||
| all_num_of_each_label = [] # number of occurence of each label in G | |||
| # @todo: parallel this part. | |||
| for G in Gn: | |||
| num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
| # Compute subtree kernel with h iterations and add it to the final kernel. | |||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
| return gram_matrix | |||
| def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||
| all_multisets = [] | |||
| for node, attrs in G.nodes(data=True): | |||
| # Multiset-label determination. | |||
| multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]] | |||
| # sorting each multiset | |||
| multiset.sort() | |||
| multiset = [attrs['lt']] + multiset # add the prefix | |||
| all_multisets.append(tuple(multiset)) | |||
| # label compression | |||
| set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
| # a dictionary mapping original labels to new ones. | |||
| set_compressed = {} | |||
| # If a label occured before, assign its former compressed label; | |||
| # otherwise assign the number of labels occured + 1 as the | |||
| # compressed label. | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||
| set_compressed[value] = all_set_compressed[value] | |||
| else: | |||
| set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? and what if num_of_labels_occured is extremely big. | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| # Relabel nodes. | |||
| for idx, node in enumerate(G.nodes()): | |||
| G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||
| # Get the set of compressed labels. | |||
| labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||
| all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
| return num_of_labels_occured | |||
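As an illustration of the relabelling rule implemented above (a standalone sketch with made-up labels, not code from the patch):

# Toy 3-node path with node labels a - b - a, mirroring the multiset/compression rule.
node_lt = {0: ('a',), 1: ('b',), 2: ('a',)}           # current 'lt' tuples
adj = {0: [1], 1: [0, 2], 2: [1]}                     # neighbourhoods
multisets = [tuple([node_lt[n]] + sorted(node_lt[m] for m in adj[n])) for n in (0, 1, 2)]
compressed = {}                                       # original multiset -> compressed label
for ms in multisets:
    if ms not in compressed:
        compressed[ms] = str(len(compressed) + 1)     # the "number of labels occurred + 1" rule
new_lt = [compressed[ms] for ms in multisets]         # ['1', '2', '1']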
| def _subtree_1graph_el(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||
| all_multisets = [] | |||
| # for node, attrs in G.nodes(data=True): | |||
| for node in G.nodes(): | |||
| # Multiset-label determination. | |||
| multiset = [G.edges[(node, neighbors)]['lt'] for neighbors in G[node]] # @todo: check reference for this. | |||
| # sorting each multiset | |||
| multiset.sort() | |||
| # multiset = [attrs['lt']] + multiset # add the prefix | |||
| all_multisets.append(tuple(multiset)) | |||
| # label compression | |||
| set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
| # a dictionary mapping original labels to new ones. | |||
| set_compressed = {} | |||
| # If a label occured before, assign its former compressed label; | |||
| # otherwise assign the number of labels occured + 1 as the | |||
| # compressed label. | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||
| set_compressed[value] = all_set_compressed[value] | |||
| else: | |||
| set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| # Relabel nodes. | |||
| for idx, node in enumerate(G.nodes()): | |||
| G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||
| # Get the set of compressed labels. | |||
| labels_comp = list(nx.get_node_attributes(G, 'lt').values()) # @todo: maybe can be faster. | |||
| all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
| return num_of_labels_occured | |||
| def _subtree_1graph_labeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||
| all_multisets = [] | |||
| for node, attrs in G.nodes(data=True): | |||
| # Multiset-label determination. | |||
| multiset = [tuple((G.edges[(node, neighbors)]['lt'], G.nodes[neighbors]['lt'])) for neighbors in G[node]] # @todo: check reference for this. | |||
| # sorting each multiset | |||
| multiset.sort() | |||
| multiset = [attrs['lt']] + multiset # add the prefix | |||
| all_multisets.append(tuple(multiset)) | |||
| # label compression | |||
| set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
| # a dictionary mapping original labels to new ones. | |||
| set_compressed = {} | |||
| # If a label occured before, assign its former compressed label; | |||
| # otherwise assign the number of labels occured + 1 as the | |||
| # compressed label. | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||
| set_compressed[value] = all_set_compressed[value] | |||
| else: | |||
| set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| # Relabel nodes. | |||
| for idx, node in enumerate(G.nodes()): | |||
| G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||
| # Get the set of compressed labels. | |||
| labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||
| all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
| return num_of_labels_occured | |||
| def _subtree_1graph_unlabeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||
| # all_multisets = [] | |||
| # for node, attrs in G.nodes(data=True): # @todo: it can be better. | |||
| # # Multiset-label determination. | |||
| # multiset = [0 for neighbors in G[node]] | |||
| # # sorting each multiset | |||
| # multiset.sort() | |||
| # multiset = [0] + multiset # add the prefix | |||
| # all_multisets.append(tuple(multiset)) | |||
| all_multisets = [len(G[node]) for node in G.nodes()] | |||
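| # (Without labels, the sorted neighbour multiset is determined by the node degree alone, so the degree itself serves as the initial label.) | |||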
| # label compression | |||
| set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
| # a dictionary mapping original labels to new ones. | |||
| set_compressed = {} | |||
| # If a label occured before, assign its former compressed label; | |||
| # otherwise assign the number of labels occured + 1 as the | |||
| # compressed label. | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||
| set_compressed[value] = all_set_compressed[value] | |||
| else: | |||
| set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| # Relabel nodes. | |||
| for idx, node in enumerate(G.nodes()): | |||
| G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||
| # Get the set of compressed labels. | |||
| labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||
| all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
| return num_of_labels_occured | |||
| def _compute_gram_itr(self, gram_matrix, all_num_of_each_label): | |||
| """Compute Gram matrix using the base kernel. | |||
| """ | |||
| # if self._parallel == 'imap_unordered': | |||
| # if self.parallel == 'imap_unordered': | |||
| # # compute kernels. | |||
| # def init_worker(alllabels_toshare): | |||
| # global G_alllabels | |||
| # G_alllabels = alllabels_toshare | |||
| # do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix) | |||
| # parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker, | |||
| # glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
| # elif self._parallel is None: | |||
| for i in range(len(gram_matrix)): | |||
| for j in range(i, len(gram_matrix)): | |||
| gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i], | |||
| all_num_of_each_label[j], gram_matrix[i][j]) | |||
| gram_matrix[j][i] = gram_matrix[i][j] | |||
| def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel): | |||
| # glbv=(all_num_of_each_label,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
| # elif self.parallel is None: | |||
| itr = combinations_with_replacement(range(0, len(gram_matrix)), 2) | |||
| len_itr = int(len(gram_matrix) * (len(gram_matrix) + 1) / 2) | |||
| iterator = get_iters(itr, desc='Computing Gram matrix for this iteration', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
| for i, j in iterator: | |||
| # for i in iterator: | |||
| # for j in range(i, len(gram_matrix)): | |||
| gram_matrix[i][j] += self._compute_subtree_kernel(all_num_of_each_label[i], | |||
| all_num_of_each_label[j]) | |||
| gram_matrix[j][i] = gram_matrix[i][j] | |||
| def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2): | |||
| """Compute the subtree kernel. | |||
| """ | |||
| labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys())) | |||
| @@ -358,7 +728,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| vector2 = np.array([(num_of_each_label2[label] | |||
| if (label in num_of_each_label2.keys()) else 0) | |||
| for label in labels]) | |||
| kernel += np.dot(vector1, vector2) | |||
| kernel = np.dot(vector1, vector2) | |||
| return kernel | |||
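A small numeric sketch of the base kernel above (the counts are made up): each per-iteration contribution is the dot product of label-count vectors, and _compute_gram_itr accumulates these contributions into the Gram matrix with +=, so the final kernel is the sum over all WL iterations.

num_of_each_label1 = {'1': 3, '2': 1}      # counts of compressed labels in graph 1
num_of_each_label2 = {'1': 2, '3': 4}      # counts of compressed labels in graph 2
labels = set(num_of_each_label1) | set(num_of_each_label2)
k = sum(num_of_each_label1.get(l, 0) * num_of_each_label2.get(l, 0) for l in labels)
# k == 3*2 + 1*0 + 0*4 == 6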
| @@ -426,9 +796,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): | |||
| set_compressed.update({ value : all_set_compressed[value] }) | |||
| set_compressed[value] = all_set_compressed[value] | |||
| else: | |||
| set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||
| set_compressed[value] = str(num_of_labels_occured + 1) | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| @@ -504,9 +874,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): | |||
| set_compressed.update({ value : all_set_compressed[value] }) | |||
| set_compressed[value] = all_set_compressed[value] | |||
| else: | |||
| set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||
| set_compressed[value] = str(num_of_labels_occured + 1) | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| @@ -577,9 +947,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): | |||
| set_compressed.update({ value : all_set_compressed[value] }) | |||
| set_compressed[value] = all_set_compressed[value] | |||
| else: | |||
| set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||
| set_compressed[value] = str(num_of_labels_occured + 1) | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| @@ -595,10 +965,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
| def _add_dummy_node_labels(self, Gn): | |||
| if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): | |||
| if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY): | |||
| for i in range(len(Gn)): | |||
| nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
| self._node_labels = [SpecialLabel.DUMMY] | |||
| self.node_labels = [SpecialLabel.DUMMY] | |||
| class WLSubtree(WeisfeilerLehman): | |||
| @@ -0,0 +1,14 @@ | |||
| # -*-coding:utf-8 -*- | |||
| """ | |||
| model learning. | |||
| """ | |||
| # info | |||
| __version__ = "0.2" | |||
| __author__ = "Linlin Jia" | |||
| __date__ = "November 2020" | |||
| from gklearn.model_learning.nested_cv import NestedCV | |||
| from gklearn.model_learning.workflow import Workflow | |||
| from gklearn.model_learning.parameters import dichotomous_permutation | |||
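With this package __init__ in place, the new components become importable at package level, e.g. (a usage note, not part of the patch):

from gklearn.model_learning import NestedCV, Workflow, dichotomous_permutation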
| @@ -0,0 +1,714 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Fri Nov 27 18:59:28 2020 | |||
| @author: ljia | |||
| """ | |||
| import os | |||
| import datetime | |||
| import time | |||
| import sys | |||
| from tqdm import tqdm | |||
| from multiprocessing import Pool, Array | |||
| from functools import partial | |||
| import numpy as np | |||
| from matplotlib import pyplot as plt | |||
| from sklearn.model_selection import KFold, train_test_split, ParameterGrid | |||
| from sklearn.kernel_ridge import KernelRidge | |||
| from sklearn.svm import SVC | |||
| from sklearn.metrics import accuracy_score, mean_squared_error | |||
| class NestedCV(object): | |||
| """Perform model selection, fitting and testing for precomputed kernels | |||
| using nested CV. Print out necessary data during the process and, finally, | |||
| the results. | |||
| Parameters | |||
| ---------- | |||
| datafile : string | |||
| Path of dataset file. | |||
| estimator : function | |||
| Kernel function used as the estimator. This function needs to return a Gram matrix. | |||
| param_grid_precomputed : dictionary | |||
| Dictionary with names (string) of parameters used to calculate gram | |||
| matrices as keys and lists of parameter settings to try as values. This | |||
| enables searching over any sequence of parameter settings. Params with | |||
| length 1 will be omitted. | |||
| param_grid : dictionary | |||
| Dictionary with names (string) of parameters used as penalties as keys | |||
| and lists of parameter settings to try as values. This enables | |||
| searching over any sequence of parameter settings. Params with length 1 | |||
| will be omitted. | |||
| model_type : string | |||
| Type of the problem, can be 'regression' or 'classification'. | |||
| NUM_TRIALS : integer | |||
| Number of random trials of the outer CV loop. The default is 30. | |||
| datafile_y : string | |||
| Path of file storing y data. This parameter is optional depending on | |||
| the given dataset file. | |||
| extra_params : dict | |||
| Extra parameters for loading dataset. See function gklearn.utils. | |||
| graphfiles.loadDataset for detail. | |||
| ds_name : string | |||
| Name of the dataset. | |||
| n_jobs : int | |||
| Number of jobs for parallelization. | |||
| read_gm_from_file : boolean | |||
| Whether gram matrices are loaded from a file. | |||
| Examples | |||
| -------- | |||
| >>> import numpy as np | |||
| >>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel | |||
| >>> from gklearn.kernels.untilHPathKernel import untilhpathkernel | |||
| >>> | |||
| >>> datafile = '../datasets/MUTAG/MUTAG_A.txt' | |||
| >>> estimator = untilhpathkernel | |||
| >>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func': | |||
| ['MinMax', 'tanimoto'], 'compute_method': ['trie']} | |||
| >>> # 'C' for classification problems and 'alpha' for regression problems. | |||
| >>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha': | |||
| np.logspace(-10, 10, num=41, base=10)}] | |||
| >>> | |||
| >>> model_selection_for_precomputed_kernel(datafile, estimator, | |||
| param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG') | |||
| """ | |||
| def __init__(self, dataset, estimator, param_grid_precomputed=None, param_grid=None, model_type=None, num_trials=30, output_dir=None, n_jobs=1, save_gms=True, save_gm_figs=False, logging=True, verbose=True, **kwargs): | |||
| tqdm.monitor_interval = 0 | |||
| self._ds = dataset | |||
| self._estimator = estimator | |||
| self._num_trials = num_trials | |||
| self._n_jobs = n_jobs | |||
| self._save_gms = save_gms | |||
| self._save_gm_figs = save_gm_figs | |||
| self._logging = logging | |||
| self._verbose = verbose | |||
| self._kwargs = kwargs | |||
| # Set dataset name. | |||
| if self._ds._ds_name is None: | |||
| self._ds_name = 'ds-unknown' | |||
| else: | |||
| self._ds_name = self._ds._ds_name | |||
| # The output directory. | |||
| if output_dir is None: | |||
| self._output_dir = os.path.join('outputs/', estimator.__name__) | |||
| else: | |||
| self._output_dir = output_dir | |||
| os.makedirs(self._output_dir, exist_ok=True) | |||
| # Setup the model type. | |||
| if model_type is None: | |||
| self._model_type = dataset._task_type | |||
| else: | |||
| self._model_type = model_type.lower() | |||
| if self._model_type != 'regression' and self._model_type != 'classification': | |||
| raise Exception('The model type is incorrect! Please choose from regression or classification.') | |||
| # @todo: Set param_grid_precomputed and param_grid. | |||
| self._param_grid_precomputed = param_grid_precomputed | |||
| self._param_grid = param_grid | |||
| if self._verbose: | |||
| print() | |||
| print('--- This is a %s problem ---' % self._model_type) | |||
| # A string to save all the results. | |||
| if self._logging: | |||
| self._str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' | |||
| self._str_fw += '# This file contains results of ' + self._estimator.__name__ + ' on dataset ' + self._ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' | |||
| self._str_fw += 'This is a %s problem.\n' % self._model_type | |||
| self.run() | |||
| def run(self): | |||
| self.fit() | |||
| self.compute_gram_matrices() | |||
| if len(self._gram_matrices) == 0: | |||
| if self._verbose: | |||
| print('All gram matrices are ignored, no results obtained.') | |||
| if self._logging: | |||
| self._str_fw += '\nAll gram matrices are ignored, no results obtained.\n\n' | |||
| else: | |||
| self.do_cv() | |||
| # print out results as table. | |||
| if self._logging: | |||
| self._str_fw += self.printResultsInTable(self._param_list, self._param_list_pre_revised, self._average_val_scores, self._std_val_scores, self._average_perf_scores, self._std_perf_scores, self._average_train_scores, self._std_train_scores, self._gram_matrix_time, self._model_type, self._verbose) | |||
| # open file to save all results for this dataset. | |||
| if not os.path.exists(self._output_dir + '/' + self._ds_name + '.output.txt'): | |||
| with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'w') as f: | |||
| f.write(self._str_fw) | |||
| else: | |||
| with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'r+') as f: | |||
| content = f.read() | |||
| f.seek(0, 0) | |||
| f.write(self._str_fw + '\n\n\n' + content) | |||
| return self._final_performance, self._final_confidence | |||
| def fit(self): | |||
| return | |||
| def compute_gram_matrices(self): | |||
| """Compute all gram matrices. | |||
| Returns | |||
| ------- | |||
| None. | |||
| """ | |||
| # Grid of parameters with a discrete number of values for each. | |||
| self._param_list_precomputed = list(ParameterGrid(self._param_grid_precomputed)) | |||
| self._param_list = list(ParameterGrid(self._param_grid)) | |||
| self._gram_matrices = [ | |||
| ] # a list to store gram matrices for all param_grid_precomputed | |||
| self._gram_matrix_time = [ | |||
| ] # a list to store time to calculate gram matrices | |||
| self._param_list_pre_revised = [ | |||
| ] # list to store param grids precomputed ignoring the useless ones | |||
| if self._verbose: | |||
| print() | |||
| print('\n1. Computing gram matrices. This could take a while...') | |||
| if self._logging: | |||
| self._str_fw += '\nI. Gram matrices.\n\n' | |||
| self._tts = time.time() # start training time | |||
| nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN) | |||
| for idx, params_out in enumerate(self._param_list_precomputed): | |||
| y = self._ds.targets[:] | |||
| params_out['n_jobs'] = self._n_jobs | |||
| params_out['verbose'] = self._verbose | |||
| # print(dataset) | |||
| # import networkx as nx | |||
| # nx.draw_networkx(dataset[1]) | |||
| # plt.show() | |||
| rtn_data = self._estimator(self._ds.graphs[:], **params_out) # @todo: Attention! this will not copy the graphs. | |||
| Kmatrix = rtn_data[0] | |||
| current_run_time = rtn_data[1] | |||
| # for some kernels, some graphs in datasets may not meet the | |||
| # kernels' requirements for graph structure. These graphs are trimmed. | |||
| if len(rtn_data) == 3: | |||
| idx_trim = rtn_data[2] # the index of trimmed graph list | |||
| y = [y[idxt] for idxt in idx_trim] # trim y accordingly | |||
| # Kmatrix = np.random.rand(2250, 2250) | |||
| # current_run_time = 0.1 | |||
| # remove graphs whose kernels with themselves are zeros | |||
| # @todo: y not changed accordingly? | |||
| Kmatrix_diag = Kmatrix.diagonal().copy() | |||
| nb_g_ignore = 0 | |||
| for idxk, diag in enumerate(Kmatrix_diag): | |||
| if diag == 0: | |||
| Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0) | |||
| Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) | |||
| nb_g_ignore += 1 | |||
| # normalization | |||
| # @todo: works only for undirected graph? | |||
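| # i.e. cosine normalization: K[i][j] <- K[i][j] / sqrt(K[i][i] * K[j][j]), which makes every self-kernel equal to 1. | |||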
| Kmatrix_diag = Kmatrix.diagonal().copy() | |||
| for i in range(len(Kmatrix)): | |||
| for j in range(i, len(Kmatrix)): | |||
| Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| if self._verbose: | |||
| print() | |||
| if params_out == {}: | |||
| if self._verbose: | |||
| print('the gram matrix is: ') | |||
| if self._logging: | |||
| self._str_fw += 'the gram matrix is:\n\n' | |||
| else: | |||
| if self._verbose: | |||
| print('the gram matrix with parameters', params_out, 'is: \n\n') | |||
| if self._logging: | |||
| self._str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out | |||
| if len(Kmatrix) < 2: | |||
| nb_gm_ignore += 1 | |||
| if self._verbose: | |||
| print('ignored, as at most only one of all its diagonal value is non-zero.') | |||
| if self._logging: | |||
| self._str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' | |||
| else: | |||
| if np.isnan(Kmatrix).any( | |||
| ): # if the matrix contains elements that are not numbers | |||
| nb_gm_ignore += 1 | |||
| if self._verbose: | |||
| print('ignored, as it contains elements that are not numbers.') | |||
| if self._logging: | |||
| self._str_fw += 'ignored, as it contains elements that are not numbers.\n\n' | |||
| else: | |||
| # print(Kmatrix) | |||
| if self._logging: | |||
| self._str_fw += np.array2string( | |||
| Kmatrix, | |||
| separator=',') + '\n\n' | |||
| # separator=',', | |||
| # threshold=np.inf, | |||
| # floatmode='unique') + '\n\n' | |||
| # Draw and save Gram matrix figures. | |||
| if self._save_gm_figs: | |||
| fig_file_name = self._output_dir + '/GM[ds]' + self._ds_name | |||
| if params_out != {}: | |||
| fig_file_name += '[params]' + str(idx) | |||
| plt.imshow(Kmatrix) | |||
| plt.colorbar() | |||
| plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) | |||
| # plt.show() | |||
| plt.clf() | |||
| self._gram_matrices.append(Kmatrix) | |||
| self._gram_matrix_time.append(current_run_time) | |||
| self._param_list_pre_revised.append(params_out) | |||
| if nb_g_ignore > 0: | |||
| if self._verbose: | |||
| print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) | |||
| if self._logging: | |||
| self._str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore | |||
| if self._verbose: | |||
| print() | |||
| print('{} gram matrices are calculated, {} of which are ignored.'.format(len(self._param_list_precomputed), nb_gm_ignore)) | |||
| if self._logging: | |||
| self._str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(self._param_list_precomputed), nb_gm_ignore) | |||
| self._str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' | |||
| self._str_fw += ''.join(['{}: {}\n'.format(idx, params_out) for idx, params_out in enumerate(self._param_list_precomputed)]) | |||
| def do_cv(self): | |||
| # save gram matrices to file. | |||
| # np.savez(output_dir + '/' + ds_name + '.gm', | |||
| # gms=gram_matrices, params=param_list_pre_revised, y=y, | |||
| # gmtime=gram_matrix_time) | |||
| if self._verbose: | |||
| print('2. Fitting and predicting using nested cross validation. This could really take a while...') | |||
| # ---- use pool.imap_unordered to parallel and track progress. ---- | |||
| # train_pref = [] | |||
| # val_pref = [] | |||
| # test_pref = [] | |||
| # def func_assign(result, var_to_assign): | |||
| # for idx, itm in enumerate(var_to_assign): | |||
| # itm.append(result[idx]) | |||
| # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type) | |||
| # | |||
| # parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign, | |||
| # [train_pref, val_pref, test_pref], glbv=gram_matrices, | |||
| # method='imap_unordered', n_jobs=n_jobs, chunksize=1, | |||
| # itr_desc='cross validation') | |||
| def init_worker(gms_toshare): | |||
| global G_gms | |||
| G_gms = gms_toshare | |||
| # gram_matrices = np.array(gram_matrices) | |||
| # gms_shape = gram_matrices.shape | |||
| # gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C')) | |||
| # pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape)) | |||
| pool = Pool(processes=self._n_jobs, initializer=init_worker, initargs=(self._gram_matrices,)) | |||
| trial_do_partial = partial(self._parallel_trial_do, self._param_list_pre_revised, self._param_list, self._ds.targets[:], self._model_type) # @todo: maybe self._ds.targets[:] should be y. | |||
| train_pref = [] | |||
| val_pref = [] | |||
| test_pref = [] | |||
| # if NUM_TRIALS < 1000 * n_jobs: | |||
| # chunksize = int(NUM_TRIALS / n_jobs) + 1 | |||
| # else: | |||
| # chunksize = 1000 | |||
| chunksize = 1 | |||
| if self._verbose: | |||
| iterator = tqdm(pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize), desc='cross validation', file=sys.stdout) | |||
| else: | |||
| iterator = pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize) | |||
| for o1, o2, o3 in iterator: | |||
| train_pref.append(o1) | |||
| val_pref.append(o2) | |||
| test_pref.append(o3) | |||
| pool.close() | |||
| pool.join() | |||
| # # ---- use pool.map to parallel. ---- | |||
| # pool = Pool(n_jobs) | |||
| # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type) | |||
| # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||
| # train_pref = [item[0] for item in result_perf] | |||
| # val_pref = [item[1] for item in result_perf] | |||
| # test_pref = [item[2] for item in result_perf] | |||
| # # ---- direct running, normally use a single CPU core. ---- | |||
| # train_pref = [] | |||
| # val_pref = [] | |||
| # test_pref = [] | |||
| # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||
| # o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||
| # train_pref.append(o1) | |||
| # val_pref.append(o2) | |||
| # test_pref.append(o3) | |||
| # print() | |||
| if self._verbose: | |||
| print() | |||
| print('3. Getting final performance...') | |||
| if self._logging: | |||
| self._str_fw += '\nII. Performance.\n\n' | |||
| # averages and confidences of performances on outer trials for each combination of parameters | |||
| self._average_train_scores = np.mean(train_pref, axis=0) | |||
| # print('val_pref: ', val_pref[0][0]) | |||
| self._average_val_scores = np.mean(val_pref, axis=0) | |||
| # print('test_pref: ', test_pref[0][0]) | |||
| self._average_perf_scores = np.mean(test_pref, axis=0) | |||
| # sample std is used here | |||
| self._std_train_scores = np.std(train_pref, axis=0, ddof=1) | |||
| self._std_val_scores = np.std(val_pref, axis=0, ddof=1) | |||
| self._std_perf_scores = np.std(test_pref, axis=0, ddof=1) | |||
| if self._model_type == 'regression': | |||
| best_val_perf = np.amin(self._average_val_scores) | |||
| else: | |||
| best_val_perf = np.amax(self._average_val_scores) | |||
| # print('average_val_scores: ', self._average_val_scores) | |||
| # print('best_val_perf: ', best_val_perf) | |||
| # print() | |||
| best_params_index = np.where(self._average_val_scores == best_val_perf) | |||
| # find smallest val std with best val perf. | |||
| best_val_stds = [ | |||
| self._std_val_scores[value][best_params_index[1][idx]] | |||
| for idx, value in enumerate(best_params_index[0]) | |||
| ] | |||
| min_val_std = np.amin(best_val_stds) | |||
| best_params_index = np.where(self._std_val_scores == min_val_std) | |||
| best_params_out = [self._param_list_pre_revised[i] for i in best_params_index[0]] | |||
| best_params_in = [self._param_list[i] for i in best_params_index[1]] | |||
| if self._verbose: | |||
| print('best_params_out: ', best_params_out) | |||
| print('best_params_in: ', best_params_in) | |||
| print() | |||
| print('best_val_perf: ', best_val_perf) | |||
| print('best_val_std: ', min_val_std) | |||
| if self._logging: | |||
| self._str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out | |||
| self._str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in | |||
| self._str_fw += 'best_val_perf: %s\n' % best_val_perf | |||
| self._str_fw += 'best_val_std: %s\n' % min_val_std | |||
| # print(best_params_index) | |||
| # print(best_params_index[0]) | |||
| # print(self._average_perf_scores) | |||
| self._final_performance = [ | |||
| self._average_perf_scores[value][best_params_index[1][idx]] | |||
| for idx, value in enumerate(best_params_index[0]) | |||
| ] | |||
| self._final_confidence = [ | |||
| self._std_perf_scores[value][best_params_index[1][idx]] | |||
| for idx, value in enumerate(best_params_index[0]) | |||
| ] | |||
| if self._verbose: | |||
| print('final_performance: ', self._final_performance) | |||
| print('final_confidence: ', self._final_confidence) | |||
| if self._logging: | |||
| self._str_fw += 'final_performance: %s\n' % self._final_performance | |||
| self._str_fw += 'final_confidence: %s\n' % self._final_confidence | |||
| train_performance = [ | |||
| self._average_train_scores[value][best_params_index[1][idx]] | |||
| for idx, value in enumerate(best_params_index[0]) | |||
| ] | |||
| train_std = [ | |||
| self._std_train_scores[value][best_params_index[1][idx]] | |||
| for idx, value in enumerate(best_params_index[0]) | |||
| ] | |||
| if self._verbose: | |||
| print('train_performance: %s' % train_performance) | |||
| print('train_std: ', train_std) | |||
| if self._logging: | |||
| self._str_fw += 'train_performance: %s\n' % train_performance | |||
| self._str_fw += 'train_std: %s\n\n' % train_std | |||
| if self._verbose: | |||
| print() | |||
| tt_total = time.time() - self._tts # training time for all hyper-parameters | |||
| average_gram_matrix_time = np.mean(self._gram_matrix_time) | |||
| std_gram_matrix_time = np.std(self._gram_matrix_time, ddof=1) if len(self._gram_matrix_time) > 1 else 0 | |||
| best_gram_matrix_time = [self._gram_matrix_time[i] for i in best_params_index[0]] | |||
| ave_bgmt = np.mean(best_gram_matrix_time) | |||
| std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0 | |||
| if self._verbose: | |||
| print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' | |||
| .format(average_gram_matrix_time, std_gram_matrix_time)) | |||
| print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( | |||
| ave_bgmt, std_bgmt)) | |||
| print('total training time with all hyper-param choices: {:.2f}s'.format( | |||
| tt_total)) | |||
| if self._logging: | |||
| self._str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) | |||
| self._str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) | |||
| self._str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) | |||
| # # save results to file | |||
| # np.savetxt(results_name_pre + 'average_train_scores.dt', | |||
| # average_train_scores) | |||
| # np.savetxt(results_name_pre + 'average_val_scores', self._average_val_scores) | |||
| # np.savetxt(results_name_pre + 'average_perf_scores.dt', | |||
| # average_perf_scores) | |||
| # np.savetxt(results_name_pre + 'std_train_scores.dt', self._std_train_scores) | |||
| # np.savetxt(results_name_pre + 'std_val_scores.dt', self._std_val_scores) | |||
| # np.savetxt(results_name_pre + 'std_perf_scores.dt', self._std_perf_scores) | |||
| # np.save(results_name_pre + 'best_params_index', best_params_index) | |||
| # np.save(results_name_pre + 'best_params_pre.dt', best_params_out) | |||
| # np.save(results_name_pre + 'best_params_in.dt', best_params_in) | |||
| # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) | |||
| # np.save(results_name_pre + 'best_val_std.dt', best_val_std) | |||
| # np.save(results_name_pre + 'final_performance.dt', self._final_performance) | |||
| # np.save(results_name_pre + 'final_confidence.dt', self._final_confidence) | |||
| # np.save(results_name_pre + 'train_performance.dt', train_performance) | |||
| # np.save(results_name_pre + 'train_std.dt', train_std) | |||
| # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) | |||
| # np.save(results_name_pre + 'average_gram_matrix_time.dt', | |||
| # average_gram_matrix_time) | |||
| # np.save(results_name_pre + 'std_gram_matrix_time.dt', | |||
| # std_gram_matrix_time) | |||
| # np.save(results_name_pre + 'best_gram_matrix_time.dt', | |||
| # best_gram_matrix_time) | |||
| def trial_do(self, param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level | |||
| # # get gram matrices from global variables. | |||
| # gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C') | |||
| # Arrays to store scores | |||
| train_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
| val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
| test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
| # Randomness is added to the seeds of the split functions below. "high" is | |||
| # "size" times 10 so that at least 10 different random outputs can be | |||
| # yielded. Remove these lines if identical outputs are required. | |||
| rdm_out = np.random.RandomState(seed=None) | |||
| rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10, | |||
| size=len(param_list_pre_revised)) | |||
| # print(trial, rdm_seed_out_l) | |||
| # print() | |||
| # loop for each outer param tuple | |||
| for index_out, params_out in enumerate(param_list_pre_revised): | |||
| # get gram matrices from global variables. | |||
| # gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]] | |||
| # gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C') | |||
| gm_now = gram_matrices[index_out].copy() | |||
| # split gram matrix and y to app and test sets. | |||
| indices = range(len(y)) | |||
| # The argument "random_state" in function "train_test_split" can not be | |||
| # set to None, because it will use RandomState instance used by | |||
| # np.random, which is possible for multiple subprocesses to inherit the | |||
| # same seed if they forked at the same time, leading to identical | |||
| # random variates for different subprocesses. Instead, we use "trial" | |||
| # and "index_out" parameters to generate different seeds for different | |||
| # trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add | |||
| # randomness into seeds, so that it yields a different output every | |||
| # time the program is run. To yield identical outputs every time, | |||
| # remove the second line below. Same method is used to the "KFold" | |||
| # function in the inner loop. | |||
| rdm_seed_out = (trial + 1) * (index_out + 1) | |||
| rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1) | |||
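| # A small illustration of the seed formula above (hypothetical values): | |||
| # with trial=2, index_out=3 and int(rdm_seed_out_l[3])=17, the seed becomes | |||
| # ((2 + 1) * (3 + 1) + 17) % (2 ** 32 - 1) = 29, so each (trial, outer | |||
| # index) pair gets its own within-run-reproducible seed. | |||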
| # print(trial, rdm_seed_out) | |||
| X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( | |||
| gm_now, y, indices, test_size=0.1, | |||
| random_state=rdm_seed_out, shuffle=True) | |||
| # print(trial, idx_app, idx_test) | |||
| # print() | |||
| X_app = X_app[:, idx_app] | |||
| X_test = X_test[:, idx_app] | |||
| y_app = np.array(y_app) | |||
| y_test = np.array(y_test) | |||
| rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10, | |||
| size=len(param_list)) | |||
| # loop for each inner param tuple | |||
| for index_in, params_in in enumerate(param_list): | |||
| # if trial == 0: | |||
| # print(index_out, index_in) | |||
| # print('params_in: ', params_in) | |||
| # st = time.time() | |||
| rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1) | |||
| # print("rdm_seed_in1: ", trial, index_in, rdm_seed_in) | |||
| rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1) | |||
| # print("rdm_seed_in2: ", trial, index_in, rdm_seed_in) | |||
| inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in) | |||
| current_train_perf = [] | |||
| current_valid_perf = [] | |||
| current_test_perf = [] | |||
| # For regression use the Kernel Ridge method | |||
| # try: | |||
| if self._model_type == 'regression': | |||
| kr = KernelRidge(kernel='precomputed', **params_in) | |||
| # loop for each split on validation set level | |||
| # validation set level | |||
| for train_index, valid_index in inner_cv.split(X_app): | |||
| # print("train_index, valid_index: ", trial, index_in, train_index, valid_index) | |||
| # if trial == 0: | |||
| # print('train_index: ', train_index) | |||
| # print('valid_index: ', valid_index) | |||
| # print('idx_test: ', idx_test) | |||
| # print('y_app[train_index]: ', y_app[train_index]) | |||
| # print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) | |||
| # print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) | |||
| kr.fit(X_app[train_index, :][:, train_index], | |||
| y_app[train_index]) | |||
| # predict on the train, validation and test set | |||
| y_pred_train = kr.predict( | |||
| X_app[train_index, :][:, train_index]) | |||
| y_pred_valid = kr.predict( | |||
| X_app[valid_index, :][:, train_index]) | |||
| # if trial == 0: | |||
| # print('y_pred_valid: ', y_pred_valid) | |||
| # print() | |||
| y_pred_test = kr.predict( | |||
| X_test[:, train_index]) | |||
| # root mean squared errors | |||
| current_train_perf.append( | |||
| np.sqrt( | |||
| mean_squared_error( | |||
| y_app[train_index], y_pred_train))) | |||
| current_valid_perf.append( | |||
| np.sqrt( | |||
| mean_squared_error( | |||
| y_app[valid_index], y_pred_valid))) | |||
| # if trial == 0: | |||
| # print(mean_squared_error( | |||
| # y_app[valid_index], y_pred_valid)) | |||
| current_test_perf.append( | |||
| np.sqrt( | |||
| mean_squared_error( | |||
| y_test, y_pred_test))) | |||
| # For classification use SVM | |||
| else: | |||
| svc = SVC(kernel='precomputed', cache_size=200, | |||
| verbose=False, **params_in) | |||
| # loop for each split on validation set level | |||
| # validation set level | |||
| for train_index, valid_index in inner_cv.split(X_app): | |||
| # np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) | |||
| # if trial == 0: | |||
| # print('train_index: ', train_index) | |||
| # print('valid_index: ', valid_index) | |||
| # print('idx_test: ', idx_test) | |||
| # print('y_app[train_index]: ', y_app[train_index]) | |||
| # print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) | |||
| # print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) | |||
| svc.fit(X_app[train_index, :][:, train_index], | |||
| y_app[train_index]) | |||
| # predict on the train, validation and test set | |||
| y_pred_train = svc.predict( | |||
| X_app[train_index, :][:, train_index]) | |||
| y_pred_valid = svc.predict( | |||
| X_app[valid_index, :][:, train_index]) | |||
| y_pred_test = svc.predict( | |||
| X_test[:, train_index]) | |||
| # classification accuracies | |||
| current_train_perf.append( | |||
| accuracy_score(y_app[train_index], | |||
| y_pred_train)) | |||
| current_valid_perf.append( | |||
| accuracy_score(y_app[valid_index], | |||
| y_pred_valid)) | |||
| current_test_perf.append( | |||
| accuracy_score(y_test, y_pred_test)) | |||
| # except ValueError: | |||
| # print(sys.exc_info()[0]) | |||
| # print(params_out, params_in) | |||
| # average performance on inner splits | |||
| train_pref[index_out][index_in] = np.mean( | |||
| current_train_perf) | |||
| val_pref[index_out][index_in] = np.mean( | |||
| current_valid_perf) | |||
| test_pref[index_out][index_in] = np.mean( | |||
| current_test_perf) | |||
| # print(time.time() - st) | |||
| # if trial == 0: | |||
| # print('val_pref: ', val_pref) | |||
| # print('test_pref: ', test_pref) | |||
| return train_pref, val_pref, test_pref | |||
| def _parallel_trial_do(self, param_list_pre_revised, param_list, y, model_type, trial): | |||
| train_pref, val_pref, test_pref = self.trial_do(param_list_pre_revised, | |||
| param_list, G_gms, y, | |||
| model_type, trial) | |||
| return train_pref, val_pref, test_pref | |||
| def printResultsInTable(self, param_list, param_list_pre_revised, average_val_scores, | |||
| std_val_scores, average_perf_scores, std_perf_scores, | |||
| average_train_scores, std_train_scores, gram_matrix_time, | |||
| model_type, verbose): | |||
| from collections import OrderedDict | |||
| from tabulate import tabulate | |||
| table_dict = {} | |||
| if model_type == 'regression': | |||
| for param_in in param_list: | |||
| param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) | |||
| else: | |||
| for param_in in param_list: | |||
| param_in['C'] = '{:.2e}'.format(param_in['C']) | |||
| table_dict['params'] = [{**param_out, **param_in} | |||
| for param_in in param_list for param_out in param_list_pre_revised] | |||
| table_dict['gram_matrix_time'] = [ | |||
| '{:.2f}'.format(gram_matrix_time[index_out]) | |||
| for param_in in param_list | |||
| for index_out, _ in enumerate(param_list_pre_revised) | |||
| ] | |||
| table_dict['valid_perf'] = [ | |||
| '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], | |||
| std_val_scores[index_out][index_in]) | |||
| for index_in, _ in enumerate(param_list) | |||
| for index_out, _ in enumerate(param_list_pre_revised) | |||
| ] | |||
| table_dict['test_perf'] = [ | |||
| '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], | |||
| std_perf_scores[index_out][index_in]) | |||
| for index_in, _ in enumerate(param_list) | |||
| for index_out, _ in enumerate(param_list_pre_revised) | |||
| ] | |||
| table_dict['train_perf'] = [ | |||
| '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], | |||
| std_train_scores[index_out][index_in]) | |||
| for index_in, _ in enumerate(param_list) | |||
| for index_out, _ in enumerate(param_list_pre_revised) | |||
| ] | |||
| keyorder = [ | |||
| 'params', 'train_perf', 'valid_perf', 'test_perf', | |||
| 'gram_matrix_time' | |||
| ] | |||
| if verbose: | |||
| print() | |||
| tb_print = tabulate(OrderedDict(sorted(table_dict.items(), | |||
| key=lambda i: keyorder.index(i[0]))), headers='keys') | |||
| # print(tb_print) | |||
| return 'table of performance vs. hyper-params:\n\n%s\n\n' % tb_print | |||
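| # A hedged, standalone sketch of the column-ordering trick used above: | |||
| # sorting the dict items by their position in "keyorder" before handing | |||
| # them to tabulate fixes the column order (the values below are made up). | |||
| # from collections import OrderedDict | |||
| # from tabulate import tabulate | |||
| # keyorder = ['params', 'train_perf', 'valid_perf', 'test_perf', 'gram_matrix_time'] | |||
| # table_dict = {'valid_perf': ['0.90±0.02'], 'params': [{'C': '1.00e+01'}], | |||
| #               'train_perf': ['0.95±0.01'], 'test_perf': ['0.88±0.03'], | |||
| #               'gram_matrix_time': ['1.23']} | |||
| # print(tabulate(OrderedDict(sorted(table_dict.items(), | |||
| #                key=lambda i: keyorder.index(i[0]))), headers='keys')) | |||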
| @@ -0,0 +1,89 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Fri May 21 12:18:02 2021 | |||
| @author: ljia | |||
| """ | |||
| def dichotomous_permutation(arr, layer=0): | |||
| import math | |||
| # def seperate_arr(arr, new_arr): | |||
| # if (length % 2) == 0: | |||
| # half = int(length / 2) | |||
| # new_arr += [arr[half - 1], arr[half]] | |||
| # subarr1 = [arr[i] for i in range(1, half - 1)] | |||
| # else: | |||
| # half = math.floor(length / 2) | |||
| # new_arr.append(arr[half]) | |||
| # subarr1 = [arr[i] for i in range(1, half)] | |||
| # subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
| # subarrs = [subarr1, subarr2] | |||
| # return subarrs | |||
| if layer == 0: | |||
| length = len(arr) | |||
| if length <= 2: | |||
| return arr | |||
| new_arr = [arr[0], arr[-1]] | |||
| if (length % 2) == 0: | |||
| half = int(length / 2) | |||
| new_arr += [arr[half - 1], arr[half]] | |||
| subarr1 = [arr[i] for i in range(1, half - 1)] | |||
| else: | |||
| half = math.floor(length / 2) | |||
| new_arr.append(arr[half]) | |||
| subarr1 = [arr[i] for i in range(1, half)] | |||
| subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
| subarrs = [subarr1, subarr2] | |||
| # subarrs = seperate_arr(arr, new_arr) | |||
| new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||
| else: | |||
| new_arr = [] | |||
| subarrs = [] | |||
| for a in arr: | |||
| length = len(a) | |||
| if length <= 2: | |||
| new_arr += a | |||
| else: | |||
| # subarrs += seperate_arr(a, new_arr) | |||
| if (length % 2) == 0: | |||
| half = int(length / 2) | |||
| new_arr += [a[half - 1], a[half]] | |||
| subarr1 = [a[i] for i in range(0, half - 1)] | |||
| else: | |||
| half = math.floor(length / 2) | |||
| new_arr.append(a[half]) | |||
| subarr1 = [a[i] for i in range(0, half)] | |||
| subarr2 = [a[i] for i in range(half + 1, length)] | |||
| subarrs += [subarr1, subarr2] | |||
| if len(subarrs) > 0: | |||
| new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||
| return new_arr | |||
| # length = len(arr) | |||
| # if length <= 2: | |||
| # return arr | |||
| # new_arr = [arr[0], arr[-1]] | |||
| # if (length % 2) == 0: | |||
| # half = int(length / 2) | |||
| # new_arr += [arr[half - 1], arr[half]] | |||
| # subarr1 = [arr[i] for i in range(1, half - 1)] | |||
| # else: | |||
| # half = math.floor(length / 2) | |||
| # new_arr.append(arr[half]) | |||
| # subarr1 = [arr[i] for i in range(1, half)] | |||
| # subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
| # if len(subarr1) > 0: | |||
| # new_arr += dichotomous_permutation(subarr1) | |||
| # if len(subarr2) > 0: | |||
| # new_arr += dichotomous_permutation(subarr2) | |||
| # return new_arr | |||
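| # Example of the ordering produced by dichotomous_permutation above | |||
| # (obtained by tracing the code: endpoints first, then the midpoints of | |||
| # each remaining span): | |||
| # >>> dichotomous_permutation(list(range(9))) | |||
| # [0, 8, 4, 2, 6, 1, 3, 5, 7] | |||
| # >>> dichotomous_permutation(list(range(8))) | |||
| # [0, 7, 3, 4, 1, 2, 5, 6] | |||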
| @@ -0,0 +1,109 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Fri Nov 27 19:33:51 2020 | |||
| @author: ljia | |||
| """ | |||
| import os | |||
| import numpy as np | |||
| import pickle | |||
| from gklearn.dataset import Dataset | |||
| from gklearn.model_learning import NestedCV | |||
| from gklearn.kernels import GRAPH_KERNELS | |||
| class Workflow(object): | |||
| def __init__(self, **kwargs): | |||
| self._job_prefix = kwargs.get('job_prefix', 'gktask') | |||
| self._max_num_running_tasks = kwargs.get('max_num_running_tasks', np.inf) | |||
| self._root_dir = kwargs.get('root_dir', 'outputs/') | |||
| def run(self, tasks): | |||
| ### Check inputs. | |||
| if self._check_inputs(tasks): | |||
| self._tasks = tasks | |||
| else: | |||
| raise ValueError('The input "tasks" is not correct.') | |||
| ### Sort tasks. | |||
| self.sort_tasks_by_complexity() | |||
| ### The main process. | |||
| complete = False | |||
| while not complete: | |||
| self.get_running_tasks() | |||
| if self._num_running_tasks < self._max_num_running_tasks: | |||
| ### Load results from table. | |||
| self.load_results_from_table() | |||
| for task in self._tasks: | |||
| state = self.get_task_state(task) | |||
| if state != 'complete' and state != 'running': | |||
| self.run_task(task) | |||
| if self._num_running_tasks >= self._max_num_running_tasks: | |||
| break | |||
| ### Save results. | |||
| self.save_results() | |||
| complete = self.check_completeness() | |||
| # sleep() | |||
| def _check_inputs(self, tasks): | |||
| if not isinstance(tasks, list): | |||
| return False | |||
| else: | |||
| for i in tasks: | |||
| if 'kernel' not in i or 'dataset' not in i: | |||
| return False | |||
| return True | |||
| def sort_tasks_by_complexity(self): | |||
| return | |||
| def get_running_tasks(self): | |||
| command = 'squeue --user $USER --format "%.50j" --noheader' | |||
| stream = os.popen(command) | |||
| output = stream.readlines() | |||
| running_tasks = [o for o in output if o.strip().startswith(self._job_prefix)] | |||
| self._num_running_tasks = len(running_tasks) | |||
| def load_results_from_table(self): | |||
| pass | |||
| def get_task_state(self, task): | |||
| task_dir = os.path.join(self._root_dir, task['kernel'] + '.' + task['dataset'] + '/') | |||
| fn_summary = os.path.join(task_dir, 'results_summary.pkl') | |||
| if os.path.isfile(fn_summary): | |||
| with open(fn_summary, 'rb') as f:  # pickle.load expects a file object, not a path | |||
| output = pickle.load(f) | |||
| state = output['state'] | |||
| return state | |||
| else: | |||
| return 'unstarted' | |||
| def run_task(self, task): | |||
| ds_name = task['dataset'] | |||
| k_name = task['kernel'] | |||
| # Get dataset. | |||
| ds = Dataset(ds_name) | |||
| graph_kernel = GRAPH_KERNELS[k_name] | |||
| # Start CV. | |||
| results = NestedCV(ds, graph_kernel) | |||
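| # A minimal usage sketch of the Workflow class above. The task dicts only | |||
| # need the 'kernel' and 'dataset' keys checked by _check_inputs(); the | |||
| # kernel and dataset names below are placeholders. | |||
| # workflow = Workflow(job_prefix='gktask', max_num_running_tasks=4, | |||
| #                     root_dir='outputs/') | |||
| # workflow.run([{'kernel': 'ShortestPath', 'dataset': 'Acyclic'}, | |||
| #               {'kernel': 'Treelet', 'dataset': 'MAO'}]) | |||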
| @@ -25,34 +25,40 @@ def chooseDataset(ds_name): | |||
| current_path = os.path.dirname(os.path.realpath(__file__)) + '/' | |||
| root = current_path + '../../datasets/' | |||
| # no node labels (and no edge labels). | |||
| if ds_name == 'Alkane': | |||
| # no labels at all. | |||
| if ds_name == 'Alkane_unlabeled': | |||
| dataset = Dataset('Alkane_unlabeled', root=root) | |||
| dataset.trim_dataset(edge_required=False) | |||
| dataset.cut_graphs(range(1, 10)) | |||
| # node symbolic labels. | |||
| # node symbolic labels only. | |||
| elif ds_name == 'Acyclic': | |||
| dataset = Dataset('Acyclic', root=root) | |||
| dataset.trim_dataset(edge_required=False) | |||
| # node non-symbolic labels. | |||
| # node non-symbolic labels only. | |||
| elif ds_name == 'Letter-med': | |||
| dataset = Dataset('Letter-med', root=root) | |||
| dataset.trim_dataset(edge_required=False) | |||
| # node symbolic and non-symbolic labels (and edge symbolic labels). | |||
| # node symbolic + non-symbolic labels + edge symbolic labels. | |||
| elif ds_name == 'AIDS': | |||
| dataset = Dataset('AIDS', root=root) | |||
| dataset.trim_dataset(edge_required=False) | |||
| # edge non-symbolic labels (no node labels). | |||
| elif ds_name == 'Fingerprint_edge': | |||
| # node non-symbolic labels + edge non-symbolic labels. | |||
| elif ds_name == 'Fingerprint': | |||
| dataset = Dataset('Fingerprint', root=root) | |||
| dataset.trim_dataset(edge_required=True) | |||
| irrelevant_labels = {'edge_attrs': ['orient', 'angle']} | |||
| # edge symbolic only. | |||
| elif ds_name == 'MAO': | |||
| dataset = Dataset('MAO', root=root) | |||
| dataset.trim_dataset(edge_required=True) | |||
| irrelevant_labels = {'node_labels': ['atom_symbol'], 'node_attrs': ['x', 'y']} | |||
| dataset.remove_labels(**irrelevant_labels) | |||
| # edge non-symbolic labels (and node non-symbolic labels). | |||
| elif ds_name == 'Fingerprint': | |||
| # edge non-symbolic labels only. | |||
| elif ds_name == 'Fingerprint_edge': | |||
| dataset = Dataset('Fingerprint', root=root) | |||
| dataset.trim_dataset(edge_required=True) | |||
| # edge symbolic and non-symbolic labels (and node symbolic and non-symbolic labels). | |||
| irrelevant_labels = {'edge_attrs': ['orient', 'angle']} | |||
| dataset.remove_labels(**irrelevant_labels) | |||
| # node symbolic and non-symbolic labels + edge symbolic and non-symbolic labels. | |||
| elif ds_name == 'Cuneiform': | |||
| dataset = Dataset('Cuneiform', root=root) | |||
| dataset.trim_dataset(edge_required=True) | |||
| @@ -91,7 +97,7 @@ def assert_equality(compute_fun, **kwargs): | |||
| assert np.array_equal(lst[i], lst[i + 1]) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||
| @pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')]) | |||
| # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
| def test_CommonWalk(ds_name, weight, compute_method): | |||
| @@ -126,7 +132,7 @@ def test_CommonWalk(ds_name, weight, compute_method): | |||
| assert_equality(compute, parallel=['imap_unordered', None]) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||
| @pytest.mark.parametrize('remove_totters', [False]) #[True, False]) | |||
| # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
| def test_Marginalized(ds_name, remove_totters): | |||
| @@ -319,13 +325,13 @@ def test_SpectralDecomposition(ds_name, sub_kernel): | |||
| # @pytest.mark.parametrize( | |||
| # 'compute_method,ds_name,sub_kernel', | |||
| # [ | |||
| # ('sylvester', 'Alkane', None), | |||
| # ('conjugate', 'Alkane', None), | |||
| # ('sylvester', 'Alkane_unlabeled', None), | |||
| # ('conjugate', 'Alkane_unlabeled', None), | |||
| # ('conjugate', 'AIDS', None), | |||
| # ('fp', 'Alkane', None), | |||
| # ('fp', 'Alkane_unlabeled', None), | |||
| # ('fp', 'AIDS', None), | |||
| # ('spectral', 'Alkane', 'exp'), | |||
| # ('spectral', 'Alkane', 'geo'), | |||
| # ('spectral', 'Alkane_unlabeled', 'exp'), | |||
| # ('spectral', 'Alkane_unlabeled', 'geo'), | |||
| # ] | |||
| # ) | |||
| # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
| @@ -365,7 +371,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel): | |||
| # assert False, exception | |||
| @pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||
| # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
| def test_ShortestPath(ds_name): | |||
| """Test shortest path kernel. | |||
| @@ -401,8 +407,8 @@ def test_ShortestPath(ds_name): | |||
| assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False]) | |||
| #@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform']) | |||
| #@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform']) | |||
| # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
| def test_StructuralSP(ds_name): | |||
| """Test structural shortest path kernel. | |||
| @@ -441,7 +447,7 @@ def test_StructuralSP(ds_name): | |||
| assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False]) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||
| # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
| #@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto', None]) | |||
| @pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto']) | |||
| @@ -476,7 +482,7 @@ def test_PathUpToH(ds_name, k_func): | |||
| compute_method=['trie', 'naive']) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||
| # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
| def test_Treelet(ds_name): | |||
| """Test treelet kernel. | |||
| @@ -510,7 +516,7 @@ def test_Treelet(ds_name): | |||
| assert_equality(compute, parallel=['imap_unordered', None]) | |||
| @pytest.mark.parametrize('ds_name', ['Acyclic']) | |||
| @pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'MAO', 'AIDS']) | |||
| #@pytest.mark.parametrize('base_kernel', ['subtree', 'sp', 'edge']) | |||
| # @pytest.mark.parametrize('base_kernel', ['subtree']) | |||
| # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
| @@ -540,17 +546,17 @@ def test_WLSubtree(ds_name): | |||
| else: | |||
| return gram_matrix, kernel_list, kernel | |||
| assert_equality(compute, parallel=['imap_unordered', None]) | |||
| assert_equality(compute, parallel=[None, 'imap_unordered']) | |||
| if __name__ == "__main__": | |||
| test_list_graph_kernels() | |||
| # test_spkernel('Alkane', 'imap_unordered') | |||
| # test_ShortestPath('Alkane') | |||
| # test_list_graph_kernels() | |||
| # test_spkernel('Alkane_unlabeled', 'imap_unordered') | |||
| # test_ShortestPath('Alkane_unlabeled') | |||
| # test_StructuralSP('Fingerprint_edge', 'imap_unordered') | |||
| # test_StructuralSP('Acyclic') | |||
| # test_StructuralSP('Cuneiform', None) | |||
| # test_WLSubtree('Acyclic') | |||
| test_WLSubtree('MAO') # 'Alkane_unlabeled', 'Acyclic', 'AIDS' | |||
| # test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered') | |||
| # test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') | |||
| # test_RandomWalk('Acyclic', 'fp', None, None) | |||
| @@ -559,7 +565,7 @@ if __name__ == "__main__": | |||
| # test_Marginalized('Acyclic', False) | |||
| # test_ShortestPath('Acyclic') | |||
| # test_PathUpToH('Acyclic', 'MinMax') | |||
| # test_Treelet('Acyclic') | |||
| # test_Treelet('AIDS') | |||
| # test_SylvesterEquation('Acyclic') | |||
| # test_ConjugateGradient('Acyclic') | |||
| # test_FixedPoint('Acyclic') | |||
| @@ -3,156 +3,230 @@ These kernels are defined between pairs of vectors. | |||
| """ | |||
| import numpy as np | |||
| def delta_kernel(x, y): | |||
| """Delta kernel. Return 1 if x == y, 0 otherwise. | |||
| Parameters | |||
| ---------- | |||
| x, y : any | |||
| Two parts to compare. | |||
| Return | |||
| ------ | |||
| kernel : integer | |||
| Delta kernel. | |||
| References | |||
| ---------- | |||
| [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||
| labeled graphs. In Proceedings of the 20th International Conference on | |||
| Machine Learning, Washington, DC, United States, 2003. | |||
| """ | |||
| return x == y #(1 if condition else 0) | |||
| def deltakernel(x, y): | |||
| """Delta kernel. Return 1 if x == y, 0 otherwise. | |||
| return delta_kernel(x, y) | |||
| def gaussian_kernel(x, y, gamma=None): | |||
| """Gaussian kernel. | |||
| Compute the rbf (gaussian) kernel between x and y: | |||
| Parameters | |||
| ---------- | |||
| x, y : any | |||
| Two parts to compare. | |||
| K(x, y) = exp(-gamma ||x-y||^2). | |||
| Return | |||
| ------ | |||
| kernel : integer | |||
| Delta kernel. | |||
| Read more in the `User Guide of scikit-learn library <https://scikit-learn.org/stable/modules/metrics.html#rbf-kernel>`__. | |||
| References | |||
| ---------- | |||
| [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||
| labeled graphs. In Proceedings of the 20th International Conference on | |||
| Machine Learning, Washington, DC, United States, 2003. | |||
| """ | |||
| return x == y #(1 if condition else 0) | |||
| Parameters | |||
| ---------- | |||
| x, y : array | |||
| gamma : float, default None | |||
| If None, defaults to 1.0 / n_features | |||
| Returns | |||
| ------- | |||
| kernel : float | |||
| """ | |||
| if gamma is None: | |||
| gamma = 1.0 / len(x) | |||
| # xt = np.array([float(itm) for itm in x]) # @todo: move this to dataset or datafile to speed up. | |||
| # yt = np.array([float(itm) for itm in y]) | |||
| # kernel = xt - yt | |||
| # kernel = kernel ** 2 | |||
| # kernel = np.sum(kernel) | |||
| # kernel *= -gamma | |||
| # kernel = np.exp(kernel) | |||
| # return kernel | |||
| return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma) | |||
| def gaussiankernel(x, y, gamma=None): | |||
| """Gaussian kernel. | |||
| Compute the rbf (gaussian) kernel between x and y: | |||
| return gaussian_kernel(x, y, gamma=gamma) | |||
| K(x, y) = exp(-gamma ||x-y||^2). | |||
| Read more in the `User Guide of scikit-learn library <https://scikit-learn.org/stable/modules/metrics.html#rbf-kernel>`__. | |||
| def polynomial_kernel(x, y, gamma=1, coef0=0, d=1): | |||
| return (np.dot(x, y) * gamma + coef0) ** d | |||
| Parameters | |||
| ---------- | |||
| x, y : array | |||
| gamma : float, default None | |||
| If None, defaults to 1.0 / n_features | |||
| def highest_polynomial_kernel(x, y, d=1, c=0): | |||
| """Polynomial kernel. | |||
| Compute the polynomial kernel between x and y: | |||
| Returns | |||
| ------- | |||
| kernel : float | |||
| """ | |||
| if gamma is None: | |||
| gamma = 1.0 / len(x) | |||
| K(x, y) = <x, y> ^d + c. | |||
| xt = np.array([float(itm) for itm in x]) # @todo: move this to dataset or datafile to speed up. | |||
| yt = np.array([float(itm) for itm in y]) | |||
| kernel = xt - yt | |||
| kernel = kernel ** 2 | |||
| kernel = np.sum(kernel) | |||
| kernel *= -gamma | |||
| kernel = np.exp(kernel) | |||
| return kernel | |||
| Parameters | |||
| ---------- | |||
| x, y : array | |||
| d : integer, default 1 | |||
| c : float, default 0 | |||
| Returns | |||
| ------- | |||
| kernel : float | |||
| """ | |||
| return np.dot(x, y) ** d + c | |||
| def polynomialkernel(x, y, d=1, c=0): | |||
| """Polynomial kernel. | |||
| Compute the polynomial kernel between x and y: | |||
| return highest_polynomial_kernel(x, y, d=d, c=c) | |||
| K(x, y) = <x, y> ^d + c. | |||
| def linear_kernel(x, y): | |||
| """Polynomial kernel. | |||
| Compute the polynomial kernel between x and y: | |||
| Parameters | |||
| ---------- | |||
| x, y : array | |||
| K(x, y) = <x, y>. | |||
| d : integer, default 1 | |||
| Parameters | |||
| ---------- | |||
| x, y : array | |||
| c : float, default 0 | |||
| d : integer, default 1 | |||
| Returns | |||
| ------- | |||
| kernel : float | |||
| """ | |||
| return np.dot(x, y) ** d + c | |||
| c : float, default 0 | |||
| Returns | |||
| ------- | |||
| kernel : float | |||
| """ | |||
| return np.dot(x, y) | |||
| def linearkernel(x, y): | |||
| """Polynomial kernel. | |||
| Compute the polynomial kernel between x and y: | |||
| return linear_kernel(x, y) | |||
| def cosine_kernel(x, y): | |||
| return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))  # vector norms, not element-wise abs | |||
| def sigmoid_kernel(x, y, gamma=None, coef0=1): | |||
| if gamma is None: | |||
| gamma = 1.0 / len(x) | |||
| k = np.dot(x, y) | |||
| k *= gamma | |||
| k += coef0 | |||
| k = np.tanh(k) | |||
| # k = np.tanh(k, k) # compute tanh in-place | |||
| return k | |||
| def laplacian_kernel(x, y, gamma=None): | |||
| if gamma is None: | |||
| gamma = 1.0 / len(x) | |||
| k = -gamma * np.sum(np.abs(np.subtract(x, y)))  # L1 distance, summed to a scalar | |||
| k = np.exp(k) | |||
| return k | |||
| def chi2_kernel(x, y, gamma=1.0): | |||
| k = np.divide(np.subtract(x, y) ** 2, np.add(x, y)) | |||
| k = np.sum(k) | |||
| k *= -gamma | |||
| return np.exp(k) | |||
| def exponential_kernel(x, y, gamma=None): | |||
| if gamma is None: | |||
| gamma = 1.0 / len(x) | |||
| return np.exp(np.dot(x, y) * gamma) | |||
| K(x, y) = <x, y>. | |||
| def intersection_kernel(x, y): | |||
| return np.sum(np.minimum(x, y)) | |||
| Parameters | |||
| ---------- | |||
| x, y : array | |||
| d : integer, default 1 | |||
| def multiquadratic_kernel(x, y, c=0): | |||
| return np.sqrt((np.sum(np.subtract(x, y) ** 2)) + c) | |||
| c : float, default 0 | |||
| Returns | |||
| ------- | |||
| kernel : float | |||
| """ | |||
| return np.dot(x, y) | |||
| def inverse_multiquadratic_kernel(x, y, c=0): | |||
| return 1 / multiquadratic_kernel(x, y, c=c) | |||
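| # Worked example for the elementary kernels above (values chosen by hand): | |||
| # with x = [1., 2.] and y = [2., 4.], ||x - y||^2 = 1 + 4 = 5, so | |||
| # gaussian_kernel(x, y, gamma=0.5) = exp(-0.5 * 5) = exp(-2.5) ≈ 0.0821, | |||
| # linear_kernel(x, y) = 1*2 + 2*4 = 10, and | |||
| # intersection_kernel(x, y) = min(1, 2) + min(2, 4) = 3. | |||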
| def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1): | |||
| """Sum of a pair of kernels. | |||
| """Sum of a pair of kernels. | |||
| k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||
| k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||
| Parameters | |||
| ---------- | |||
| k1, k2 : function | |||
| A pair of kernel functions. | |||
| d11, d12: | |||
| Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||
| d21, d22: | |||
| Inputs of k2. | |||
| lamda1, lamda2: float | |||
| Coefficients of the product. | |||
| Parameters | |||
| ---------- | |||
| k1, k2 : function | |||
| A pair of kernel functions. | |||
| d11, d12: | |||
| Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||
| d21, d22: | |||
| Inputs of k2. | |||
| lamda1, lamda2: float | |||
| Coefficients of the product. | |||
| Return | |||
| ------ | |||
| kernel : integer | |||
| Return | |||
| ------ | |||
| kernel : integer | |||
| """ | |||
| if d21 is None or d22 is None: | |||
| kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12) | |||
| else: | |||
| kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||
| return kernel | |||
| """ | |||
| if d21 is None or d22 is None: | |||
| kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12) | |||
| else: | |||
| kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||
| return kernel | |||
| def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1): | |||
| """Product of a pair of kernels. | |||
| k = lamda * k1(d11, d12) * k2(d21, d22) | |||
| Parameters | |||
| ---------- | |||
| k1, k2 : function | |||
| A pair of kernel functions. | |||
| d11, d12: | |||
| Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||
| d21, d22: | |||
| Inputs of k2. | |||
| lamda: float | |||
| Coefficient of the product. | |||
| Return | |||
| ------ | |||
| kernel : integer | |||
| """ | |||
| if d21 is None or d22 is None: | |||
| kernel = lamda * k1(d11, d12) * k2(d11, d12) | |||
| else: | |||
| kernel = lamda * k1(d11, d12) * k2(d21, d22) | |||
| return kernel | |||
| """Product of a pair of kernels. | |||
| k = lamda * k1(d11, d12) * k2(d21, d22) | |||
| Parameters | |||
| ---------- | |||
| k1, k2 : function | |||
| A pair of kernel functions. | |||
| d11, d12: | |||
| Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||
| d21, d22: | |||
| Inputs of k2. | |||
| lamda: float | |||
| Coefficient of the product. | |||
| Return | |||
| ------ | |||
| kernel : integer | |||
| """ | |||
| if d21 is None or d22 is None: | |||
| kernel = lamda * k1(d11, d12) * k2(d11, d12) | |||
| else: | |||
| kernel = lamda * k1(d11, d12) * k2(d21, d22) | |||
| return kernel | |||
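| # Combining two of the elementary kernels above (a sketch; the inputs are | |||
| # made-up 2-d vectors): | |||
| # kernelsum(linear_kernel, gaussian_kernel, [1., 2.], [3., 4.]) applies both | |||
| # kernels to the same pair and returns 1 * (1*3 + 2*4) + 1 * exp(-0.5 * 8) | |||
| # = 11 + exp(-4) ≈ 11.0183. | |||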
| if __name__ == '__main__': | |||
| o = polynomialkernel([1, 2], [3, 4], 2, 3) | |||
| o = polynomialkernel([1, 2], [3, 4], 2, 3) | |||
| @@ -366,19 +366,62 @@ def get_edge_labels(Gn, edge_label): | |||
| def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs): | |||
| if len(kwargs) != 0: | |||
| kernel_options = kwargs | |||
| if name == 'Marginalized': | |||
| if name == 'CommonWalk' or name == 'common walk': | |||
| from gklearn.kernels import CommonWalk | |||
| graph_kernel = CommonWalk(node_labels=node_labels, | |||
| edge_labels=edge_labels, | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'Marginalized' or name == 'marginalized': | |||
| from gklearn.kernels import Marginalized | |||
| graph_kernel = Marginalized(node_labels=node_labels, | |||
| edge_labels=edge_labels, | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'ShortestPath': | |||
| elif name == 'SylvesterEquation' or name == 'sylvester equation': | |||
| from gklearn.kernels import SylvesterEquation | |||
| graph_kernel = SylvesterEquation( | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'FixedPoint' or name == 'fixed point': | |||
| from gklearn.kernels import FixedPoint | |||
| graph_kernel = FixedPoint(node_labels=node_labels, | |||
| edge_labels=edge_labels, | |||
| node_attrs=node_attrs, | |||
| edge_attrs=edge_attrs, | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'ConjugateGradient' or name == 'conjugate gradient': | |||
| from gklearn.kernels import ConjugateGradient | |||
| graph_kernel = ConjugateGradient(node_labels=node_labels, | |||
| edge_labels=edge_labels, | |||
| node_attrs=node_attrs, | |||
| edge_attrs=edge_attrs, | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'SpectralDecomposition' or name == 'spectral decomposition': | |||
| from gklearn.kernels import SpectralDecomposition | |||
| graph_kernel = SpectralDecomposition(node_labels=node_labels, | |||
| edge_labels=edge_labels, | |||
| node_attrs=node_attrs, | |||
| edge_attrs=edge_attrs, | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'ShortestPath' or name == 'shortest path': | |||
| from gklearn.kernels import ShortestPath | |||
| graph_kernel = ShortestPath(node_labels=node_labels, | |||
| node_attrs=node_attrs, | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'StructuralSP': | |||
| elif name == 'StructuralSP' or name == 'structural shortest path': | |||
| from gklearn.kernels import StructuralSP | |||
| graph_kernel = StructuralSP(node_labels=node_labels, | |||
| edge_labels=edge_labels, | |||
| @@ -386,25 +429,29 @@ def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attr | |||
| edge_attrs=edge_attrs, | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'PathUpToH': | |||
| elif name == 'PathUpToH' or name == 'path up to length h': | |||
| from gklearn.kernels import PathUpToH | |||
| graph_kernel = PathUpToH(node_labels=node_labels, | |||
| edge_labels=edge_labels, | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'Treelet': | |||
| elif name == 'Treelet' or name == 'treelet': | |||
| from gklearn.kernels import Treelet | |||
| graph_kernel = Treelet(node_labels=node_labels, | |||
| edge_labels=edge_labels, | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'WLSubtree': | |||
| elif name == 'WLSubtree' or name == 'weisfeiler-lehman subtree': | |||
| from gklearn.kernels import WLSubtree | |||
| graph_kernel = WLSubtree(node_labels=node_labels, | |||
| edge_labels=edge_labels, | |||
| ds_infos=ds_infos, | |||
| **kernel_options) | |||
| elif name == 'WeisfeilerLehman': | |||
| elif name == 'WeisfeilerLehman' or name == 'weisfeiler-lehman': | |||
| from gklearn.kernels import WeisfeilerLehman | |||
| graph_kernel = WeisfeilerLehman(node_labels=node_labels, | |||
| edge_labels=edge_labels, | |||
| @@ -541,10 +588,18 @@ def get_mlti_dim_edge_attrs(G, attr_names): | |||
| def normalize_gram_matrix(gram_matrix): | |||
| diag = gram_matrix.diagonal().copy() | |||
| old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||
| for i in range(len(gram_matrix)): | |||
| for j in range(i, len(gram_matrix)): | |||
| gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) | |||
| gram_matrix[j][i] = gram_matrix[i][j] | |||
| try: | |||
| gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) | |||
| except: | |||
| # rollback() | |||
| np.seterr(**old_settings) | |||
| raise | |||
| else: | |||
| gram_matrix[j][i] = gram_matrix[i][j] | |||
| np.seterr(**old_settings) | |||
| return gram_matrix | |||
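| # The loop above implements cosine normalization of the Gram matrix: | |||
| # K'[i, j] = K[i, j] / sqrt(K[i, i] * K[j, j]), so every diagonal entry of | |||
| # the normalized matrix equals 1. For instance (made-up values), with | |||
| # K = [[4, 2], [2, 9]] the normalized matrix is [[1, 1/3], [1/3, 1]], | |||
| # since 2 / sqrt(4 * 9) = 1/3. | |||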