| @@ -4,6 +4,8 @@ python: | |||||
| - '3.6' | - '3.6' | ||||
| - '3.7' | - '3.7' | ||||
| - '3.8' | - '3.8' | ||||
| - '3.9' | |||||
| #- '3.10' | |||||
| before_install: | before_install: | ||||
| - python --version | - python --version | ||||
| @@ -1,5 +1,6 @@ | |||||
| # graphkit-learn | # graphkit-learn | ||||
| [](https://travis-ci.com/jajupmochi/graphkit-learn) | |||||
| [](https://app.travis-ci.com/jajupmochi/graphkit-learn) | |||||
| [](https://ci.appveyor.com/project/jajupmochi/graphkit-learn) | [](https://ci.appveyor.com/project/jajupmochi/graphkit-learn) | ||||
| [](https://codecov.io/gh/jajupmochi/graphkit-learn) | [](https://codecov.io/gh/jajupmochi/graphkit-learn) | ||||
| [](https://graphkit-learn.readthedocs.io/en/master/?badge=master) | [](https://graphkit-learn.readthedocs.io/en/master/?badge=master) | ||||
| @@ -1,147 +0,0 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Mon Nov 2 16:17:01 2020 | |||||
| @author: ljia | |||||
| """ | |||||
| # This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution over a given number of repeats is computed. | |||||
| import os | |||||
| import multiprocessing | |||||
| import pickle | |||||
| import logging | |||||
| from gklearn.ged.util import compute_geds | |||||
| import time | |||||
| from utils import get_dataset | |||||
| import sys | |||||
| from group_results import group_trials | |||||
| def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): | |||||
| save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||||
| # Return if the file exists. | |||||
| if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||||
| return None, None | |||||
| """**2. Set parameters.**""" | |||||
| # Parameters for GED computation. | |||||
| ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic. | |||||
| # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) | |||||
| 'lsape_model': 'ECBP', # | |||||
| # When bigger than 1, the method is considered to be mIPFP. | |||||
| # The actual number of computed solutions might be smaller than the specified value. | |||||
| 'max_num_solutions': max_num_solutions, | |||||
| 'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||||
| 'greedy_method': 'BASIC', # | |||||
| # The distance between non-symbolic node/edge labels is computed as the Euclidean distance. | |||||
| 'attr_distance': 'euclidean', | |||||
| 'optimal': True, # if TRUE, the option --greedy-method has no effect | |||||
| # Number of parallel threads. Does not work if mpg_options['parallel'] = False. | |||||
| 'threads': multiprocessing.cpu_count(), | |||||
| 'centrality_method': 'NONE', | |||||
| 'centrality_weight': 0.7, | |||||
| 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||||
| } | |||||
| edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||||
| # edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||||
| # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||||
| options = ged_options.copy() | |||||
| options['edit_cost_constants'] = edit_cost_constants | |||||
| options['node_labels'] = dataset.node_labels | |||||
| options['edge_labels'] = dataset.edge_labels | |||||
| options['node_attrs'] = dataset.node_attrs | |||||
| options['edge_attrs'] = dataset.edge_attrs | |||||
| parallel = True # if num_solutions == 1 else False | |||||
| """**5. Compute GED matrix.**""" | |||||
| ged_mat = 'error' | |||||
| runtime = 0 | |||||
| try: | |||||
| time0 = time.time() | |||||
| ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||||
| runtime = time.time() - time0 | |||||
| except Exception as exp: | |||||
| print('An exception occurred when running this experiment:') | |||||
| LOG_FILENAME = save_dir + 'error.txt' | |||||
| logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||||
| logging.exception(save_file_suffix) | |||||
| print(repr(exp)) | |||||
| """**6. Get results.**""" | |||||
| with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||||
| pickle.dump(ged_mat, f) | |||||
| with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||||
| pickle.dump(runtime, f) | |||||
| return ged_mat, runtime | |||||
| def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): | |||||
| # Return if the group file exists. | |||||
| name_middle = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||||
| name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||||
| if os.path.isfile(name_group): | |||||
| return | |||||
| ged_mats = [] | |||||
| runtimes = [] | |||||
| for trial in range(1, 101): | |||||
| print() | |||||
| print('Trial:', trial) | |||||
| ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial) | |||||
| ged_mats.append(ged_mat) | |||||
| runtimes.append(runtime) | |||||
| # Group trials and remove single files. | |||||
| name_prefix = 'ged_matrix' + name_middle | |||||
| group_trials(save_dir, name_prefix, True, True, False) | |||||
| name_prefix = 'runtime' + name_middle | |||||
| group_trials(save_dir, name_prefix, True, True, False) | |||||
| def results_for_a_dataset(ds_name): | |||||
| """**1. Get dataset.**""" | |||||
| dataset = get_dataset(ds_name) | |||||
| for max_num_solutions in mnum_solutions_list: | |||||
| print() | |||||
| print('Max # of solutions:', max_num_solutions) | |||||
| for ratio in ratio_list: | |||||
| print() | |||||
| print('Ratio:', ratio) | |||||
| save_trials_as_group(dataset, ds_name, max_num_solutions, ratio) | |||||
| def get_param_lists(ds_name): | |||||
| if ds_name == 'AIDS_symb': | |||||
| mnum_solutions_list = [1, 20, 40, 60, 80, 100] | |||||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
| else: | |||||
| mnum_solutions_list = [1, 20, 40, 60, 80, 100] | |||||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
| return mnum_solutions_list, ratio_list | |||||
| if __name__ == '__main__': | |||||
| if len(sys.argv) > 1: | |||||
| ds_name_list = sys.argv[1:] | |||||
| else: | |||||
| ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||||
| save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/' | |||||
| os.makedirs(save_dir, exist_ok=True) | |||||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | |||||
| for ds_name in ds_name_list: | |||||
| print() | |||||
| print('Dataset:', ds_name) | |||||
| mnum_solutions_list, ratio_list = get_param_lists(ds_name) | |||||
| results_for_a_dataset(ds_name) | |||||
| @@ -13,7 +13,7 @@ import pickle | |||||
| import logging | import logging | ||||
| from gklearn.ged.util import compute_geds | from gklearn.ged.util import compute_geds | ||||
| import time | import time | ||||
| from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation | |||||
| from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids | |||||
| import sys | import sys | ||||
| from group_results import group_trials, check_group_existence, update_group_marker | from group_results import group_trials, check_group_existence, update_group_marker | ||||
| @@ -125,9 +125,10 @@ def get_param_lists(ds_name, mode='test'): | |||||
| elif mode == 'simple': | elif mode == 'simple': | ||||
| from sklearn.model_selection import ParameterGrid | from sklearn.model_selection import ParameterGrid | ||||
| param_grid = ParameterGrid([ | |||||
| {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]}, | |||||
| {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]) | |||||
| param_grid = mix_param_grids([list(ParameterGrid([ | |||||
| {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])), | |||||
| list(ParameterGrid([ | |||||
| {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))]) | |||||
| # print(list(param_grid)) | # print(list(param_grid)) | ||||
| if ds_name == 'AIDS_symb': | if ds_name == 'AIDS_symb': | ||||
| @@ -148,7 +149,7 @@ if __name__ == '__main__': | |||||
| # ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] | # ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] | ||||
| # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | ||||
| save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' | |||||
| save_dir = 'outputs/CRIANN/edit_costs.real_data.num_sols.ratios.IPFP/' | |||||
| os.makedirs(save_dir, exist_ok=True) | os.makedirs(save_dir, exist_ok=True) | ||||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | os.makedirs(save_dir + 'groups/', exist_ok=True) | ||||
| @@ -0,0 +1,172 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Mon Nov 2 16:17:01 2020 | |||||
| @author: ljia | |||||
| """ | |||||
| # This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution over a given number of repeats is computed. | |||||
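# Outline for orientation: for each dataset and each (num_solutions, ratio) pair in
# the parameter grid, the GED matrix is computed over num_trials = 100 trials, and
# the per-trial result files are then grouped into a single group file
# (see save_trials_as_group below).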
| import os | |||||
| import multiprocessing | |||||
| import pickle | |||||
| import logging | |||||
| from gklearn.ged.util import compute_geds | |||||
| import time | |||||
| from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids | |||||
| import sys | |||||
| from group_results import group_trials, check_group_existence, update_group_marker | |||||
| def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||||
| save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||||
| # Return if the file exists. | |||||
| if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||||
| return None, None | |||||
| """**2. Set parameters.**""" | |||||
| # Parameters for GED computation. | |||||
| ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic. | |||||
| # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) | |||||
| 'lsape_model': 'ECBP', # | |||||
| # When bigger than 1, the method is considered to be mIPFP. | |||||
| # The actual number of computed solutions might be smaller than the specified value. | |||||
| 'max_num_solutions': 1, # @ max_num_solutions, | |||||
| 'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||||
| 'greedy_method': 'BASIC', # | |||||
| # The distance between non-symbolic node/edge labels is computed as the Euclidean distance. | |||||
| 'attr_distance': 'euclidean', | |||||
| 'optimal': True, # if TRUE, the option --greedy-method has no effect | |||||
| # Number of parallel threads. Does not work if mpg_options['parallel'] = False. | |||||
| 'threads': multiprocessing.cpu_count(), | |||||
| 'centrality_method': 'NONE', | |||||
| 'centrality_weight': 0.7, | |||||
| 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||||
| } | |||||
| edit_cost_constants = set_edit_cost_consts(ratio, | |||||
| node_labeled=len(dataset.node_labels), | |||||
| edge_labeled=len(dataset.edge_labels), | |||||
| mode='uniform') | |||||
| # edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||||
| # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||||
| options = ged_options.copy() | |||||
| options['edit_cost_constants'] = edit_cost_constants | |||||
| options['node_labels'] = dataset.node_labels | |||||
| options['edge_labels'] = dataset.edge_labels | |||||
| options['node_attrs'] = dataset.node_attrs | |||||
| options['edge_attrs'] = dataset.edge_attrs | |||||
| parallel = True # if num_solutions == 1 else False | |||||
| """**5. Compute GED matrix.**""" | |||||
| ged_mat = 'error' | |||||
| runtime = 0 | |||||
| try: | |||||
| time0 = time.time() | |||||
| ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, | |||||
| options=options, | |||||
| repeats=num_solutions, | |||||
| permute_nodes=True, | |||||
| random_state=None, | |||||
| parallel=parallel, | |||||
| verbose=True) | |||||
| runtime = time.time() - time0 | |||||
| except Exception as exp: | |||||
| print('An exception occurred when running this experiment:') | |||||
| LOG_FILENAME = save_dir + 'error.txt' | |||||
| logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||||
| logging.exception(save_file_suffix) | |||||
| print(repr(exp)) | |||||
| """**6. Get results.**""" | |||||
| with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||||
| pickle.dump(ged_mat, f) | |||||
| with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||||
| pickle.dump(runtime, f) | |||||
| return ged_mat, runtime | |||||
| def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||||
| # Return if the group file exists. | |||||
| name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||||
| name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||||
| if check_group_existence(name_group): | |||||
| return | |||||
| ged_mats = [] | |||||
| runtimes = [] | |||||
| num_trials = 100 | |||||
| for trial in range(1, num_trials + 1): | |||||
| print() | |||||
| print('Trial:', trial) | |||||
| ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial) | |||||
| ged_mats.append(ged_mat) | |||||
| runtimes.append(runtime) | |||||
| # Group trials and remove single files. | |||||
| # @todo: if the program stops between the following lines, then there may be errors. | |||||
| name_prefix = 'ged_matrix' + name_middle | |||||
| group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) | |||||
| name_prefix = 'runtime' + name_middle | |||||
| group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) | |||||
| update_group_marker(name_group) | |||||
| def results_for_a_dataset(ds_name): | |||||
| """**1. Get dataset.**""" | |||||
| dataset = get_dataset(ds_name) | |||||
| for params in list(param_grid): | |||||
| print() | |||||
| print(params) | |||||
| save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio']) | |||||
| def get_param_lists(ds_name, mode='test'): | |||||
| if mode == 'test': | |||||
| num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] | |||||
| ratio_list = [10] | |||||
| return num_solutions_list, ratio_list | |||||
| elif mode == 'simple': | |||||
| from sklearn.model_selection import ParameterGrid | |||||
| param_grid = mix_param_grids([list(ParameterGrid([ | |||||
| {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])), | |||||
| list(ParameterGrid([ | |||||
| {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))]) | |||||
| # print(list(param_grid)) | |||||
| if ds_name == 'AIDS_symb': | |||||
| num_solutions_list = [1, 20, 40, 60, 80, 100] | |||||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
| else: | |||||
| num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100] | |||||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1] | |||||
| return param_grid | |||||
| if __name__ == '__main__': | |||||
| if len(sys.argv) > 1: | |||||
| ds_name_list = sys.argv[1:] | |||||
| else: | |||||
| ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] | |||||
| # ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] | |||||
| # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||||
| save_dir = 'outputs/CRIANN/edit_costs.real_data.nums_sols.ratios.bipartite/' | |||||
| os.makedirs(save_dir, exist_ok=True) | |||||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | |||||
| for ds_name in ds_name_list: | |||||
| print() | |||||
| print('Dataset:', ds_name) | |||||
| param_grid = get_param_lists(ds_name, mode='simple') | |||||
| results_for_a_dataset(ds_name) | |||||
| @@ -32,6 +32,7 @@ def check_group_existence(file_name): | |||||
| def update_group_marker(file_name): | def update_group_marker(file_name): | ||||
| # @todo: possible error when several tasks are using this file at the same time. | |||||
| path, name = os.path.split(file_name) | path, name = os.path.split(file_name) | ||||
| marker_fn = os.path.join(path, 'group_names_finished.pkl') | marker_fn = os.path.join(path, 'group_names_finished.pkl') | ||||
| if os.path.isfile(marker_fn): | if os.path.isfile(marker_fn): | ||||
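# A possible mitigation sketch for the @todo above, assuming the third-party
# `filelock` package is available (an editorial suggestion, not the project's API):
# serialize access to the marker file so concurrent jobs do not clobber each
# other's updates.
#
#     from filelock import FileLock
#     with FileLock(marker_fn + '.lock'):
#         ...  # read, update and rewrite group_names_finished.pkl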
| @@ -9,36 +9,45 @@ import os | |||||
| import re | import re | ||||
| cur_path = os.path.dirname(os.path.abspath(__file__)) | |||||
| def get_job_script(arg): | def get_job_script(arg): | ||||
| script = r""" | script = r""" | ||||
| #!/bin/bash | #!/bin/bash | ||||
| #SBATCH --exclusive | #SBATCH --exclusive | ||||
| #SBATCH --job-name="st.""" + arg + r""".bp" | #SBATCH --job-name="st.""" + arg + r""".bp" | ||||
| #SBATCH --partition=tlong | |||||
| #SBATCH --partition=court | |||||
| #SBATCH --mail-type=ALL | #SBATCH --mail-type=ALL | ||||
| #SBATCH --mail-user=jajupmochi@gmail.com | #SBATCH --mail-user=jajupmochi@gmail.com | ||||
| #SBATCH --output="outputs/output_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt" | |||||
| #SBATCH --error="errors/error_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt" | |||||
| #SBATCH --output="outputs/output_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt" | |||||
| #SBATCH --error="errors/error_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt" | |||||
| # | # | ||||
| #SBATCH --ntasks=1 | #SBATCH --ntasks=1 | ||||
| #SBATCH --nodes=1 | #SBATCH --nodes=1 | ||||
| #SBATCH --cpus-per-task=1 | #SBATCH --cpus-per-task=1 | ||||
| #SBATCH --time=300:00:00 | |||||
| #SBATCH --time=48:00:00 | |||||
| #SBATCH --mem-per-cpu=4000 | #SBATCH --mem-per-cpu=4000 | ||||
| srun hostname | srun hostname | ||||
| srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||||
| srun python3 edit_costs.max_nums_sols.ratios.bipartite.py """ + arg | |||||
| cd """ + cur_path + r""" | |||||
| echo Working directory : $PWD | |||||
| echo Local work dir : $LOCAL_WORK_DIR | |||||
| python3 edit_costs.real_data.nums_sols.ratios.bipartite.py """ + arg | |||||
| script = script.strip() | script = script.strip() | ||||
| script = re.sub('\n\t+', '\n', script) | script = re.sub('\n\t+', '\n', script) | ||||
| script = re.sub('\n +', '\n', script) | script = re.sub('\n +', '\n', script) | ||||
| return script | return script | ||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||||
| for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]: | |||||
| os.makedirs('outputs/', exist_ok=True) | |||||
| os.makedirs('errors/', exist_ok=True) | |||||
| ds_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] | |||||
| for ds_name in [ds_list[i] for i in [0, 1, 2, 3, 4]]: | |||||
| job_script = get_job_script(ds_name) | job_script = get_job_script(ds_name) | ||||
| command = 'sbatch <<EOF\n' + job_script + '\nEOF' | command = 'sbatch <<EOF\n' + job_script + '\nEOF' | ||||
| # print(command) | # print(command) | ||||
| @@ -325,6 +325,22 @@ def dichotomous_permutation(arr, layer=0): | |||||
| # return new_arr | # return new_arr | ||||
| def mix_param_grids(list_of_grids): | |||||
| mixed_grids = [] | |||||
| not_finished = [True] * len(list_of_grids) | |||||
| idx = 0 | |||||
| while sum(not_finished) > 0: | |||||
| for g_idx, grid in enumerate(list_of_grids): | |||||
| if idx < len(grid): | |||||
| mixed_grids.append(grid[idx]) | |||||
| else: | |||||
| not_finished[g_idx] = False | |||||
| idx += 1 | |||||
| return mixed_grids | |||||
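# Behaviour sketch of mix_param_grids: entries are taken round-robin by index
# until every grid is exhausted, e.g.
#
#     mix_param_grids([[1, 2, 3], ['a', 'b']])   # -> [1, 'a', 2, 'b', 3]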
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| root_dir = 'outputs/CRIANN/' | root_dir = 'outputs/CRIANN/' | ||||
| # for dir_ in sorted(os.listdir(root_dir)): | # for dir_ in sorted(os.listdir(root_dir)): | ||||
| @@ -337,4 +353,4 @@ if __name__ == '__main__': | |||||
| # get_relative_errors(save_dir) | # get_relative_errors(save_dir) | ||||
| # except Exception as exp: | # except Exception as exp: | ||||
| # print('An exception occured when running this experiment:') | # print('An exception occured when running this experiment:') | ||||
| # print(repr(exp)) | |||||
| # print(repr(exp)) | |||||
| @@ -0,0 +1 @@ | |||||
| from gklearn.ged.model.ged_model import GEDModel | |||||
| @@ -0,0 +1,43 @@ | |||||
| import numpy as np | |||||
| def sum_squares(a, b): | |||||
| """ | |||||
| Return the sum of squared differences between a and b (SSE); rmse below divides by n and takes the square root | |||||
| """ | |||||
| return np.sum([(a[i] - b[i])**2 for i in range(len(a))]) | |||||
| def euclid_d(x, y): | |||||
| """ | |||||
| 1D euclidean distance | |||||
| """ | |||||
| return np.sqrt((x-y)**2) | |||||
| def man_d(x, y): | |||||
| """ | |||||
| 1D manhattan distance | |||||
| """ | |||||
| return np.abs((x-y)) | |||||
| def classif_d(x, y): | |||||
| """ | |||||
| Function adapted to classification problems | |||||
| """ | |||||
| return np.array(0 if x == y else 1) | |||||
| def rmse(pred, ground_truth): | |||||
| import numpy as np | |||||
| return np.sqrt(sum_squares(pred, ground_truth)/len(ground_truth)) | |||||
| def accuracy(pred, ground_truth): | |||||
| import numpy as np | |||||
| return np.mean([a == b for a, b in zip(pred, ground_truth)]) | |||||
| def rbf_k(D, sigma=1): | |||||
| return np.exp(-(D**2)/sigma) | |||||
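# Minimal usage sketch of the helpers above (results rounded for readability):
#
#     rmse([1.0, 2.0, 3.0], [1.0, 2.0, 5.0])    # ~ 1.1547  (sqrt(4 / 3))
#     accuracy([0, 1, 1], [0, 1, 0])            # ~ 0.6667
#     rbf_k(np.array([0.0, 1.0]), sigma=1)      # array([1.0, ~0.3679])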
| @@ -0,0 +1,97 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Thu May 5 14:02:17 2022 | |||||
| @author: ljia | |||||
| """ | |||||
| import sys | |||||
| from gklearn.ged.model.distances import euclid_d | |||||
| from gklearn.ged.util import pairwise_ged, get_nb_edit_operations | |||||
| from gklearn.utils import get_iters | |||||
| def compute_ged(Gi, Gj, edit_cost, method='BIPARTITE', **kwargs): | |||||
| """ | |||||
| Compute the GED between two graphs according to edit_cost | |||||
| """ | |||||
| ged_options = {'edit_cost': 'CONSTANT', | |||||
| 'method': method, | |||||
| 'edit_cost_constants': edit_cost} | |||||
| node_labels = kwargs.get('node_labels', []) | |||||
| edge_labels = kwargs.get('edge_labels', []) | |||||
| dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10) | |||||
| n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward, edit_cost='CONSTANT', node_labels=node_labels, edge_labels=edge_labels) | |||||
| return dis, n_eo_tmp | |||||
| def compute_ged_all_dataset(Gn, edit_cost, ed_method, **kwargs): | |||||
| N = len(Gn) | |||||
| G_pairs = [] | |||||
| for i in range(N): | |||||
| for j in range(i, N): | |||||
| G_pairs.append([i, j]) | |||||
| return compute_geds(G_pairs, Gn, edit_cost, ed_method, **kwargs) | |||||
| def compute_geds(G_pairs, Gn, edit_cost, ed_method, verbose=True, **kwargs): | |||||
| """ | |||||
| Compute GED between all indexes in G_pairs given edit_cost | |||||
| :return: ged_vec : the list of computed distances, n_edit_operations : the list of edit operations | |||||
| """ | |||||
| ged_vec = [] | |||||
| n_edit_operations = [] | |||||
| for k in get_iters(range(len(G_pairs)), desc='Computing GED', file=sys.stdout, length=len(G_pairs), verbose=verbose): | |||||
| [i, j] = G_pairs[k] | |||||
| dis, n_eo_tmp = compute_ged( | |||||
| Gn[i], Gn[j], edit_cost=edit_cost, method=ed_method, **kwargs) | |||||
| ged_vec.append(dis) | |||||
| n_edit_operations.append(n_eo_tmp) | |||||
| return ged_vec, n_edit_operations | |||||
| def compute_D(G_app, edit_cost, G_test=None, ed_method='BIPARTITE', **kwargs): | |||||
| import numpy as np | |||||
| N = len(G_app) | |||||
| D_app = np.zeros((N, N)) | |||||
| for i, G1 in get_iters(enumerate(G_app), desc='Computing D - app', file=sys.stdout, length=N): | |||||
| for j, G2 in enumerate(G_app[i+1:], i+1): | |||||
| D_app[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs) | |||||
| D_app[j, i] = D_app[i, j] | |||||
| if (G_test is None): | |||||
| return D_app, edit_cost | |||||
| else: | |||||
| D_test = np.zeros((len(G_test), N)) | |||||
| for i, G1 in get_iters(enumerate(G_test), desc='Computing D - test', file=sys.stdout, length=len(G_test)): | |||||
| for j, G2 in enumerate(G_app): | |||||
| D_test[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs) | |||||
| return D_app, D_test, edit_cost | |||||
| def compute_D_random(G_app, G_test=None, ed_method='BIPARTITE', **kwargs): | |||||
| import numpy as np | |||||
| edit_costs = np.random.rand(6) | |||||
| return compute_D(G_app, edit_costs, G_test, ed_method=ed_method, **kwargs) | |||||
| def compute_D_expert(G_app, G_test=None, ed_method='BIPARTITE', **kwargs): | |||||
| edit_cost = [3, 3, 1, 3, 3, 1] | |||||
| return compute_D(G_app, edit_cost, G_test, ed_method=ed_method, **kwargs) | |||||
| def compute_D_fitted(G_app, y_app, G_test=None, y_distance=euclid_d, | |||||
| mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs): | |||||
| from gklearn.ged.models.optim_costs import compute_optimal_costs | |||||
| costs_optim = compute_optimal_costs( | |||||
| G_app, y_app, y_distance=y_distance, | |||||
| mode=mode, unlabeled=unlabeled, ed_method=ed_method, **kwargs) | |||||
| return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs) | |||||
| def compute_D_GH2020(G_app, G_test=None, ed_method='BIPARTITE', **kwargs): | |||||
| from gklearn.ged.optim_costs import get_optimal_costs_GH2020 | |||||
| costs_optim = get_optimal_costs_GH2020(**kwargs) | |||||
| return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs) | |||||
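# Minimal usage sketch, assuming `G_app` is a list of NetworkX graphs; the label
# names 'atom' and 'bond_type' are illustrative placeholders:
#
#     D_app, edit_cost = compute_D_expert(G_app, ed_method='BIPARTITE',
#                                         node_labels=['atom'],
#                                         edge_labels=['bond_type'])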
| @@ -0,0 +1,724 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Thu May 5 09:42:30 2022 | |||||
| @author: ljia | |||||
| """ | |||||
| import sys | |||||
| import multiprocessing | |||||
| import time | |||||
| import numpy as np | |||||
| import networkx as nx | |||||
| # from abc import ABC, abstractmethod | |||||
| from sklearn.base import BaseEstimator # , TransformerMixin | |||||
| from sklearn.utils.validation import check_is_fitted # check_X_y, check_array, | |||||
| from sklearn.exceptions import NotFittedError | |||||
| from gklearn.ged.model.distances import euclid_d | |||||
| from gklearn.ged.util import pairwise_ged, get_nb_edit_operations | |||||
| # from gklearn.utils import normalize_gram_matrix | |||||
| from gklearn.utils import get_iters | |||||
| class GEDModel(BaseEstimator): #, ABC): | |||||
| """The graph edit distance model class compatible with `scikit-learn`. | |||||
| Attributes | |||||
| ---------- | |||||
| _graphs : list | |||||
| Stores the input graphs on fit input data. | |||||
| Default format of the list objects is `NetworkX` graphs. | |||||
| **We don't guarantee that the input graphs remain unchanged during the | |||||
| computation.** | |||||
| References | |||||
| ---------- | |||||
| https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. | |||||
| """ | |||||
| def __init__(self, | |||||
| ed_method='BIPARTITE', | |||||
| edit_cost_fun='CONSTANT', | |||||
| init_edit_cost_constants=[3, 3, 1, 3, 3, 1], | |||||
| optim_method='init', | |||||
| optim_options={'y_distance': euclid_d, 'mode': 'reg'}, | |||||
| node_labels=[], | |||||
| edge_labels=[], | |||||
| parallel=None, | |||||
| n_jobs=None, | |||||
| chunksize=None, | |||||
| # normalize=True, | |||||
| copy_graphs=True, # make sure it is a full deep copy. and faster! | |||||
| verbose=2): | |||||
| """`__init__` for `GEDModel` object.""" | |||||
| # @todo: the default settings of the parameters are different from those in the self.compute method. | |||||
| # self._graphs = None | |||||
| self.ed_method = ed_method | |||||
| self.edit_cost_fun = edit_cost_fun | |||||
| self.init_edit_cost_constants = init_edit_cost_constants | |||||
| self.optim_method=optim_method | |||||
| self.optim_options=optim_options | |||||
| self.node_labels=node_labels | |||||
| self.edge_labels=edge_labels | |||||
| self.parallel = parallel | |||||
| self.n_jobs = n_jobs | |||||
| self.chunksize = chunksize | |||||
| # self.normalize = normalize | |||||
| self.copy_graphs = copy_graphs | |||||
| self.verbose = verbose | |||||
| # self._run_time = 0 | |||||
| # self._gram_matrix = None | |||||
| # self._gram_matrix_unnorm = None | |||||
| ########################################################################## | |||||
| # The following is the 1st paradigm to compute GED distance matrix, which is | |||||
| # compatible with `scikit-learn`. | |||||
| ########################################################################## | |||||
| def fit(self, X, y=None): | |||||
| """Fit a graph dataset for a transformer. | |||||
| Parameters | |||||
| ---------- | |||||
| X : iterable | |||||
| DESCRIPTION. | |||||
| y : None, optional | |||||
| There is no need for a target in a transformer, yet the `scikit-learn` | |||||
| pipeline API requires this parameter. | |||||
| Returns | |||||
| ------- | |||||
| object | |||||
| Returns self. | |||||
| """ | |||||
| # self._is_tranformed = False | |||||
| # Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used; | |||||
| self.clear_attributes() | |||||
| # Validate parameters for the transformer. | |||||
| self.validate_parameters() | |||||
| # Validate the input. | |||||
| self._graphs = self.validate_input(X) | |||||
| if y is not None: | |||||
| self._targets = y | |||||
| # self._targets = self.validate_input(y) | |||||
| # self._X = X | |||||
| # self._kernel = self._get_kernel_instance() | |||||
| # Return the transformer. | |||||
| return self | |||||
| def transform(self, X=None, return_dm_train=False): | |||||
| """Compute the graph kernel matrix between given and fitted data. | |||||
| Parameters | |||||
| ---------- | |||||
| X : TYPE | |||||
| DESCRIPTION. | |||||
| Raises | |||||
| ------ | |||||
| ValueError | |||||
| DESCRIPTION. | |||||
| Returns | |||||
| ------- | |||||
| None. | |||||
| """ | |||||
| # If `return_dm_train`, return the fitted GED distance matrix of training data. | |||||
| if return_dm_train: | |||||
| check_is_fitted(self, '_dm_train') | |||||
| self._is_transformed = True | |||||
| return self._dm_train # @todo: copy or not? | |||||
| # Check if method "fit" had been called. | |||||
| check_is_fitted(self, '_graphs') | |||||
| # Validate the input. | |||||
| Y = self.validate_input(X) | |||||
| # Transform: compute the distance matrix. | |||||
| dis_matrix = self.compute_distance_matrix(Y) | |||||
| self._Y = Y | |||||
| # Self transform must appear before the diagonal call on normalization. | |||||
| self._is_transformed = True | |||||
| # if self.normalize: | |||||
| # X_diag, Y_diag = self.diagonals() | |||||
| # old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||||
| # try: | |||||
| # kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) | |||||
| # except: | |||||
| # raise | |||||
| # finally: | |||||
| # np.seterr(**old_settings) | |||||
| return dis_matrix | |||||
| def fit_transform(self, X, y=None, save_dm_train=False): | |||||
| """Fit and transform: compute GED distance matrix on the same data. | |||||
| Parameters | |||||
| ---------- | |||||
| X : list of graphs | |||||
| Input graphs. | |||||
| Returns | |||||
| ------- | |||||
| dis_matrix : numpy array, shape = [len(X), len(X)] | |||||
| The distance matrix of X. | |||||
| """ | |||||
| self.fit(X, y) | |||||
| # Compute edit cost constants. | |||||
| self.compute_edit_costs() | |||||
| # Transform: compute the distance matrix. | |||||
| dis_matrix = self.compute_distance_matrix() | |||||
| # # Normalize. | |||||
| # if self.normalize: | |||||
| # self._X_diag = np.diagonal(gram_matrix).copy() | |||||
| # old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||||
| # try: | |||||
| # gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) | |||||
| # except: | |||||
| # raise | |||||
| # finally: | |||||
| # np.seterr(**old_settings) | |||||
| if save_dm_train: | |||||
| self._dm_train = dis_matrix | |||||
| return dis_matrix | |||||
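# A minimal usage sketch, assuming `graphs` is a list of NetworkX graphs and `y`
# their targets; the label names are illustrative placeholders:
#
#     model = GEDModel(ed_method='BIPARTITE', optim_method='expert',
#                      node_labels=['atom'], edge_labels=['bond_type'])
#     dis_train = model.fit_transform(graphs, y, save_dm_train=True)
#     dis_test = model.transform(test_graphs)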
| def get_params(self): | |||||
| pass | |||||
| def set_params(self): | |||||
| pass | |||||
| def clear_attributes(self): # @todo: update | |||||
| # if hasattr(self, '_X_diag'): | |||||
| # delattr(self, '_X_diag') | |||||
| if hasattr(self, '_graphs'): | |||||
| delattr(self, '_graphs') | |||||
| if hasattr(self, '_Y'): | |||||
| delattr(self, '_Y') | |||||
| if hasattr(self, '_run_time'): | |||||
| delattr(self, '_run_time') | |||||
| def validate_parameters(self): | |||||
| """Validate all parameters for the transformer. | |||||
| Returns | |||||
| ------- | |||||
| None. | |||||
| """ | |||||
| if self.parallel is not None and self.parallel != 'imap_unordered': | |||||
| raise ValueError('Parallel mode is not set correctly.') | |||||
| if self.parallel == 'imap_unordered' and self.n_jobs is None: | |||||
| self.n_jobs = multiprocessing.cpu_count() | |||||
| def validate_input(self, X): | |||||
| """Validate the given input and raise errors if it is invalid. | |||||
| Parameters | |||||
| ---------- | |||||
| X : list | |||||
| The input to check. Should be a list of graph. | |||||
| Raises | |||||
| ------ | |||||
| ValueError | |||||
| Raise if the input is not correct. | |||||
| Returns | |||||
| ------- | |||||
| X : list | |||||
| The input. A list of graph. | |||||
| """ | |||||
| if X is None: | |||||
| raise ValueError('Please add graphs before computing.') | |||||
| elif not isinstance(X, list): | |||||
| raise ValueError('Cannot detect graphs. The input must be a list.') | |||||
| elif len(X) == 0: | |||||
| raise ValueError('The graph list given is empty. No computation will be performed.') | |||||
| return X | |||||
| def compute_distance_matrix(self, Y=None): | |||||
| """Compute the distance matrix between a given target graphs (Y) and | |||||
| the fitted graphs (X / self._graphs) or the distance matrix for the fitted | |||||
| graphs (X / self._graphs). | |||||
| Parameters | |||||
| ---------- | |||||
| Y : list of graphs, optional | |||||
| The target graphs. The default is None. If None, the distance matrix | |||||
| is computed for X against itself. | |||||
| Returns | |||||
| ------- | |||||
| dis_matrix : numpy array, shape = [n_targets, n_inputs] | |||||
| The computed distance matrix. | |||||
| """ | |||||
| if Y is None: | |||||
| # Compute the distance matrix for self._graphs (X). | |||||
| dis_matrix = self._compute_X_distance_matrix() | |||||
| # self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||||
| else: | |||||
| # Compute the distance matrix between Y and self._graphs (X). | |||||
| start_time = time.time() | |||||
| if self.parallel == 'imap_unordered': | |||||
| dis_matrix = self._compute_distance_matrix_imap_unordered(Y) | |||||
| elif self.parallel is None: | |||||
| Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y) | |||||
| graphs_copy = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) | |||||
| dis_matrix = self._compute_distance_matrix_series(Y_copy, graphs_copy) | |||||
| self._run_time = time.time() - start_time | |||||
| if self.verbose: | |||||
| print('Distance matrix of size (%d, %d) built in %s seconds.' | |||||
| % (len(Y), len(self._graphs), self._run_time)) | |||||
| return dis_matrix | |||||
| def _compute_distance_matrix_series(self, X, Y): | |||||
| """Compute the GED distance matrix between two sets of graphs (X and Y) | |||||
| without parallelization. | |||||
| Parameters | |||||
| ---------- | |||||
| X, Y : list of graphs | |||||
| The input graphs. | |||||
| Returns | |||||
| ------- | |||||
| dis_matrix : numpy array, shape = [n_X, n_Y] | |||||
| The computed distance matrix. | |||||
| """ | |||||
| dis_matrix = np.zeros((len(X), len(Y))) | |||||
| for i_x, g_x in enumerate(X): | |||||
| for i_y, g_y in enumerate(Y): | |||||
| dis_matrix[i_x, i_y], _ = self.compute_ged(g_x, g_y) | |||||
| return dis_matrix | |||||
| def _compute_kernel_matrix_imap_unordered(self, Y): | |||||
| """Compute the kernel matrix between a given target graphs (Y) and | |||||
| the fitted graphs (X / self._graphs) using imap unordered parallelization. | |||||
| Parameters | |||||
| ---------- | |||||
| Y : list of graphs, optional | |||||
| The target graphs. | |||||
| Returns | |||||
| ------- | |||||
| kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||||
| The computed kernel matrix. | |||||
| """ | |||||
| raise Exception('Parallelization for kernel matrix is not implemented.') | |||||
| def diagonals(self): | |||||
| """Compute the kernel matrix diagonals of the fit/transformed data. | |||||
| Returns | |||||
| ------- | |||||
| X_diag : numpy array | |||||
| The diagonal of the kernel matrix between the fitted data. | |||||
| This consists of each element calculated with itself. | |||||
| Y_diag : numpy array | |||||
| The diagonal of the kernel matrix, of the transform. | |||||
| This consists of each element calculated with itself. | |||||
| """ | |||||
| # Check if method "fit" had been called. | |||||
| check_is_fitted(self, ['_graphs']) | |||||
| # Check if the diagonals of X exist. | |||||
| try: | |||||
| check_is_fitted(self, ['_X_diag']) | |||||
| except NotFittedError: | |||||
| # Compute diagonals of X. | |||||
| self._X_diag = np.empty(shape=(len(self._graphs),)) | |||||
| graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) | |||||
| for i, x in enumerate(graphs): | |||||
| self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? | |||||
| try: | |||||
| # If transform has happened, return both diagonals. | |||||
| check_is_fitted(self, ['_Y']) | |||||
| self._Y_diag = np.empty(shape=(len(self._Y),)) | |||||
| Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y) | |||||
| for (i, y) in enumerate(Y): | |||||
| self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? | |||||
| return self._X_diag, self._Y_diag | |||||
| except NotFittedError: | |||||
| # Else just return both X_diag | |||||
| return self._X_diag | |||||
| # @abstractmethod | |||||
| def pairwise_distance(self, x, y): | |||||
| """Compute pairwise kernel between two graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| x, y : NetworkX Graph. | |||||
| Graphs bewteen which the kernel is computed. | |||||
| Returns | |||||
| ------- | |||||
| kernel: float | |||||
| The computed kernel. | |||||
| # Notes | |||||
| # ----- | |||||
| # This method is abstract and must be implemented by a subclass. | |||||
| """ | |||||
| raise NotImplementedError('Pairwise kernel computation is not implemented!') | |||||
| def compute_edit_costs(self, Y=None, Y_targets=None): | |||||
| """Compute edit cost constants. When optimizing method is `fiited`, | |||||
| apply Jia2021's metric learning method by using a given target graphs (Y) | |||||
| the fitted graphs (X / self._graphs). | |||||
| Parameters | |||||
| ---------- | |||||
| Y : TYPE, optional | |||||
| DESCRIPTION. The default is None. | |||||
| Returns | |||||
| ------- | |||||
| None. | |||||
| """ | |||||
| # Get or compute. | |||||
| if self.optim_method == 'random': | |||||
| self._edit_cost_constants = np.random.rand(6) | |||||
| elif self.optim_method == 'init': | |||||
| self._edit_cost_constants = self.init_edit_cost_constants | |||||
| elif self.optim_method == 'expert': | |||||
| self._edit_cost_constants = [3, 3, 1, 3, 3, 1] | |||||
| elif self.optim_method == 'fitted': # Jia2021 method | |||||
| # Get proper inputs. | |||||
| if Y is None: | |||||
| check_is_fitted(self, ['_graphs']) | |||||
| check_is_fitted(self, ['_targets']) | |||||
| graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) | |||||
| targets = self._targets | |||||
| else: | |||||
| graphs = ([g.copy() for g in Y] if self.copy_graphs else Y) | |||||
| targets = Y_targets | |||||
| # Get optimization options. | |||||
| node_labels = self.node_labels | |||||
| edge_labels = self.edge_labels | |||||
| unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0) | |||||
| from gklearn.ged.model.optim_costs import compute_optimal_costs | |||||
| self._edit_cost_constants = compute_optimal_costs( | |||||
| graphs, targets, | |||||
| node_labels=node_labels, edge_labels=edge_labels, | |||||
| unlabeled=unlabeled, ed_method=self.ed_method, | |||||
| verbose=(self.verbose >= 2), | |||||
| **self.optim_options) | |||||
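# Note on optim_method: 'random' draws the six cost constants uniformly at random,
# 'init' keeps init_edit_cost_constants, 'expert' uses [3, 3, 1, 3, 3, 1], and
# 'fitted' learns them from the targets via compute_optimal_costs (Jia2021).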
| ########################################################################## | |||||
| # The following is the 2nd paradigm to compute the distance matrix. It is | |||||
| # simplified and not compatible with `scikit-learn`. | |||||
| ########################################################################## | |||||
| # def compute(self, *graphs, **kwargs): | |||||
| # self.parallel = kwargs.get('parallel', 'imap_unordered') | |||||
| # self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) | |||||
| # self.normalize = kwargs.get('normalize', True) | |||||
| # self.verbose = kwargs.get('verbose', 2) | |||||
| # self.copy_graphs = kwargs.get('copy_graphs', True) | |||||
| # self.save_unnormed = kwargs.get('save_unnormed', True) | |||||
| # self.validate_parameters() | |||||
| # # If the inputs is a list of graphs. | |||||
| # if len(graphs) == 1: | |||||
| # if not isinstance(graphs[0], list): | |||||
| # raise Exception('Cannot detect graphs.') | |||||
| # elif len(graphs[0]) == 0: | |||||
| # raise Exception('The graph list given is empty. No computation was performed.') | |||||
| # else: | |||||
| # if self.copy_graphs: | |||||
| # self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. | |||||
| # else: | |||||
| # self._graphs = graphs | |||||
| # self._gram_matrix = self._compute_gram_matrix() | |||||
| # if self.save_unnormed: | |||||
| # self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||||
| # if self.normalize: | |||||
| # self._gram_matrix = normalize_gram_matrix(self._gram_matrix) | |||||
| # return self._gram_matrix, self._run_time | |||||
| # elif len(graphs) == 2: | |||||
| # # If the inputs are two graphs. | |||||
| # if self.is_graph(graphs[0]) and self.is_graph(graphs[1]): | |||||
| # if self.copy_graphs: | |||||
| # G0, G1 = graphs[0].copy(), graphs[1].copy() | |||||
| # else: | |||||
| # G0, G1 = graphs[0], graphs[1] | |||||
| # kernel = self._compute_single_kernel(G0, G1) | |||||
| # return kernel, self._run_time | |||||
| # # If the inputs are a graph and a list of graphs. | |||||
| # elif self.is_graph(graphs[0]) and isinstance(graphs[1], list): | |||||
| # if self.copy_graphs: | |||||
| # g1 = graphs[0].copy() | |||||
| # g_list = [g.copy() for g in graphs[1]] | |||||
| # kernel_list = self._compute_kernel_list(g1, g_list) | |||||
| # else: | |||||
| # kernel_list = self._compute_kernel_list(graphs[0], graphs[1]) | |||||
| # return kernel_list, self._run_time | |||||
| # elif isinstance(graphs[0], list) and self.is_graph(graphs[1]): | |||||
| # if self.copy_graphs: | |||||
| # g1 = graphs[1].copy() | |||||
| # g_list = [g.copy() for g in graphs[0]] | |||||
| # kernel_list = self._compute_kernel_list(g1, g_list) | |||||
| # else: | |||||
| # kernel_list = self._compute_kernel_list(graphs[1], graphs[0]) | |||||
| # return kernel_list, self._run_time | |||||
| # else: | |||||
| # raise Exception('Cannot detect graphs.') | |||||
| # elif len(graphs) == 0 and self._graphs is None: | |||||
| # raise Exception('Please add graphs before computing.') | |||||
| # else: | |||||
| # raise Exception('Cannot detect graphs.') | |||||
| # def normalize_gm(self, gram_matrix): | |||||
| # import warnings | |||||
| # warnings.warn('gklearn.kernels.graph_kernel.normalize_gm will be deprecated, use gklearn.utils.normalize_gram_matrix instead', DeprecationWarning) | |||||
| # diag = gram_matrix.diagonal().copy() | |||||
| # for i in range(len(gram_matrix)): | |||||
| # for j in range(i, len(gram_matrix)): | |||||
| # gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) | |||||
| # gram_matrix[j][i] = gram_matrix[i][j] | |||||
| # return gram_matrix | |||||
| # def compute_distance_matrix(self): | |||||
| # if self._gram_matrix is None: | |||||
| # raise Exception('Please compute the Gram matrix before computing distance matrix.') | |||||
| # dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix))) | |||||
| # for i in range(len(self._gram_matrix)): | |||||
| # for j in range(i, len(self._gram_matrix)): | |||||
| # dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j] | |||||
| # if dis < 0: | |||||
| # if dis > -1e-10: | |||||
| # dis = 0 | |||||
| # else: | |||||
| # raise ValueError('The distance is negative.') | |||||
| # dis_mat[i, j] = np.sqrt(dis) | |||||
| # dis_mat[j, i] = dis_mat[i, j] | |||||
| # dis_max = np.max(np.max(dis_mat)) | |||||
| # dis_min = np.min(np.min(dis_mat[dis_mat != 0])) | |||||
| # dis_mean = np.mean(np.mean(dis_mat)) | |||||
| # return dis_mat, dis_max, dis_min, dis_mean | |||||
| def _compute_X_distance_matrix(self): | |||||
| start_time = time.time() | |||||
| if self.parallel == 'imap_unordered': | |||||
| dis_matrix = self._compute_X_dm_imap_unordered() | |||||
| elif self.parallel is None: | |||||
| graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) | |||||
| dis_matrix = self._compute_X_dm_series(graphs) | |||||
| else: | |||||
| raise Exception('Parallel mode is not set correctly.') | |||||
| self._run_time = time.time() - start_time | |||||
| if self.verbose: | |||||
| print('Distance matrix of size %d built in %s seconds.' | |||||
| % (len(self._graphs), self._run_time)) | |||||
| return dis_matrix | |||||
| def _compute_X_dm_series(self, graphs): | |||||
| N = len(graphs) | |||||
| dis_matrix = np.zeros((N, N)) | |||||
| for i, G1 in get_iters(enumerate(graphs), desc='Computing distance matrix', file=sys.stdout, verbose=(self.verbose >= 2)): | |||||
| for j, G2 in enumerate(graphs[i+1:], i+1): | |||||
| dis_matrix[i, j], _ = self.compute_ged(G1, G2) | |||||
| dis_matrix[j, i] = dis_matrix[i, j] | |||||
| return dis_matrix | |||||
| def _compute_X_dm_imap_unordered(self, graphs): | |||||
| pass | |||||
| def compute_ged(self, Gi, Gj, **kwargs): | |||||
| """ | |||||
| Compute the GED between two graphs according to edit_cost. | |||||
| """ | |||||
| ged_options = {'edit_cost': self.edit_cost_fun, | |||||
| 'method': self.ed_method, | |||||
| 'edit_cost_constants': self._edit_cost_constants} | |||||
| dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10) | |||||
| n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward, | |||||
| edit_cost=self.edit_cost_fun, | |||||
| node_labels=self.node_labels, | |||||
| edge_labels=self.edge_labels) | |||||
| return dis, n_eo_tmp | |||||
| # def _compute_kernel_list(self, g1, g_list): | |||||
| # start_time = time.time() | |||||
| # if self.parallel == 'imap_unordered': | |||||
| # kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) | |||||
| # elif self.parallel is None: | |||||
| # kernel_list = self._compute_kernel_list_series(g1, g_list) | |||||
| # else: | |||||
| # raise Exception('Parallel mode is not set correctly.') | |||||
| # self._run_time = time.time() - start_time | |||||
| # if self.verbose: | |||||
| # print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.' | |||||
| # % (len(g_list), self._run_time)) | |||||
| # return kernel_list | |||||
| # def _compute_kernel_list_series(self, g1, g_list): | |||||
| # pass | |||||
| # def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||||
| # pass | |||||
| # def _compute_single_kernel(self, g1, g2): | |||||
| # start_time = time.time() | |||||
| # kernel = self._compute_single_kernel_series(g1, g2) | |||||
| # self._run_time = time.time() - start_time | |||||
| # if self.verbose: | |||||
| # print('Graph kernel bewteen two graphs built in %s seconds.' % (self._run_time)) | |||||
| # return kernel | |||||
| # def _compute_single_kernel_series(self, g1, g2): | |||||
| # pass | |||||
| def is_graph(self, graph): | |||||
| if isinstance(graph, nx.Graph): | |||||
| return True | |||||
| if isinstance(graph, nx.DiGraph): | |||||
| return True | |||||
| if isinstance(graph, nx.MultiGraph): | |||||
| return True | |||||
| if isinstance(graph, nx.MultiDiGraph): | |||||
| return True | |||||
| return False | |||||
| @property | |||||
| def graphs(self): | |||||
| return self._graphs | |||||
| # @property | |||||
| # def parallel(self): | |||||
| # return self.parallel | |||||
| # @property | |||||
| # def n_jobs(self): | |||||
| # return self.n_jobs | |||||
| # @property | |||||
| # def verbose(self): | |||||
| # return self.verbose | |||||
| # @property | |||||
| # def normalize(self): | |||||
| # return self.normalize | |||||
| @property | |||||
| def run_time(self): | |||||
| return self._run_time | |||||
| @property | |||||
| def dis_matrix(self): | |||||
| return self._dis_matrix | |||||
| @dis_matrix.setter | |||||
| def dis_matrix(self, value): | |||||
| self._dis_matrix = value | |||||
| # @property | |||||
| # def gram_matrix_unnorm(self): | |||||
| # return self._gram_matrix_unnorm | |||||
| # @gram_matrix_unnorm.setter | |||||
| # def gram_matrix_unnorm(self, value): | |||||
| # self._gram_matrix_unnorm = value | |||||
| @@ -0,0 +1,149 @@ | |||||
| import numpy as np | |||||
| from gklearn.ged.model.distances import sum_squares, euclid_d | |||||
| from gklearn.ged.model.ged_com import compute_geds | |||||
| def optimize_costs_unlabeled(nb_cost_mat, dis_k_vec): | |||||
| """ | |||||
| Optimize edit costs to fit dis_k_vec according to the edit operations in nb_cost_mat. | |||||
| ! Take care that nb_cost_mat does not contain rows of zeros. | |||||
| :param nb_cost_mat: matrix in \mathbb{N}^{N \times 6} encoding the number of edit operations for each pair of graphs | |||||
| :param dis_k_vec: The N distances to fit | |||||
| """ | |||||
| import cvxpy as cp | |||||
| import numpy as np | |||||
| MAX_SAMPLE = 1000 | |||||
| nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]] for x in nb_cost_mat]) | |||||
| dis_k_vec = np.array(dis_k_vec) | |||||
| # dis_k_vec_norm = dis_k_vec/np.max(dis_k_vec) | |||||
| # import pickle | |||||
| # pickle.dump([nb_cost_mat, dis_k_vec], open('debug', 'wb')) | |||||
| N = nb_cost_mat_m.shape[0] | |||||
| sub_sample = np.random.permutation(np.arange(N)) | |||||
| sub_sample = sub_sample[:MAX_SAMPLE] | |||||
| x = cp.Variable(nb_cost_mat_m.shape[1]) | |||||
| cost = cp.sum_squares((nb_cost_mat_m[sub_sample, :] @ x) - dis_k_vec[sub_sample]) | |||||
| prob = cp.Problem(cp.Minimize(cost), [x >= 0]) | |||||
| prob.solve() | |||||
| edit_costs_new = [x.value[0], x.value[1], 0, x.value[2], x.value[3], 0] | |||||
| edit_costs_new = [xi if xi > 0 else 0 for xi in edit_costs_new] | |||||
| residual = prob.value | |||||
| return edit_costs_new, residual | |||||
| def optimize_costs_classif_unlabeled(nb_cost_mat, Y): | |||||
| """ | |||||
| Optimize edit costs to fit the class labels Y according to the edit | |||||
| operations in nb_cost_mat. | |||||
| ! Take care that nb_cost_mat does not contain rows of zeros. | |||||
| :param nb_cost_mat: matrix in \mathbb{N}^{N \times 6} encoding the number of edit | |||||
| operations for each pair of graphs | |||||
| :param Y: {-1,1}^N vector indicating whether the graphs of each pair share the same class | |||||
| """ | |||||
| # import cvxpy as cp | |||||
| from ml import reg_log | |||||
| # import pickle | |||||
| # pickle.dump([nb_cost_mat, Y], open('debug', 'wb')) | |||||
| nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]] | |||||
| for x in nb_cost_mat]) | |||||
| w, J, _ = reg_log(nb_cost_mat_m, Y, pos_contraint=True) | |||||
| edit_costs_new = [w[0], w[1], 0, w[2], w[3], 0] | |||||
| residual = J[-1] | |||||
| return edit_costs_new, residual | |||||
| def optimize_costs_classif(nb_cost_mat, Y): | |||||
| """ | |||||
| Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat | |||||
| ! Take care that nb_cost_mat does not contain rows of zeros. | |||||
| :param nb_cost_mat: matrix in \mathbb{N}^{N \times 6} encoding the number of edit operations for each pair of graphs | |||||
| :param Y: {-1,1}^N vector indicating whether the graphs of each pair share the same class | |||||
| """ | |||||
| #import pickle | |||||
| # pickle.dump([nb_cost_mat, Y], open("test.pickle", "wb")) | |||||
| from ml import reg_log | |||||
| w, J, _ = reg_log(nb_cost_mat, Y, pos_contraint=True) | |||||
| return w, J[-1] | |||||
| def optimize_costs(nb_cost_mat, dis_k_vec): | |||||
| """ | |||||
| Optimize edit costs to fit dis_k_vec according to the edit operations in nb_cost_mat. | |||||
| ! Take care that nb_cost_mat does not contain rows of zeros. | |||||
| :param nb_cost_mat: matrix in \mathbb{N}^{N \times 6} encoding the number of edit operations for each pair of graphs | |||||
| :param dis_k_vec: The N distances to fit | |||||
| """ | |||||
| import cvxpy as cp | |||||
| x = cp.Variable(nb_cost_mat.shape[1]) | |||||
| cost = cp.sum_squares((nb_cost_mat @ x) - dis_k_vec) | |||||
| constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], | |||||
| np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | |||||
| np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | |||||
| prob = cp.Problem(cp.Minimize(cost), constraints) | |||||
| prob.solve() | |||||
| edit_costs_new = x.value | |||||
| residual = prob.value | |||||
| return edit_costs_new, residual | |||||
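# A toy sanity-check sketch for optimize_costs (requires cvxpy; the numbers are
# illustrative): when the target distances are generated with unit costs, the
# fitted costs reproduce them with a residual close to zero.
#
#     nb_cost_mat = np.array([[1., 0., 1., 2., 0., 1.],
#                             [0., 2., 0., 1., 1., 0.],
#                             [2., 1., 0., 0., 2., 1.],
#                             [1., 1., 1., 1., 1., 1.]])
#     dis_k_vec = nb_cost_mat @ np.ones(6)
#     costs, residual = optimize_costs(nb_cost_mat, dis_k_vec)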
| def compute_optimal_costs(G, y, init_costs=[3, 3, 1, 3, 3, 1], | |||||
| y_distance=euclid_d, | |||||
| mode='reg', unlabeled=False, | |||||
| ed_method='BIPARTITE', | |||||
| verbose=True, | |||||
| **kwargs): | |||||
| N = len(y) | |||||
| G_pairs = [] | |||||
| distances_vec = [] | |||||
| for i in range(N): | |||||
| for j in range(i+1, N): | |||||
| G_pairs.append([i, j]) | |||||
| distances_vec.append(y_distance(y[i], y[j])) | |||||
| ged_vec_init, n_edit_operations = compute_geds(G_pairs, G, init_costs, ed_method, | |||||
| verbose=verbose, **kwargs) | |||||
| residual_list = [sum_squares(ged_vec_init, distances_vec)] | |||||
| if (mode == 'reg'): | |||||
| if unlabeled: | |||||
| method_optim = optimize_costs_unlabeled | |||||
| else: | |||||
| method_optim = optimize_costs | |||||
| elif (mode == 'classif'): | |||||
| if unlabeled: | |||||
| method_optim = optimize_costs_classif_unlabeled | |||||
| else: | |||||
| method_optim = optimize_costs_classif | |||||
| ite_max = 5 | |||||
| for i in range(ite_max): | |||||
| if verbose: | |||||
| print('ite', i + 1, '/', ite_max, ':') | |||||
| # compute GEDs and numbers of edit operations. | |||||
| edit_costs_new, residual = method_optim( | |||||
| np.array(n_edit_operations), distances_vec) | |||||
| ged_vec, n_edit_operations = compute_geds(G_pairs, G, edit_costs_new, ed_method, | |||||
| verbose=verbose, **kwargs) | |||||
| residual_list.append(sum_squares(ged_vec, distances_vec)) | |||||
| return edit_costs_new | |||||
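# Note: the loop above alternates between (i) fitting edit costs to the target
# distances by least squares, given the current edit-operation counts, and
# (ii) recomputing GEDs and counts with the new costs, for ite_max iterations;
# the costs fitted in the last iteration are returned.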
| def get_optimal_costs_GH2020(**kwargs): | |||||
| import pickle | |||||
| import os | |||||
| dir_root = 'cj/output/' | |||||
| ds_name = kwargs.get('ds_name') | |||||
| nb_trial = kwargs.get('nb_trial') | |||||
| file_name = os.path.join(dir_root, 'costs.' + ds_name + '.' + str(nb_trial) + '.pkl') | |||||
| with open(file_name, 'rb') as f: | |||||
| edit_costs = pickle.load(f) | |||||
| return edit_costs | |||||
| @@ -64,10 +64,12 @@ def pairwise_ged(g1, g2, options={}, sort=True, repeats=1, parallel=False, verbo | |||||
| g = listID[0] | g = listID[0] | ||||
| h = listID[1] | h = listID[1] | ||||
| dis_min = np.inf | dis_min = np.inf | ||||
| # print('------------------------------------------') | |||||
| for i in range(0, repeats): | for i in range(0, repeats): | ||||
| ged_env.run_method(g, h) | ged_env.run_method(g, h) | ||||
| upper = ged_env.get_upper_bound(g, h) | upper = ged_env.get_upper_bound(g, h) | ||||
| dis = upper | dis = upper | ||||
| # print(dis) | |||||
| if dis < dis_min: | if dis < dis_min: | ||||
| dis_min = dis | dis_min = dis | ||||
| pi_forward = ged_env.get_forward_map(g, h) | pi_forward = ged_env.get_forward_map(g, h) | ||||
| @@ -169,12 +171,100 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True | |||||
| return ged_vec, ged_mat, n_edit_operations | return ged_vec, ged_mat, n_edit_operations | ||||
| def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, n_jobs=None, verbose=True): | |||||
| #%% | |||||
| def compute_geds(graphs, | |||||
| options={}, | |||||
| sort=True, | |||||
| repeats=1, | |||||
| permute_nodes=False, | |||||
| random_state=None, | |||||
| parallel=False, | |||||
| n_jobs=None, | |||||
| verbose=True): | |||||
| """Compute graph edit distance matrix using GEDLIB. | |||||
| """ | |||||
| if permute_nodes: | |||||
| return _compute_geds_with_permutation(graphs, | |||||
| options=options, | |||||
| sort=sort, | |||||
| repeats=repeats, | |||||
| random_state=random_state, | |||||
| parallel=parallel, | |||||
| n_jobs=n_jobs, | |||||
| verbose=verbose) | |||||
| else: | |||||
| return _compute_geds_without_permutation(graphs, | |||||
| options=options, | |||||
| sort=sort, | |||||
| repeats=repeats, | |||||
| parallel=parallel, | |||||
| n_jobs=n_jobs, | |||||
| verbose=verbose) | |||||
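| # Hedged usage sketch (illustrative only; further GEDLIB options such as the | |||||
| # GED method are typically also required in `options`): | |||||
| # ged_vec, ged_mat, n_eo = compute_geds( | |||||
| # graphs, | |||||
| # options={'edit_cost': 'CONSTANT', | |||||
| # 'edit_cost_constants': [3, 3, 1, 3, 3, 1]}, | |||||
| # repeats=3, permute_nodes=True, random_state=0) | |||||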
| #%% | |||||
| def _compute_geds_with_permutation(graphs, | |||||
| options={}, | |||||
| sort=True, | |||||
| repeats=1, | |||||
| random_state=None, | |||||
| parallel=False, | |||||
| n_jobs=None, | |||||
| verbose=True): | |||||
| from gklearn.utils.utils import nx_permute_nodes | |||||
| # Initialize variables. | |||||
| ged_mat_optim = np.full((len(graphs), len(graphs)), np.inf) | |||||
| np.fill_diagonal(ged_mat_optim, 0) | |||||
| len_itr = int(len(graphs) * (len(graphs) - 1) / 2) | |||||
| ged_vec = [0] * len_itr | |||||
| n_edit_operations = [0] * len_itr | |||||
| # For each repeat: | |||||
| for _ in range(0, repeats): | |||||
| # Permute nodes. | |||||
| graphs_pmut = [nx_permute_nodes(g, random_state=random_state) for g in graphs] | |||||
| out = _compute_geds_without_permutation(graphs_pmut, | |||||
| options=options, | |||||
| sort=sort, | |||||
| repeats=1, | |||||
| parallel=parallel, | |||||
| n_jobs=n_jobs, | |||||
| verbose=verbose) | |||||
| # Compare current results with the best one. | |||||
| idx_cnt = 0 | |||||
| for i in range(len(graphs)): | |||||
| for j in range(i + 1, len(graphs)): | |||||
| if out[1][i, j] < ged_mat_optim[i, j]: | |||||
| ged_mat_optim[i, j] = out[1][i, j] | |||||
| ged_mat_optim[j, i] = out[1][j, i] | |||||
| ged_vec[idx_cnt] = out[0][idx_cnt] | |||||
| n_edit_operations[idx_cnt] = out[2][idx_cnt] | |||||
| idx_cnt += 1 | |||||
| return ged_vec, ged_mat_optim, n_edit_operations | |||||
| def _compute_geds_without_permutation(graphs, | |||||
| options={}, | |||||
| sort=True, | |||||
| repeats=1, | |||||
| parallel=False, | |||||
| n_jobs=None, | |||||
| verbose=True): | |||||
| from gklearn.gedlib import librariesImport, gedlibpy | from gklearn.gedlib import librariesImport, gedlibpy | ||||
| # initialize ged env. | # initialize ged env. | ||||
| ged_env = gedlibpy.GEDEnv() | ged_env = gedlibpy.GEDEnv() | ||||
| ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) | ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) | ||||
| for g in graphs: | for g in graphs: | ||||
| ged_env.add_nx_graph(g, '') | ged_env.add_nx_graph(g, '') | ||||
| listID = ged_env.get_all_graph_ids() | listID = ged_env.get_all_graph_ids() | ||||
| @@ -266,6 +356,11 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats): | |||||
| dis = upper | dis = upper | ||||
| # make the map label correct (label remove map as np.inf) | # make the map label correct (label remove map as np.inf) | ||||
| # Attention: using node indices instead of NetworkX node labels (as | |||||
| # implemented here) may cause several issues: | |||||
| # - It fails if NetworkX node labels are not consecutive integers; | |||||
| # - It returns wrong mappings if nodes are permuted (e.g., by | |||||
| # `gklearn.utils.utils.nx_permute_nodes()`). | |||||
| nodes1 = [n for n in g1.nodes()] | nodes1 = [n for n in g1.nodes()] | ||||
| nodes2 = [n for n in g2.nodes()] | nodes2 = [n for n in g2.nodes()] | ||||
| nb1 = nx.number_of_nodes(g1) | nb1 = nx.number_of_nodes(g1) | ||||
| @@ -278,46 +373,57 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats): | |||||
| pi_forward_min = pi_forward | pi_forward_min = pi_forward | ||||
| pi_backward_min = pi_backward | pi_backward_min = pi_backward | ||||
| # print('-----') | |||||
| # print(pi_forward_min) | |||||
| # print(pi_backward_min) | |||||
| return dis_min, pi_forward_min, pi_backward_min | return dis_min, pi_forward_min, pi_backward_min | ||||
| def label_costs_to_matrix(costs, nb_labels): | |||||
| """Reform a label cost vector to a matrix. | |||||
| #%% | |||||
| def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs): | |||||
| """Calculate the numbers of the occurence of each edit operation in a given | |||||
| edit path. | |||||
| Parameters | Parameters | ||||
| ---------- | ---------- | ||||
| costs : numpy.array | |||||
| The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitition costs, edge insertion costs, edge deletion costs, edge substitition costs. | |||||
| nb_labels : integer | |||||
| Number of labels. | |||||
| g1 : NetworkX graph | |||||
| The first graph of the pair. | |||||
| g2 : NetworkX graph | |||||
| The second graph of the pair. | |||||
| forward_map : list | |||||
| The forward node map (from g1 to g2) returned by the GED computation. | |||||
| backward_map : list | |||||
| The backward node map (from g2 to g1) returned by the GED computation. | |||||
| edit_cost : string, optional | |||||
| Name of the edit cost setting (e.g., 'CONSTANT'). The default is None. | |||||
| is_cml : boolean, optional | |||||
| Whether label cost matrices are used (the "cml" setting). The default is False. | |||||
| **kwargs : keyword arguments | |||||
| Additional information such as node and edge label names. | |||||
| Raises | |||||
| ------ | |||||
| Exception | |||||
| If the given edit cost is not supported. | |||||
| Returns | Returns | ||||
| ------- | ------- | ||||
| cost_matrix : numpy.array. | |||||
| The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData. | |||||
| tuple | |||||
| The numbers (or summed costs) of each edit operation type, in the order: node insertions, node removals, node substitutions, edge insertions, edge removals, edge substitutions. | |||||
| Notes | |||||
| ----- | |||||
| Attention: when implementing a function to get the numbers of edit | |||||
| operations, make sure that: | |||||
| - It does not fail if NetworkX node labels are not consecutive integers; | |||||
| - It returns correct results if nodes are permuted (e.g., by | |||||
| `gklearn.utils.utils.nx_permute_nodes()`). | |||||
| In general, this means you need to distinguish the NetworkX label of | |||||
| a node from the position (index) of that node in the node list. | |||||
| """ | """ | ||||
| # Initialize label cost matrix. | |||||
| cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1)) | |||||
| i = 0 | |||||
| # Costs of insertions. | |||||
| for col in range(1, nb_labels + 1): | |||||
| cost_matrix[0, col] = costs[i] | |||||
| i += 1 | |||||
| # Costs of deletions. | |||||
| for row in range(1, nb_labels + 1): | |||||
| cost_matrix[row, 0] = costs[i] | |||||
| i += 1 | |||||
| # Costs of substitutions. | |||||
| for row in range(1, nb_labels + 1): | |||||
| for col in range(row + 1, nb_labels + 1): | |||||
| cost_matrix[row, col] = costs[i] | |||||
| cost_matrix[col, row] = costs[i] | |||||
| i += 1 | |||||
| return cost_matrix | |||||
| def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs): | |||||
| if is_cml: | if is_cml: | ||||
| if edit_cost == 'CONSTANT': | if edit_cost == 'CONSTANT': | ||||
| node_labels = kwargs.get('node_labels', []) | node_labels = kwargs.get('node_labels', []) | ||||
| @@ -611,6 +717,48 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, | |||||
| return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es | return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es | ||||
| #%% | |||||
| def label_costs_to_matrix(costs, nb_labels): | |||||
| """Reform a label cost vector to a matrix. | |||||
| Parameters | |||||
| ---------- | |||||
| costs : numpy.array | |||||
| The vector of costs between labels, given in the order: insertion costs (one per label), deletion costs (one per label), then substitution costs (one per unordered pair of distinct labels). | |||||
| nb_labels : integer | |||||
| Number of labels. | |||||
| Returns | |||||
| ------- | |||||
| cost_matrix : numpy.array | |||||
| The reshaped label cost matrix of size (nb_labels + 1, nb_labels + 1). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData. | |||||
| """ | |||||
| # Initialize label cost matrix. | |||||
| cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1)) | |||||
| i = 0 | |||||
| # Costs of insertions. | |||||
| for col in range(1, nb_labels + 1): | |||||
| cost_matrix[0, col] = costs[i] | |||||
| i += 1 | |||||
| # Costs of deletions. | |||||
| for row in range(1, nb_labels + 1): | |||||
| cost_matrix[row, 0] = costs[i] | |||||
| i += 1 | |||||
| # Costs of substitutions. | |||||
| for row in range(1, nb_labels + 1): | |||||
| for col in range(row + 1, nb_labels + 1): | |||||
| cost_matrix[row, col] = costs[i] | |||||
| cost_matrix[col, row] = costs[i] | |||||
| i += 1 | |||||
| return cost_matrix | |||||
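| # Hedged example (illustrative only): with nb_labels = 2 and | |||||
| # costs = np.array([i1, i2, d1, d2, s12]), the result is the 3 x 3 matrix | |||||
| # [[0, i1, i2], | |||||
| # [d1, 0, s12], | |||||
| # [d2, s12, 0]], | |||||
| # where row/column 0 stands for the dummy label. | |||||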
| #%% | |||||
| def ged_options_to_string(options): | def ged_options_to_string(options): | ||||
| opt_str = ' ' | opt_str = ' ' | ||||
| for key, val in options.items(): | for key, val in options.items(): | ||||
| @@ -32,7 +32,13 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. | https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. | ||||
| """ | """ | ||||
| def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2): | |||||
| def __init__(self, | |||||
| parallel=None, | |||||
| n_jobs=None, | |||||
| chunksize=None, | |||||
| normalize=True, | |||||
| copy_graphs=True, # make sure a full deep copy is made (and that it is fast)! | |||||
| verbose=2): | |||||
| """`__init__` for `GraphKernel` object.""" | """`__init__` for `GraphKernel` object.""" | ||||
| # @todo: the default settings of the parameters are different from those in the self.compute method. | # @todo: the default settings of the parameters are different from those in the self.compute method. | ||||
| # self._graphs = None | # self._graphs = None | ||||
| @@ -40,6 +46,7 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| self.n_jobs = n_jobs | self.n_jobs = n_jobs | ||||
| self.chunksize = chunksize | self.chunksize = chunksize | ||||
| self.normalize = normalize | self.normalize = normalize | ||||
| self.copy_graphs = copy_graphs | |||||
| self.verbose = verbose | self.verbose = verbose | ||||
| # self._run_time = 0 | # self._run_time = 0 | ||||
| # self._gram_matrix = None | # self._gram_matrix = None | ||||
| @@ -90,7 +97,7 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| return self | return self | ||||
| def transform(self, X): | |||||
| def transform(self, X=None, load_gm_train=False): | |||||
| """Compute the graph kernel matrix between given and fitted data. | """Compute the graph kernel matrix between given and fitted data. | ||||
| Parameters | Parameters | ||||
| @@ -108,6 +115,12 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| None. | None. | ||||
| """ | """ | ||||
| # If `load_gm_train`, return the stored Gram matrix of the training data. | |||||
| if load_gm_train: | |||||
| check_is_fitted(self, '_gm_train') | |||||
| self._is_transformed = True | |||||
| return self._gm_train # @todo: copy or not? | |||||
| # Check if method "fit" had been called. | # Check if method "fit" had been called. | ||||
| check_is_fitted(self, '_graphs') | check_is_fitted(self, '_graphs') | ||||
| @@ -133,8 +146,7 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| return kernel_matrix | return kernel_matrix | ||||
| def fit_transform(self, X): | |||||
| def fit_transform(self, X, save_gm_train=False): | |||||
| """Fit and transform: compute Gram matrix on the same data. | """Fit and transform: compute Gram matrix on the same data. | ||||
| Parameters | Parameters | ||||
| @@ -164,6 +176,9 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| finally: | finally: | ||||
| np.seterr(**old_settings) | np.seterr(**old_settings) | ||||
| if save_gm_train: | |||||
| self._gm_train = gram_matrix | |||||
| return gram_matrix | return gram_matrix | ||||
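| # Hedged usage sketch (illustrative only; the kernel class, its options and | |||||
| # the data are hypothetical): | |||||
| # gk = Treelet(node_labels=['atom'], sub_kernel=gaussian_kernel) | |||||
| # gm = gk.fit_transform(graphs_train, save_gm_train=True) | |||||
| # gm_again = gk.transform(load_gm_train=True) # reuses the stored Gram matrix | |||||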
| @@ -260,7 +275,9 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y) | kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y) | ||||
| elif self.parallel is None: | elif self.parallel is None: | ||||
| kernel_matrix = self._compute_kernel_matrix_series(Y) | |||||
| Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y) | |||||
| graphs_copy = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) | |||||
| kernel_matrix = self._compute_kernel_matrix_series(Y_copy, graphs_copy) | |||||
| self._run_time = time.time() - start_time | self._run_time = time.time() - start_time | ||||
| if self.verbose: | if self.verbose: | ||||
| @@ -270,26 +287,25 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| return kernel_matrix | return kernel_matrix | ||||
| def _compute_kernel_matrix_series(self, Y): | |||||
| """Compute the kernel matrix between a given target graphs (Y) and | |||||
| the fitted graphs (X / self._graphs) without parallelization. | |||||
| def _compute_kernel_matrix_series(self, X, Y): | |||||
| """Compute the kernel matrix between two sets of graphs (X and Y) without parallelization. | |||||
| Parameters | Parameters | ||||
| ---------- | ---------- | ||||
| Y : list of graphs, optional | |||||
| The target graphs. | |||||
| X, Y : list of graphs | |||||
| The input graphs. | |||||
| Returns | Returns | ||||
| ------- | ------- | ||||
| kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||||
| kernel_matrix : numpy array, shape = [n_X, n_Y] | |||||
| The computed kernel matrix. | The computed kernel matrix. | ||||
| """ | """ | ||||
| kernel_matrix = np.zeros((len(Y), len(self._graphs))) | |||||
| kernel_matrix = np.zeros((len(X), len(Y))) | |||||
| for i_y, g_y in enumerate(Y): | |||||
| for i_x, g_x in enumerate(self._graphs): | |||||
| kernel_matrix[i_y, i_x] = self.pairwise_kernel(g_y, g_x) | |||||
| for i_x, g_x in enumerate(X): | |||||
| for i_y, g_y in enumerate(Y): | |||||
| kernel_matrix[i_x, i_y] = self.pairwise_kernel(g_x, g_y) | |||||
| return kernel_matrix | return kernel_matrix | ||||
| @@ -335,14 +351,16 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| except NotFittedError: | except NotFittedError: | ||||
| # Compute diagonals of X. | # Compute diagonals of X. | ||||
| self._X_diag = np.empty(shape=(len(self._graphs),)) | self._X_diag = np.empty(shape=(len(self._graphs),)) | ||||
| for i, x in enumerate(self._graphs): | |||||
| graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) | |||||
| for i, x in enumerate(graphs): | |||||
| self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? | self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? | ||||
| try: | try: | ||||
| # If transform has happened, return both diagonals. | # If transform has happened, return both diagonals. | ||||
| check_is_fitted(self, ['_Y']) | check_is_fitted(self, ['_Y']) | ||||
| self._Y_diag = np.empty(shape=(len(self._Y),)) | self._Y_diag = np.empty(shape=(len(self._Y),)) | ||||
| for (i, y) in enumerate(self._Y): | |||||
| Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y) | |||||
| for (i, y) in enumerate(Y): | |||||
| self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? | self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? | ||||
| return self._X_diag, self._Y_diag | return self._X_diag, self._Y_diag | ||||
| @@ -484,7 +502,8 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| if self.parallel == 'imap_unordered': | if self.parallel == 'imap_unordered': | ||||
| gram_matrix = self._compute_gm_imap_unordered() | gram_matrix = self._compute_gm_imap_unordered() | ||||
| elif self.parallel is None: | elif self.parallel is None: | ||||
| gram_matrix = self._compute_gm_series() | |||||
| graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) | |||||
| gram_matrix = self._compute_gm_series(graphs) | |||||
| else: | else: | ||||
| raise Exception('Parallel mode is not set correctly.') | raise Exception('Parallel mode is not set correctly.') | ||||
| @@ -496,11 +515,11 @@ class GraphKernel(BaseEstimator): #, ABC): | |||||
| return gram_matrix | return gram_matrix | ||||
| def _compute_gm_series(self): | |||||
| def _compute_gm_series(self, graphs): | |||||
| pass | pass | ||||
| def _compute_gm_imap_unordered(self): | |||||
| def _compute_gm_imap_unordered(self, graphs): | |||||
| pass | pass | ||||
| @@ -28,16 +28,16 @@ from gklearn.kernels import GraphKernel | |||||
| class Treelet(GraphKernel): | class Treelet(GraphKernel): | ||||
| def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs): | |||||
| def __init__(self, **kwargs): | |||||
| """Initialise a treelet kernel. | """Initialise a treelet kernel. | ||||
| """ | """ | ||||
| super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose) | |||||
| GraphKernel.__init__(self, **{k: kwargs.get(k) for k in ['parallel', 'n_jobs', 'chunksize', 'normalize', 'copy_graphs', 'verbose'] if k in kwargs}) | |||||
| self.node_labels = kwargs.get('node_labels', []) | self.node_labels = kwargs.get('node_labels', []) | ||||
| self.edge_labels = kwargs.get('edge_labels', []) | self.edge_labels = kwargs.get('edge_labels', []) | ||||
| self.sub_kernel = kwargs.get('sub_kernel', None) | self.sub_kernel = kwargs.get('sub_kernel', None) | ||||
| self.ds_infos = kwargs.get('ds_infos', {}) | self.ds_infos = kwargs.get('ds_infos', {}) | ||||
| self.precompute_canonkeys = precompute_canonkeys | |||||
| self.save_canonkeys = save_canonkeys | |||||
| self.precompute_canonkeys = kwargs.get('precompute_canonkeys', True) | |||||
| self.save_canonkeys = kwargs.get('save_canonkeys', True) | |||||
| ########################################################################## | ########################################################################## | ||||
| @@ -71,7 +71,7 @@ class Treelet(GraphKernel): | |||||
| raise ValueError('Sub-kernel not set.') | raise ValueError('Sub-kernel not set.') | ||||
| def _compute_kernel_matrix_series(self, Y): | |||||
| def _compute_kernel_matrix_series(self, Y, X=None, load_canonkeys=True): | |||||
| """Compute the kernel matrix between a given target graphs (Y) and | """Compute the kernel matrix between a given target graphs (Y) and | ||||
| the fitted graphs (X / self._graphs) without parallelization. | the fitted graphs (X / self._graphs) without parallelization. | ||||
| @@ -86,36 +86,45 @@ class Treelet(GraphKernel): | |||||
| The computed kernel matrix. | The computed kernel matrix. | ||||
| """ | """ | ||||
| if_comp_X_canonkeys = True | |||||
| # If requested, load the saved canonkeys of X (self._graphs) from the instance: | |||||
| if load_canonkeys: | |||||
| # Canonical keys for self._graphs. | |||||
| try: | |||||
| check_is_fitted(self, ['_canonkeys']) | |||||
| canonkeys_list1 = self._canonkeys | |||||
| if_comp_X_canonkeys = False | |||||
| except NotFittedError: | |||||
| import warnings | |||||
| warnings.warn('The canonkeys of self._graphs are not computed/saved. The canonkeys of `X` will be computed instead.') | |||||
| if_comp_X_canonkeys = True | |||||
| # self._add_dummy_labels will modify the input in place. | |||||
| self._add_dummy_labels() # For self._graphs | |||||
| # Y = [g.copy() for g in Y] # @todo: ? | |||||
| self._add_dummy_labels(Y) | |||||
| # get all canonical keys of all graphs before computing kernels to save | # get all canonical keys of all graphs before computing kernels to save | ||||
| # time, but this may cost a lot of memory for large dataset. | # time, but this may cost a lot of memory for large dataset. | ||||
| # Canonical keys for self._graphs. | |||||
| try: | |||||
| check_is_fitted(self, ['_canonkeys']) | |||||
| canonkeys_list1 = self._canonkeys | |||||
| except NotFittedError: | |||||
| # Compute the canonical keys of X. | |||||
| if if_comp_X_canonkeys: | |||||
| if X is None: | |||||
| raise ValueError('X cannot be None.') | |||||
| # self._add_dummy_labels will modify the input in place. | |||||
| self._add_dummy_labels(X) # for X | |||||
| canonkeys_list1 = [] | canonkeys_list1 = [] | ||||
| iterator = get_iters(self._graphs, desc='getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
| iterator = get_iters(self._graphs, desc='Getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
| for g in iterator: | for g in iterator: | ||||
| canonkeys_list1.append(self._get_canonkeys(g)) | canonkeys_list1.append(self._get_canonkeys(g)) | ||||
| if self.save_canonkeys: | |||||
| self._canonkeys = canonkeys_list1 | |||||
| # Canonical keys for Y. | # Canonical keys for Y. | ||||
| # Y = [g.copy() for g in Y] # @todo: ? | |||||
| self._add_dummy_labels(Y) | |||||
| canonkeys_list2 = [] | canonkeys_list2 = [] | ||||
| iterator = get_iters(Y, desc='getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
| iterator = get_iters(Y, desc='Getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
| for g in iterator: | for g in iterator: | ||||
| canonkeys_list2.append(self._get_canonkeys(g)) | canonkeys_list2.append(self._get_canonkeys(g)) | ||||
| if self.save_canonkeys: | |||||
| self._Y_canonkeys = canonkeys_list2 | |||||
| # if self.save_canonkeys: | |||||
| # self._Y_canonkeys = canonkeys_list2 | |||||
| # compute kernel matrix. | # compute kernel matrix. | ||||
| kernel_matrix = np.zeros((len(Y), len(canonkeys_list1))) | kernel_matrix = np.zeros((len(Y), len(canonkeys_list1))) | ||||
| @@ -235,13 +244,13 @@ class Treelet(GraphKernel): | |||||
| ########################################################################## | ########################################################################## | ||||
| def _compute_gm_series(self): | |||||
| self._add_dummy_labels(self._graphs) | |||||
| def _compute_gm_series(self, graphs): | |||||
| self._add_dummy_labels(graphs) | |||||
| # get all canonical keys of all graphs before computing kernels to save | # get all canonical keys of all graphs before computing kernels to save | ||||
| # time, but this may cost a lot of memory for large dataset. | # time, but this may cost a lot of memory for large dataset. | ||||
| canonkeys = [] | canonkeys = [] | ||||
| iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout, | |||||
| iterator = get_iters(graphs, desc='getting canonkeys', file=sys.stdout, | |||||
| verbose=(self.verbose >= 2)) | verbose=(self.verbose >= 2)) | ||||
| for g in iterator: | for g in iterator: | ||||
| canonkeys.append(self._get_canonkeys(g)) | canonkeys.append(self._get_canonkeys(g)) | ||||
| @@ -250,11 +259,11 @@ class Treelet(GraphKernel): | |||||
| self._canonkeys = canonkeys | self._canonkeys = canonkeys | ||||
| # compute Gram matrix. | # compute Gram matrix. | ||||
| gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||||
| gram_matrix = np.zeros((len(graphs), len(graphs))) | |||||
| from itertools import combinations_with_replacement | from itertools import combinations_with_replacement | ||||
| itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||||
| len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||||
| itr = combinations_with_replacement(range(0, len(graphs)), 2) | |||||
| len_itr = int(len(graphs) * (len(graphs) + 1) / 2) | |||||
| iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | ||||
| length=len_itr, verbose=(self.verbose >= 2)) | length=len_itr, verbose=(self.verbose >= 2)) | ||||
| for i, j in iterator: | for i, j in iterator: | ||||
| @@ -390,6 +399,9 @@ class Treelet(GraphKernel): | |||||
| Treelet kernel between 2 graphs. | Treelet kernel between 2 graphs. | ||||
| """ | """ | ||||
| keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | ||||
| if len(keys) == 0: # There is nothing in common... | |||||
| return 0 | |||||
| vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) | vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) | ||||
| vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) | vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) | ||||
| @@ -28,7 +28,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| def __init__(self, **kwargs): | def __init__(self, **kwargs): | ||||
| GraphKernel.__init__(self) | |||||
| GraphKernel.__init__(self, **{k: kwargs.get(k) for k in ['parallel', 'n_jobs', 'chunksize', 'normalize', 'copy_graphs', 'verbose'] if k in kwargs}) | |||||
| self.node_labels = kwargs.get('node_labels', []) | self.node_labels = kwargs.get('node_labels', []) | ||||
| self.edge_labels = kwargs.get('edge_labels', []) | self.edge_labels = kwargs.get('edge_labels', []) | ||||
| self.height = int(kwargs.get('height', 0)) | self.height = int(kwargs.get('height', 0)) | ||||
| @@ -50,7 +50,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| ########################################################################## | ########################################################################## | ||||
| def _compute_gm_series(self): | |||||
| def _compute_gm_series(self, graphs): | |||||
| # if self.verbose >= 2: | # if self.verbose >= 2: | ||||
| # import warnings | # import warnings | ||||
| # warnings.warn('A part of the computation is parallelized.') | # warnings.warn('A part of the computation is parallelized.') | ||||
| @@ -59,19 +59,19 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| # for WL subtree kernel | # for WL subtree kernel | ||||
| if self._base_kernel == 'subtree': | if self._base_kernel == 'subtree': | ||||
| gram_matrix = self._subtree_kernel_do(self._graphs) | |||||
| gram_matrix = self._subtree_kernel_do(graphs) | |||||
| # for WL shortest path kernel | # for WL shortest path kernel | ||||
| elif self._base_kernel == 'sp': | elif self._base_kernel == 'sp': | ||||
| gram_matrix = self._sp_kernel_do(self._graphs) | |||||
| gram_matrix = self._sp_kernel_do(graphs) | |||||
| # for WL edge kernel | # for WL edge kernel | ||||
| elif self._base_kernel == 'edge': | elif self._base_kernel == 'edge': | ||||
| gram_matrix = self._edge_kernel_do(self._graphs) | |||||
| gram_matrix = self._edge_kernel_do(graphs) | |||||
| # for user defined base kernel | # for user defined base kernel | ||||
| else: | else: | ||||
| gram_matrix = self._user_kernel_do(self._graphs) | |||||
| gram_matrix = self._user_kernel_do(graphs) | |||||
| return gram_matrix | return gram_matrix | ||||
| @@ -204,70 +204,13 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| def pairwise_kernel(self, g1, g2): | def pairwise_kernel(self, g1, g2): | ||||
| Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster! | |||||
| kernel = 0 | |||||
| # initial for height = 0 | |||||
| all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||||
| # for each graph | |||||
| for G in Gn: | |||||
| # set all labels into a tuple. | |||||
| for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | |||||
| G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) | |||||
| # get the set of original labels | |||||
| labels_ori = list(nx.get_node_attributes(G, 'lt').values()) | |||||
| # number of occurence of each label in G | |||||
| all_num_of_each_label.append(dict(Counter(labels_ori))) | |||||
| # Compute subtree kernel with the 0th iteration and add it to the final kernel. | |||||
| kernel = self._compute_kernel_itr(kernel, all_num_of_each_label) | |||||
| # iterate each height | |||||
| for h in range(1, self.height + 1): | |||||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
| num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||||
| # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration | |||||
| all_num_of_each_label = [] # number of occurence of each label in G | |||||
| # @todo: parallel this part. | |||||
| for G in Gn: | |||||
| all_multisets = [] | |||||
| for node, attrs in G.nodes(data=True): | |||||
| # Multiset-label determination. | |||||
| multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]] | |||||
| # sorting each multiset | |||||
| multiset.sort() | |||||
| multiset = [attrs['lt']] + multiset # add the prefix | |||||
| all_multisets.append(tuple(multiset)) | |||||
| # label compression | |||||
| set_unique = list(set(all_multisets)) # set of unique multiset labels | |||||
| # a dictionary mapping original labels to new ones. | |||||
| set_compressed = {} | |||||
| # if a label occured before, assign its former compressed label, | |||||
| # else assign the number of labels occured + 1 as the compressed label. | |||||
| for value in set_unique: | |||||
| if value in all_set_compressed.keys(): | |||||
| set_compressed[value] = all_set_compressed[value] | |||||
| else: | |||||
| set_compressed[value] = str(num_of_labels_occured + 1) | |||||
| num_of_labels_occured += 1 | |||||
| all_set_compressed.update(set_compressed) | |||||
| # relabel nodes | |||||
| for idx, node in enumerate(G.nodes()): | |||||
| G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||||
| # get the set of compressed labels | |||||
| labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||||
| # all_labels_ori.update(labels_comp) | |||||
| all_num_of_each_label.append(dict(Counter(labels_comp))) | |||||
| # Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster! | |||||
| Gn = [g1, g2] | |||||
| # for WL subtree kernel | |||||
| if self._base_kernel == 'subtree': | |||||
| kernel = self._subtree_kernel_do(Gn, return_mat=False) | |||||
| # Compute subtree kernel with h iterations and add it to the final kernel | |||||
| kernel = self._compute_kernel_itr(kernel, all_num_of_each_label) | |||||
| # @todo: other subkernels. | |||||
| return kernel | return kernel | ||||
| @@ -291,7 +234,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| return kernel | return kernel | ||||
| def _subtree_kernel_do_nl(self, Gn): | |||||
| def _subtree_kernel_do_nl(self, Gn, return_mat=True): | |||||
| """Compute Weisfeiler-Lehman kernels between graphs with node labels. | """Compute Weisfeiler-Lehman kernels between graphs with node labels. | ||||
| Parameters | Parameters | ||||
| @@ -301,10 +244,11 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| Return | Return | ||||
| ------ | ------ | ||||
| gram_matrix : Numpy matrix | |||||
| kernel_matrix : Numpy matrix / float | |||||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. | Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. | ||||
| """ | """ | ||||
| gram_matrix = np.zeros((len(Gn), len(Gn))) | |||||
| kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0) | |||||
| gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr) | |||||
| # initial for height = 0 | # initial for height = 0 | ||||
| all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | ||||
| @@ -324,7 +268,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| all_num_of_each_label.append(dict(Counter(labels_ori))) | all_num_of_each_label.append(dict(Counter(labels_ori))) | ||||
| # Compute subtree kernel with the 0th iteration and add it to the final kernel. | # Compute subtree kernel with the 0th iteration and add it to the final kernel. | ||||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
| kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) | |||||
| # iterate each height | # iterate each height | ||||
| for h in range(1, self.height + 1): | for h in range(1, self.height + 1): | ||||
| @@ -342,12 +286,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | ||||
| # Compute subtree kernel with h iterations and add it to the final kernel | # Compute subtree kernel with h iterations and add it to the final kernel | ||||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
| kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) | |||||
| return gram_matrix | |||||
| return kernel_matrix | |||||
| def _subtree_kernel_do_el(self, Gn): | |||||
| def _subtree_kernel_do_el(self, Gn, return_mat=True): | |||||
| """Compute Weisfeiler-Lehman kernels between graphs with edge labels. | """Compute Weisfeiler-Lehman kernels between graphs with edge labels. | ||||
| Parameters | Parameters | ||||
| @@ -357,19 +301,20 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| Return | Return | ||||
| ------ | ------ | ||||
| gram_matrix : Numpy matrix | |||||
| kernel_matrix : Numpy matrix | |||||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. | Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. | ||||
| """ | """ | ||||
| gram_matrix = np.zeros((len(Gn), len(Gn))) | |||||
| kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0) | |||||
| gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr) | |||||
| # initial for height = 0 | # initial for height = 0 | ||||
| all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | ||||
| # Compute subtree kernel with the 0th iteration and add it to the final kernel. | # Compute subtree kernel with the 0th iteration and add it to the final kernel. | ||||
| iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) | |||||
| for i, j in iterator: | |||||
| gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) | |||||
| gram_matrix[j][i] = gram_matrix[i][j] | |||||
| if return_mat: | |||||
| iterator = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| for i, j in iterator: | |||||
| kernel_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) | |||||
| kernel_matrix[j][i] = kernel_matrix[i][j] | |||||
| else: # pairwise case: add the contribution of the single pair (Gn[0], Gn[1]). | |||||
| kernel_matrix += nx.number_of_nodes(Gn[0]) * nx.number_of_nodes(Gn[1]) | |||||
| # if h >= 1. | # if h >= 1. | ||||
| @@ -393,7 +338,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | ||||
| # Compute subtree kernel with h iterations and add it to the final kernel. | # Compute subtree kernel with h iterations and add it to the final kernel. | ||||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
| kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) | |||||
| # Iterate along heights (>= 2). | # Iterate along heights (>= 2). | ||||
| @@ -407,12 +352,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | ||||
| # Compute subtree kernel with h iterations and add it to the final kernel. | # Compute subtree kernel with h iterations and add it to the final kernel. | ||||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
| kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) | |||||
| return gram_matrix | |||||
| return kernel_matrix | |||||
| def _subtree_kernel_do_labeled(self, Gn): | |||||
| def _subtree_kernel_do_labeled(self, Gn, return_mat=True): | |||||
| """Compute Weisfeiler-Lehman kernels between graphs with both node and | """Compute Weisfeiler-Lehman kernels between graphs with both node and | ||||
| edge labels. | edge labels. | ||||
| @@ -423,10 +368,11 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| Return | Return | ||||
| ------ | ------ | ||||
| gram_matrix : Numpy matrix | |||||
| kernel_matrix : Numpy matrix | |||||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. | Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. | ||||
| """ | """ | ||||
| gram_matrix = np.zeros((len(Gn), len(Gn))) | |||||
| kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0) | |||||
| gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr) | |||||
| # initial for height = 0 | # initial for height = 0 | ||||
| all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | ||||
| @@ -446,10 +392,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| all_num_of_each_label.append(dict(Counter(labels_ori))) | all_num_of_each_label.append(dict(Counter(labels_ori))) | ||||
| # Compute subtree kernel with the 0th iteration and add it to the final kernel. | # Compute subtree kernel with the 0th iteration and add it to the final kernel. | ||||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
| kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) | |||||
| # if h >= 1. | |||||
| # if h >= 1: | |||||
| if self.height > 0: | if self.height > 0: | ||||
| # Set all edge labels into a tuple. # @todo: remove this original labels or not? | # Set all edge labels into a tuple. # @todo: remove this original labels or not? | ||||
| if self.verbose >= 2: | if self.verbose >= 2: | ||||
| @@ -470,7 +416,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | ||||
| # Compute subtree kernel with h iterations and add it to the final kernel. | # Compute subtree kernel with h iterations and add it to the final kernel. | ||||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
| kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) | |||||
| # Iterate along heights. | # Iterate along heights. | ||||
| @@ -484,12 +430,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | ||||
| # Compute subtree kernel with h iterations and add it to the final kernel. | # Compute subtree kernel with h iterations and add it to the final kernel. | ||||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
| kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) | |||||
| return gram_matrix | |||||
| return kernel_matrix | |||||
| def _subtree_kernel_do_unlabeled(self, Gn): | |||||
| def _subtree_kernel_do_unlabeled(self, Gn, return_mat=True): | |||||
| """Compute Weisfeiler-Lehman kernels between graphs without labels. | """Compute Weisfeiler-Lehman kernels between graphs without labels. | ||||
| Parameters | Parameters | ||||
| @@ -499,19 +445,20 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| Return | Return | ||||
| ------ | ------ | ||||
| gram_matrix : Numpy matrix | |||||
| kernel_matrix : Numpy matrix | |||||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. | Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. | ||||
| """ | """ | ||||
| gram_matrix = np.zeros((len(Gn), len(Gn))) | |||||
| kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0) | |||||
| gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr) | |||||
| # initial for height = 0 | # initial for height = 0 | ||||
| all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | ||||
| # Compute subtree kernel with the 0th iteration and add it to the final kernel. | # Compute subtree kernel with the 0th iteration and add it to the final kernel. | ||||
| iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) | |||||
| for i, j in iterator: | |||||
| gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) | |||||
| gram_matrix[j][i] = gram_matrix[i][j] | |||||
| if return_mat: | |||||
| iterator = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| for i, j in iterator: | |||||
| kernel_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) | |||||
| kernel_matrix[j][i] = kernel_matrix[i][j] | |||||
| else: # pairwise case: add the contribution of the single pair (Gn[0], Gn[1]). | |||||
| kernel_matrix += nx.number_of_nodes(Gn[0]) * nx.number_of_nodes(Gn[1]) | |||||
| # if h >= 1. | # if h >= 1. | ||||
| @@ -526,7 +473,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | ||||
| # Compute subtree kernel with h iterations and add it to the final kernel. | # Compute subtree kernel with h iterations and add it to the final kernel. | ||||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
| kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) | |||||
| # Iterate along heights (>= 2). | # Iterate along heights (>= 2). | ||||
| @@ -540,9 +487,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | ||||
| # Compute subtree kernel with h iterations and add it to the final kernel. | # Compute subtree kernel with h iterations and add it to the final kernel. | ||||
| self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
| kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) | |||||
| return gram_matrix | |||||
| return kernel_matrix | |||||
| def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | ||||
| @@ -717,6 +664,8 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
| all_num_of_each_label[j]) | all_num_of_each_label[j]) | ||||
| gram_matrix[j][i] = gram_matrix[i][j] | gram_matrix[j][i] = gram_matrix[i][j] | ||||
| return gram_matrix | |||||
| def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2): | def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2): | ||||
| """Compute the subtree kernel. | """Compute the subtree kernel. | ||||
| @@ -0,0 +1,24 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Fri Jun 24 14:25:57 2022 | |||||
| @author: ljia | |||||
| """ | |||||
| from ._split import BaseCrossValidatorWithValid | |||||
| # from ._split import BaseShuffleSplit | |||||
| from ._split import KFoldWithValid | |||||
| # from ._split import GroupKFold | |||||
| # from ._split import StratifiedKFoldWithValid | |||||
| # from ._split import TimeSeriesSplit | |||||
| # from ._split import LeaveOneGroupOut | |||||
| # from ._split import LeaveOneOut | |||||
| # from ._split import LeavePGroupsOut | |||||
| # from ._split import LeavePOut | |||||
| from ._split import RepeatedKFoldWithValid | |||||
| # from ._split import RepeatedStratifiedKFold | |||||
| # from ._split import ShuffleSplit | |||||
| # from ._split import GroupShuffleSplit | |||||
| # from ._split import StratifiedShuffleSplit | |||||
| # from ._split import StratifiedGroupKFold | |||||
| # from ._split import PredefinedSplit | |||||
| @@ -0,0 +1,287 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Fri Jun 24 11:13:26 2022 | |||||
| @author: ljia | |||||
| Reference: scikit-learn. | |||||
| """ | |||||
| from abc import abstractmethod | |||||
| import numbers | |||||
| import warnings | |||||
| import numpy as np | |||||
| from sklearn.utils import check_random_state, check_array, column_or_1d, indexable | |||||
| from sklearn.utils.validation import _num_samples | |||||
| from sklearn.utils.multiclass import type_of_target | |||||
| class BaseCrossValidatorWithValid(object): | |||||
| """Base class for all cross-validators. | |||||
| Implementations must define `_iter_valid_test_masks` or `_iter_valid_test_indices`. | |||||
| """ | |||||
| def split(self, X, y=None, groups=None): | |||||
| """Generate indices to split data into training, valid, and test set. | |||||
| Parameters | |||||
| ---------- | |||||
| X : array-like of shape (n_samples, n_features) | |||||
| Training data, where `n_samples` is the number of samples | |||||
| and `n_features` is the number of features. | |||||
| y : array-like of shape (n_samples,) | |||||
| The target variable for supervised learning problems. | |||||
| groups : array-like of shape (n_samples,), default=None | |||||
| Group labels for the samples used while splitting the dataset into | |||||
| train/valid/test sets. | |||||
| Yields | |||||
| ------ | |||||
| train : ndarray | |||||
| The training set indices for that split. | |||||
| valid : ndarray | |||||
| The valid set indices for that split. | |||||
| test : ndarray | |||||
| The testing set indices for that split. | |||||
| """ | |||||
| X, y, groups = indexable(X, y, groups) | |||||
| indices = np.arange(_num_samples(X)) | |||||
| for valid_index, test_index in self._iter_valid_test_masks(X, y, groups): | |||||
| train_index = indices[np.logical_not(np.logical_or(valid_index, test_index))] | |||||
| valid_index = indices[valid_index] | |||||
| test_index = indices[test_index] | |||||
| yield train_index, valid_index, test_index | |||||
| # Since subclasses must implement either _iter_valid_test_masks or | |||||
| # _iter_valid_test_indices, neither can be abstract. | |||||
| def _iter_valid_test_masks(self, X=None, y=None, groups=None): | |||||
| """Generates boolean masks corresponding to valid and test sets. | |||||
| By default, delegates to _iter_valid_test_indices(X, y, groups) | |||||
| """ | |||||
| for valid_index, test_index in self._iter_valid_test_indices(X, y, groups): | |||||
| valid_mask = np.zeros(_num_samples(X), dtype=bool) | |||||
| test_mask = np.zeros(_num_samples(X), dtype=bool) | |||||
| valid_mask[valid_index] = True | |||||
| test_mask[test_index] = True | |||||
| yield valid_mask, test_mask | |||||
| def _iter_valid_test_indices(self, X=None, y=None, groups=None): | |||||
| """Generates integer indices corresponding to valid and test sets.""" | |||||
| raise NotImplementedError | |||||
| @abstractmethod | |||||
| def get_n_splits(self, X=None, y=None, groups=None): | |||||
| """Returns the number of splitting iterations in the cross-validator""" | |||||
| def __repr__(self): | |||||
| return _build_repr(self) | |||||
| class _BaseKFoldWithValid(BaseCrossValidatorWithValid): | |||||
| """Base class for KFoldWithValid, GroupKFoldWithValid, and StratifiedKFoldWithValid""" | |||||
| @abstractmethod | |||||
| def __init__(self, n_splits, *, stratify, shuffle, random_state): | |||||
| if not isinstance(n_splits, numbers.Integral): | |||||
| raise ValueError( | |||||
| 'The number of folds must be of Integral type. ' | |||||
| '%s of type %s was passed.' % (n_splits, type(n_splits)) | |||||
| ) | |||||
| n_splits = int(n_splits) | |||||
| if n_splits <= 2: | |||||
| raise ValueError( | |||||
| 'k-fold cross-validation requires at least one' | |||||
| ' train/valid/test split by setting n_splits=3 or more,' | |||||
| ' got n_splits={0}.'.format(n_splits) | |||||
| ) | |||||
| if not isinstance(shuffle, bool): | |||||
| raise TypeError('shuffle must be True or False; got {0}'.format(shuffle)) | |||||
| if not shuffle and random_state is not None: # None is the default | |||||
| raise ValueError( | |||||
| 'Setting a random_state has no effect since shuffle is ' | |||||
| 'False. You should leave ' | |||||
| 'random_state to its default (None), or set shuffle=True.', | |||||
| ) | |||||
| self.n_splits = n_splits | |||||
| self.stratify = stratify | |||||
| self.shuffle = shuffle | |||||
| self.random_state = random_state | |||||
| def split(self, X, y=None, groups=None): | |||||
| """Generate indices to split data into training, valid and test set.""" | |||||
| X, y, groups = indexable(X, y, groups) | |||||
| n_samples = _num_samples(X) | |||||
| if self.n_splits > n_samples: | |||||
| raise ValueError( | |||||
| ( | |||||
| 'Cannot have number of splits n_splits={0} greater' | |||||
| ' than the number of samples: n_samples={1}.' | |||||
| ).format(self.n_splits, n_samples) | |||||
| ) | |||||
| for train, valid, test in super().split(X, y, groups): | |||||
| yield train, valid, test | |||||
| class KFoldWithValid(_BaseKFoldWithValid): | |||||
| def __init__( | |||||
| self, | |||||
| n_splits=5, | |||||
| *, | |||||
| stratify=False, | |||||
| shuffle=False, | |||||
| random_state=None | |||||
| ): | |||||
| super().__init__( | |||||
| n_splits=n_splits, | |||||
| stratify=stratify, | |||||
| shuffle=shuffle, | |||||
| random_state=random_state | |||||
| ) | |||||
| def _make_valid_test_folds(self, X, y=None): | |||||
| rng = check_random_state(self.random_state) | |||||
| y = np.asarray(y) | |||||
| type_of_target_y = type_of_target(y) | |||||
| allowed_target_types = ('binary', 'multiclass') | |||||
| if type_of_target_y not in allowed_target_types: | |||||
| raise ValueError( | |||||
| 'Supported target types are: {}. Got {!r} instead.'.format( | |||||
| allowed_target_types, type_of_target_y | |||||
| ) | |||||
| ) | |||||
| y = column_or_1d(y) | |||||
| _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True) | |||||
| # y_inv encodes y according to lexicographic order. We invert y_idx to | |||||
| # map the classes so that they are encoded by order of appearance: | |||||
| # 0 represents the first label appearing in y, 1 the second, etc. | |||||
| _, class_perm = np.unique(y_idx, return_inverse=True) | |||||
| y_encoded = class_perm[y_inv] | |||||
| n_classes = len(y_idx) | |||||
| y_counts = np.bincount(y_encoded) | |||||
| min_groups = np.min(y_counts) | |||||
| if np.all(self.n_splits > y_counts): | |||||
| raise ValueError( | |||||
| "n_splits=%d cannot be greater than the" | |||||
| " number of members in each class." % (self.n_splits) | |||||
| ) | |||||
| if self.n_splits > min_groups: | |||||
| warnings.warn( | |||||
| "The least populated class in y has only %d" | |||||
| " members, which is less than n_splits=%d." | |||||
| % (min_groups, self.n_splits), | |||||
| UserWarning, | |||||
| ) | |||||
| # Determine the optimal number of samples from each class in each fold, | |||||
| # using round robin over the sorted y. (This can be done direct from | |||||
| # counts, but that code is unreadable.) | |||||
| y_order = np.sort(y_encoded) | |||||
| allocation = np.asarray( | |||||
| [ | |||||
| np.bincount(y_order[i :: self.n_splits], minlength=n_classes) | |||||
| for i in range(self.n_splits) | |||||
| ] | |||||
| ) | |||||
| # To maintain the data order dependencies as best as possible within | |||||
| # the stratification constraint, we assign samples from each class in | |||||
| # blocks (and then mess that up when shuffle=True). | |||||
| test_folds = np.empty(len(y), dtype='i') | |||||
| for k in range(n_classes): | |||||
| # since the kth column of allocation stores the number of samples | |||||
| # of class k in each test set, this generates blocks of fold | |||||
| # indices corresponding to the allocation for class k. | |||||
| folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k]) | |||||
| if self.shuffle: | |||||
| rng.shuffle(folds_for_class) | |||||
| test_folds[y_encoded == k] = folds_for_class | |||||
| return test_folds | |||||
| def _iter_valid_test_masks(self, X, y=None, groups=None): | |||||
| test_folds = self._make_valid_test_folds(X, y) | |||||
| for i in range(self.n_splits): | |||||
| if i + 1 < self.n_splits: | |||||
| j = i + 1 | |||||
| else: | |||||
| j = 0 | |||||
| yield test_folds == i, test_folds == j | |||||
| def split(self, X, y, groups=None): | |||||
| y = check_array(y, input_name='y', ensure_2d=False, dtype=None) | |||||
| return super().split(X, y, groups) | |||||
| class _RepeatedSplitsWithValid(object): | |||||
| def __init__( | |||||
| self, | |||||
| cv, | |||||
| *, | |||||
| n_repeats=10, | |||||
| random_state=None, | |||||
| **cvargs | |||||
| ): | |||||
| if not isinstance(n_repeats, int): | |||||
| raise ValueError('Number of repetitions must be of integer type.') | |||||
| if n_repeats <= 0: | |||||
| raise ValueError('Number of repetitions must be greater than 0.') | |||||
| self.cv = cv | |||||
| self.n_repeats = n_repeats | |||||
| self.random_state = random_state | |||||
| self.cvargs = cvargs | |||||
| def split(self, X, y=None, groups=None): | |||||
| n_repeats = self.n_repeats | |||||
| rng = check_random_state(self.random_state) | |||||
| for idx in range(n_repeats): | |||||
| cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) | |||||
| for train_index, valid_index, test_index in cv.split(X, y, groups): | |||||
| yield train_index, valid_index, test_index | |||||
| class RepeatedKFoldWithValid(_RepeatedSplitsWithValid): | |||||
| def __init__( | |||||
| self, | |||||
| *, | |||||
| n_splits=5, | |||||
| n_repeats=10, | |||||
| stratify=False, | |||||
| random_state=None | |||||
| ): | |||||
| super().__init__( | |||||
| KFoldWithValid, | |||||
| n_repeats=n_repeats, | |||||
| stratify=stratify, | |||||
| random_state=random_state, | |||||
| n_splits=n_splits, | |||||
| ) | |||||
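| # Hedged usage sketch (illustrative only; synthetic data, assuming | |||||
| # scikit-learn and numpy are available as imported above): | |||||
| if __name__ == '__main__': | |||||
| X_demo = np.arange(20).reshape(-1, 1) | |||||
| y_demo = np.array([0, 1] * 10) # binary class labels | |||||
| cv = KFoldWithValid(n_splits=5, shuffle=True, random_state=0) | |||||
| for train_idx, valid_idx, test_idx in cv.split(X_demo, y_demo): | |||||
| # Each split yields disjoint train / valid / test index arrays. | |||||
| print(len(train_idx), len(valid_idx), len(test_idx)) | |||||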
| @@ -4,7 +4,7 @@ These kernels are defined between pairs of vectors. | |||||
| import numpy as np | import numpy as np | ||||
| def delta_kernel(x, y): | |||||
| def kronecker_delta_kernel(x, y): | |||||
| """Delta kernel. Return 1 if x == y, 0 otherwise. | """Delta kernel. Return 1 if x == y, 0 otherwise. | ||||
| Parameters | Parameters | ||||
| @@ -23,6 +23,10 @@ def delta_kernel(x, y): | |||||
| labeled graphs. In Proceedings of the 20th International Conference on | labeled graphs. In Proceedings of the 20th International Conference on | ||||
| Machine Learning, Washington, DC, United States, 2003. | Machine Learning, Washington, DC, United States, 2003. | ||||
| """ | """ | ||||
| return (1 if np.array_equal(x, y) else 0) | |||||
| def delta_kernel(x, y): | |||||
| return x == y #(1 if condition else 0) | return x == y #(1 if condition else 0) | ||||
| @@ -64,6 +68,11 @@ def gaussian_kernel(x, y, gamma=None): | |||||
| return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma) | return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma) | ||||
| def tanimoto_kernel(x, y): | |||||
| xy = np.dot(x, y) | |||||
| return xy / (np.dot(x, x) + np.dot(y, y) - xy) | |||||
| def gaussiankernel(x, y, gamma=None): | def gaussiankernel(x, y, gamma=None): | ||||
| return gaussian_kernel(x, y, gamma=gamma) | return gaussian_kernel(x, y, gamma=gamma) | ||||
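The Tanimoto kernel added here is the dot product normalised by the union, <x, y> / (<x, x> + <y, y> - <x, y>); for binary vectors this equals the intersection-over-union of their supports. A quick sanity check (a sketch, assuming numpy arrays):

    import numpy as np

    x = np.array([1, 1, 0, 1])
    y = np.array([1, 0, 0, 1])

    # <x, y> = 2, <x, x> = 3, <y, y> = 2  ->  2 / (3 + 2 - 2) = 2/3
    print(tanimoto_kernel(x, y))  # 0.666...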
| @@ -123,7 +132,7 @@ def linearkernel(x, y): | |||||
| def cosine_kernel(x, y): | def cosine_kernel(x, y): | ||||
| return np.dot(x, y) / (np.abs(x) * np.abs(y)) | |||||
| return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) | |||||
| def sigmoid_kernel(x, y, gamma=None, coef0=1): | def sigmoid_kernel(x, y, gamma=None, coef0=1): | ||||
| @@ -142,7 +151,7 @@ def laplacian_kernel(x, y, gamma=None): | |||||
| if gamma is None: | if gamma is None: | ||||
| gamma = 1.0 / len(x) | gamma = 1.0 / len(x) | ||||
| k = -gamma * np.abs(np.subtract(x, y)) | |||||
| k = -gamma * np.linalg.norm(np.subtract(x, y)) | |||||
| k = np.exp(k) | k = np.exp(k) | ||||
| return k | return k | ||||
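Both fixes in this file replace np.abs with np.linalg.norm, which is not cosmetic: np.abs is element-wise, so for vector inputs the old cosine and Laplacian kernels returned arrays instead of scalar kernel values. A small check of the corrected versions (a sketch):

    import numpy as np

    x = np.array([3.0, 4.0])
    y = np.array([0.0, 1.0])

    # cosine: <x, y> / (||x|| * ||y||) = 4 / (5 * 1) = 0.8
    print(cosine_kernel(x, y))     # 0.8

    # Laplacian with the default gamma = 1 / len(x) = 0.5: exp(-0.5 * ||x - y||)
    print(laplacian_kernel(x, y))  # exp(-0.5 * sqrt(18)) ~= 0.12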
| @@ -7,6 +7,9 @@ from enum import Enum, unique | |||||
| # from tqdm import tqdm | # from tqdm import tqdm | ||||
| #%% | |||||
| def getSPLengths(G1): | def getSPLengths(G1): | ||||
| sp = nx.shortest_path(G1) | sp = nx.shortest_path(G1) | ||||
| distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes())) | distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes())) | ||||
| @@ -286,81 +289,146 @@ def direct_product_graph(G1, G2, node_labels, edge_labels): | |||||
| return gt | return gt | ||||
| def graph_deepcopy(G): | |||||
| """Deep copy a graph, including deep copy of all nodes, edges and | |||||
| attributes of the graph, nodes and edges. | |||||
| def find_paths(G, source_node, length): | |||||
| """Find all paths with a certain length those start from a source node. | |||||
| A recursive depth first search is applied. | |||||
| Note | |||||
| ---- | |||||
| It is the same as the NetworkX function graph.copy(), as far as I know. | |||||
| Parameters | |||||
| ---------- | |||||
| G : NetworkX graphs | |||||
| The graph in which paths are searched. | |||||
| source_node : integer | |||||
| The node from which all paths start. | |||||
| length : integer | |||||
| The length of paths. | |||||
| Return | |||||
| ------ | |||||
| path : list of list | |||||
| List of paths retrieved, where each path is represented by a list of nodes. | |||||
| """ | """ | ||||
| # add graph attributes. | |||||
| labels = {} | |||||
| for k, v in G.graph.items(): | |||||
| labels[k] = deepcopy(v) | |||||
| if G.is_directed(): | |||||
| G_copy = nx.DiGraph(**labels) | |||||
| else: | |||||
| G_copy = nx.Graph(**labels) | |||||
| if length == 0: | |||||
| return [[source_node]] | |||||
| path = [[source_node] + path for neighbor in G[source_node] \ | |||||
| for path in find_paths(G, neighbor, length - 1) if source_node not in path] | |||||
| return path | |||||
| # add nodes | |||||
| for nd, attrs in G.nodes(data=True): | |||||
| labels = {} | |||||
| for k, v in attrs.items(): | |||||
| labels[k] = deepcopy(v) | |||||
| G_copy.add_node(nd, **labels) | |||||
| # add edges. | |||||
| for nd1, nd2, attrs in G.edges(data=True): | |||||
| labels = {} | |||||
| for k, v in attrs.items(): | |||||
| labels[k] = deepcopy(v) | |||||
| G_copy.add_edge(nd1, nd2, **labels) | |||||
| def find_all_paths(G, length, is_directed): | |||||
| """Find all paths with a certain length in a graph. A recursive depth first | |||||
| search is applied. | |||||
| return G_copy | |||||
| Parameters | |||||
| ---------- | |||||
| G : NetworkX graphs | |||||
| The graph in which paths are searched. | |||||
| length : integer | |||||
| The length of paths. | |||||
| Return | |||||
| ------ | |||||
| path : list of list | |||||
| List of paths retrieved, where each path is represented by a list of nodes. | |||||
| """ | |||||
| all_paths = [] | |||||
| for node in G: | |||||
| all_paths.extend(find_paths(G, node, length)) | |||||
| def graph_isIdentical(G1, G2): | |||||
| """Check if two graphs are identical, including: same nodes, edges, node | |||||
| labels/attributes, edge labels/attributes. | |||||
| if not is_directed: | |||||
| # For each undirected path, two representations are retrieved, one from each extremity. | |||||
| # Remove one of them. | |||||
| all_paths_r = [path[::-1] for path in all_paths] | |||||
| for idx, path in enumerate(all_paths[:-1]): | |||||
| for path2 in all_paths_r[idx+1::]: | |||||
| if path == path2: | |||||
| all_paths[idx] = [] | |||||
| break | |||||
| all_paths = list(filter(lambda a: a != [], all_paths)) | |||||
| Notes | |||||
| ----- | |||||
| 1. The type of graphs has to be the same. | |||||
| return all_paths | |||||
| 2. Global/Graph attributes are neglected as they may contain names for graphs. | |||||
| """ | |||||
| # check nodes. | |||||
| nlist1 = [n for n in G1.nodes(data=True)] | |||||
| nlist2 = [n for n in G2.nodes(data=True)] | |||||
| if not nlist1 == nlist2: | |||||
| return False | |||||
| # check edges. | |||||
| elist1 = [n for n in G1.edges(data=True)] | |||||
| elist2 = [n for n in G2.edges(data=True)] | |||||
| if not elist1 == elist2: | |||||
| return False | |||||
| # check graph attributes. | |||||
| return True | |||||
| # @todo: use it in ShortestPath. | |||||
| def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]): | |||||
| """Compute kernels between each pair of vertices in two graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| g1, g2 : NetworkX graph | |||||
| The kernels between pairs of vertices in these two graphs are computed. | |||||
| node_kernels : dict | |||||
| A dictionary of kernel functions for nodes, including 3 items: 'symb' | |||||
| for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' | |||||
| for both labels. The first 2 functions take two node labels as | |||||
| parameters, and the 'mix' function takes 4 parameters, a symbolic and a | |||||
| non-symbolic label for each the two nodes. Each label is in form of 2-D | |||||
| dimension array (n_samples, n_features). Each function returns a number | |||||
| as the kernel value. Ignored when nodes are unlabeled. This argument | |||||
| is designated to conjugate gradient method and fixed-point iterations. | |||||
| node_labels : list, optional | |||||
| The list of the name strings of the node labels. The default is []. | |||||
| node_attrs : list, optional | |||||
| The list of the name strings of the node attributes. The default is []. | |||||
| def get_node_labels(Gn, node_label): | |||||
| """Get node labels of dataset Gn. | |||||
| """ | |||||
| nl = set() | |||||
| for G in Gn: | |||||
| nl = nl | set(nx.get_node_attributes(G, node_label).values()) | |||||
| return nl | |||||
| Returns | |||||
| ------- | |||||
| vk_dict : dict | |||||
| Vertex kernels keyed by vertices. | |||||
| Notes | |||||
| ----- | |||||
| This function is used by ``gklearn.kernels.FixedPoint'' and | |||||
| ``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1]. | |||||
| def get_edge_labels(Gn, edge_label): | |||||
| """Get edge labels of dataset Gn. | |||||
| References | |||||
| ---------- | |||||
| .. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang. | |||||
| Parallelization of shortest path graph kernels on multi-core cpus and gpus. | |||||
| Proceedings of the Programmability Issues for Heterogeneous Multicores | |||||
| (MultiProg), Vienna, Austria, 2014. | |||||
| """ | """ | ||||
| el = set() | |||||
| for G in Gn: | |||||
| el = el | set(nx.get_edge_attributes(G, edge_label).values()) | |||||
| return el | |||||
| vk_dict = {} # vertex kernels keyed by pairs of vertices | |||||
| if len(node_labels) > 0: | |||||
| # node symb and non-symb labeled | |||||
| if len(node_attrs) > 0: | |||||
| kn = node_kernels['mix'] | |||||
| for n1 in g1.nodes(data=True): | |||||
| for n2 in g2.nodes(data=True): | |||||
| n1_labels = [n1[1][nl] for nl in node_labels] | |||||
| n2_labels = [n2[1][nl] for nl in node_labels] | |||||
| n1_attrs = [n1[1][na] for na in node_attrs] | |||||
| n2_attrs = [n2[1][na] for na in node_attrs] | |||||
| vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) | |||||
| # node symb labeled | |||||
| else: | |||||
| kn = node_kernels['symb'] | |||||
| for n1 in g1.nodes(data=True): | |||||
| for n2 in g2.nodes(data=True): | |||||
| n1_labels = [n1[1][nl] for nl in node_labels] | |||||
| n2_labels = [n2[1][nl] for nl in node_labels] | |||||
| vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) | |||||
| else: | |||||
| # node non-symb labeled | |||||
| if len(node_attrs) > 0: | |||||
| kn = node_kernels['nsymb'] | |||||
| for n1 in g1.nodes(data=True): | |||||
| for n2 in g2.nodes(data=True): | |||||
| n1_attrs = [n1[1][na] for na in node_attrs] | |||||
| n2_attrs = [n2[1][na] for na in node_attrs] | |||||
| vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) | |||||
| # node unlabeled | |||||
| else: | |||||
| pass # @todo: add edge weights. | |||||
| # for e1 in g1.edges(data=True): | |||||
| # for e2 in g2.edges(data=True): | |||||
| # if e1[2]['cost'] == e2[2]['cost']: | |||||
| # kernel += 1 | |||||
| # return kernel | |||||
| return vk_dict | |||||
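A minimal sketch of how compute_vertex_kernels is called; the lambda below is a simplified stand-in for the 'symb' kernel described in the docstring (only symbolic node labels are used here, so the 'nsymb' and 'mix' entries are never reached):

    import networkx as nx

    g1 = nx.Graph()
    g1.add_node(0, atom='C')
    g1.add_node(1, atom='O')
    g2 = nx.Graph()
    g2.add_node(0, atom='C')
    g2.add_node(1, atom='N')

    node_kernels = {
        'symb': lambda l1, l2: 1 if l1 == l2 else 0,  # stand-in delta kernel on label lists
        'nsymb': None,
        'mix': None,
    }
    vk = compute_vertex_kernels(g1, g2, node_kernels, node_labels=['atom'])
    print(vk[(0, 0)], vk[(0, 1)], vk[(1, 1)])  # 1 0 0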
| #%% | |||||
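And a small example of the path-enumeration helpers added earlier in this hunk, on the undirected path graph 0-1-2:

    import networkx as nx

    G = nx.path_graph(3)            # edges: 0-1, 1-2
    print(find_paths(G, 0, 2))      # [[0, 1, 2]]
    print(find_all_paths(G, 1, is_directed=False))
    # each undirected path of length 1 is kept once, e.g. [[1, 0], [2, 1]]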
| def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs): | def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs): | ||||
| @@ -513,79 +581,6 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, d | |||||
| print('\ncomplete.') | print('\ncomplete.') | ||||
| def find_paths(G, source_node, length): | |||||
| """Find all paths with a certain length those start from a source node. | |||||
| A recursive depth first search is applied. | |||||
| Parameters | |||||
| ---------- | |||||
| G : NetworkX graphs | |||||
| The graph in which paths are searched. | |||||
| source_node : integer | |||||
| The number of the node from where all paths start. | |||||
| length : integer | |||||
| The length of paths. | |||||
| Return | |||||
| ------ | |||||
| path : list of list | |||||
| List of paths retrieved, where each path is represented by a list of nodes. | |||||
| """ | |||||
| if length == 0: | |||||
| return [[source_node]] | |||||
| path = [[source_node] + path for neighbor in G[source_node] \ | |||||
| for path in find_paths(G, neighbor, length - 1) if source_node not in path] | |||||
| return path | |||||
| def find_all_paths(G, length, is_directed): | |||||
| """Find all paths with a certain length in a graph. A recursive depth first | |||||
| search is applied. | |||||
| Parameters | |||||
| ---------- | |||||
| G : NetworkX graphs | |||||
| The graph in which paths are searched. | |||||
| length : integer | |||||
| The length of paths. | |||||
| Return | |||||
| ------ | |||||
| path : list of list | |||||
| List of paths retrieved, where each path is represented by a list of nodes. | |||||
| """ | |||||
| all_paths = [] | |||||
| for node in G: | |||||
| all_paths.extend(find_paths(G, node, length)) | |||||
| if not is_directed: | |||||
| # For each path, two presentations are retrieved from its two extremities. | |||||
| # Remove one of them. | |||||
| all_paths_r = [path[::-1] for path in all_paths] | |||||
| for idx, path in enumerate(all_paths[:-1]): | |||||
| for path2 in all_paths_r[idx+1::]: | |||||
| if path == path2: | |||||
| all_paths[idx] = [] | |||||
| break | |||||
| all_paths = list(filter(lambda a: a != [], all_paths)) | |||||
| return all_paths | |||||
| def get_mlti_dim_node_attrs(G, attr_names): | |||||
| attributes = [] | |||||
| for nd, attrs in G.nodes(data=True): | |||||
| attributes.append(tuple(attrs[aname] for aname in attr_names)) | |||||
| return attributes | |||||
| def get_mlti_dim_edge_attrs(G, attr_names): | |||||
| attributes = [] | |||||
| for ed, attrs in G.edges(data=True): | |||||
| attributes.append(tuple(attrs[aname] for aname in attr_names)) | |||||
| return attributes | |||||
| def normalize_gram_matrix(gram_matrix): | def normalize_gram_matrix(gram_matrix): | ||||
| diag = gram_matrix.diagonal().copy() | diag = gram_matrix.diagonal().copy() | ||||
| old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | ||||
| @@ -621,84 +616,162 @@ def compute_distance_matrix(gram_matrix): | |||||
| return dis_mat, dis_max, dis_min, dis_mean | return dis_mat, dis_max, dis_min, dis_mean | ||||
| # @todo: use it in ShortestPath. | |||||
| def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]): | |||||
| """Compute kernels between each pair of vertices in two graphs. | |||||
| #%% | |||||
| def graph_deepcopy(G): | |||||
| """Deep copy a graph, including deep copy of all nodes, edges and | |||||
| attributes of the graph, nodes and edges. | |||||
| Note | |||||
| ---- | |||||
| - It is the same as the NetworkX function graph.copy(), as far as I know. | |||||
| - This function only supports Networkx.Graph and Networkx.DiGraph. | |||||
| """ | |||||
| # add graph attributes. | |||||
| labels = {} | |||||
| for k, v in G.graph.items(): | |||||
| labels[k] = deepcopy(v) | |||||
| if G.is_directed(): | |||||
| G_copy = nx.DiGraph(**labels) | |||||
| else: | |||||
| G_copy = nx.Graph(**labels) | |||||
| # add nodes | |||||
| for nd, attrs in G.nodes(data=True): | |||||
| labels = {} | |||||
| for k, v in attrs.items(): | |||||
| labels[k] = deepcopy(v) | |||||
| G_copy.add_node(nd, **labels) | |||||
| # add edges. | |||||
| for nd1, nd2, attrs in G.edges(data=True): | |||||
| labels = {} | |||||
| for k, v in attrs.items(): | |||||
| labels[k] = deepcopy(v) | |||||
| G_copy.add_edge(nd1, nd2, **labels) | |||||
| return G_copy | |||||
| def graph_isIdentical(G1, G2): | |||||
| """Check if two graphs are identical, including: same nodes, edges, node | |||||
| labels/attributes, edge labels/attributes. | |||||
| Notes | |||||
| ----- | |||||
| 1. The type of graphs has to be the same. | |||||
| 2. Global/Graph attributes are neglected as they may contain names for graphs. | |||||
| """ | |||||
| # check nodes. | |||||
| nlist1 = [n for n in G1.nodes(data=True)] | |||||
| nlist2 = [n for n in G2.nodes(data=True)] | |||||
| if not nlist1 == nlist2: | |||||
| return False | |||||
| # check edges. | |||||
| elist1 = [n for n in G1.edges(data=True)] | |||||
| elist2 = [n for n in G2.edges(data=True)] | |||||
| if not elist1 == elist2: | |||||
| return False | |||||
| # check graph attributes. | |||||
| return True | |||||
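A short sketch of how the two relocated helpers interact (the attribute names are only illustrative):

    import networkx as nx

    G = nx.Graph()
    G.add_node(0, label='C')
    G.add_node(1, label='O')
    G.add_edge(0, 1, bond='single')

    H = graph_deepcopy(G)
    print(graph_isIdentical(G, H))  # True: same nodes, edges and attributes
    H.nodes[1]['label'] = 'N'
    print(graph_isIdentical(G, H))  # False: a node attribute now differs
    print(G.nodes[1]['label'])      # 'O' - the deep copy left G untouched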
| def get_node_labels(Gn, node_label): | |||||
| """Get node labels of dataset Gn. | |||||
| """ | |||||
| nl = set() | |||||
| for G in Gn: | |||||
| nl = nl | set(nx.get_node_attributes(G, node_label).values()) | |||||
| return nl | |||||
| def get_edge_labels(Gn, edge_label): | |||||
| """Get edge labels of dataset Gn. | |||||
| """ | |||||
| el = set() | |||||
| for G in Gn: | |||||
| el = el | set(nx.get_edge_attributes(G, edge_label).values()) | |||||
| return el | |||||
| def get_mlti_dim_node_attrs(G, attr_names): | |||||
| attributes = [] | |||||
| for nd, attrs in G.nodes(data=True): | |||||
| attributes.append(tuple(attrs[aname] for aname in attr_names)) | |||||
| return attributes | |||||
| def get_mlti_dim_edge_attrs(G, attr_names): | |||||
| attributes = [] | |||||
| for ed, attrs in G.edges(data=True): | |||||
| attributes.append(tuple(attrs[aname] for aname in attr_names)) | |||||
| return attributes | |||||
| def nx_permute_nodes(G, random_state=None): | |||||
| """Permute node indices in a NetworkX graph. | |||||
| Parameters | Parameters | ||||
| ---------- | ---------- | ||||
| g1, g2 : NetworkX graph | |||||
| The kernels bewteen pairs of vertices in these two graphs are computed. | |||||
| node_kernels : dict | |||||
| A dictionary of kernel functions for nodes, including 3 items: 'symb' | |||||
| for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' | |||||
| for both labels. The first 2 functions take two node labels as | |||||
| parameters, and the 'mix' function takes 4 parameters, a symbolic and a | |||||
| non-symbolic label for each the two nodes. Each label is in form of 2-D | |||||
| dimension array (n_samples, n_features). Each function returns a number | |||||
| as the kernel value. Ignored when nodes are unlabeled. This argument | |||||
| is designated to conjugate gradient method and fixed-point iterations. | |||||
| node_labels : list, optional | |||||
| The list of the name strings of the node labels. The default is []. | |||||
| node_attrs : list, optional | |||||
| The list of the name strings of the node attributes. The default is []. | |||||
| G : NetworkX graph | |||||
| The graph whose node insertion order is to be permuted. | |||||
| random_state : int, RandomState instance or None, optional | |||||
| Seed or random state used for the permutation. The default is None. | |||||
| Returns | Returns | ||||
| ------- | ------- | ||||
| vk_dict : dict | |||||
| Vertex kernels keyed by vertices. | |||||
| G_new : NetworkX graph | |||||
| A copy of G with the same nodes, edges and attributes, added in a randomly permuted node order. | |||||
| Notes | Notes | ||||
| ----- | ----- | ||||
| This function is used by ``gklearn.kernels.FixedPoint'' and | |||||
| ``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1]. | |||||
| References | |||||
| ---------- | |||||
| .. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang. | |||||
| Parallelization of shortest path graph kernels on multi-core cpus and gpus. | |||||
| Proceedings of the Programmability Issues for Heterogeneous Multicores | |||||
| (MultiProg), Vienna, Austria, 2014. | |||||
| - This function only supports Networkx.Graph and Networkx.DiGraph. | |||||
| """ | """ | ||||
| vk_dict = {} # shortest path matrices dict | |||||
| if len(node_labels) > 0: | |||||
| # node symb and non-synb labeled | |||||
| if len(node_attrs) > 0: | |||||
| kn = node_kernels['mix'] | |||||
| for n1 in g1.nodes(data=True): | |||||
| for n2 in g2.nodes(data=True): | |||||
| n1_labels = [n1[1][nl] for nl in node_labels] | |||||
| n2_labels = [n2[1][nl] for nl in node_labels] | |||||
| n1_attrs = [n1[1][na] for na in node_attrs] | |||||
| n2_attrs = [n2[1][na] for na in node_attrs] | |||||
| vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) | |||||
| # node symb labeled | |||||
| else: | |||||
| kn = node_kernels['symb'] | |||||
| for n1 in g1.nodes(data=True): | |||||
| for n2 in g2.nodes(data=True): | |||||
| n1_labels = [n1[1][nl] for nl in node_labels] | |||||
| n2_labels = [n2[1][nl] for nl in node_labels] | |||||
| vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) | |||||
| # @todo: relabel node with integers? (in case something went wrong...) | |||||
| # Add graph attributes. | |||||
| labels = {} | |||||
| for k, v in G.graph.items(): | |||||
| labels[k] = deepcopy(v) | |||||
| if G.is_directed(): | |||||
| G_new = nx.DiGraph(**labels) | |||||
| else: | else: | ||||
| # node non-synb labeled | |||||
| if len(node_attrs) > 0: | |||||
| kn = node_kernels['nsymb'] | |||||
| for n1 in g1.nodes(data=True): | |||||
| for n2 in g2.nodes(data=True): | |||||
| n1_attrs = [n1[1][na] for na in node_attrs] | |||||
| n2_attrs = [n2[1][na] for na in node_attrs] | |||||
| vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) | |||||
| # node unlabeled | |||||
| else: | |||||
| pass # @todo: add edge weights. | |||||
| # for e1 in g1.edges(data=True): | |||||
| # for e2 in g2.edges(data=True): | |||||
| # if e1[2]['cost'] == e2[2]['cost']: | |||||
| # kernel += 1 | |||||
| # return kernel | |||||
| G_new = nx.Graph(**labels) | |||||
| return vk_dict | |||||
| # Create a random mapping old node indices <-> new indices. | |||||
| nb_nodes = nx.number_of_nodes(G) | |||||
| indices_orig = range(nb_nodes) | |||||
| idx_mapping = np.random.RandomState(seed=random_state).permutation(indices_orig) | |||||
| # Add nodes. | |||||
| nodes_orig = list(G.nodes) | |||||
| for i_orig in range(nb_nodes): | |||||
| i_new = idx_mapping[i_orig] | |||||
| labels = {} | |||||
| for k, v in G.nodes[nodes_orig[i_new]].items(): | |||||
| labels[k] = deepcopy(v) | |||||
| G_new.add_node(nodes_orig[i_new], **labels) | |||||
| # Add edges. | |||||
| for nd1, nd2, attrs in G.edges(data=True): | |||||
| labels = {} | |||||
| for k, v in attrs.items(): | |||||
| labels[k] = deepcopy(v) | |||||
| G_new.add_edge(nd1, nd2, **labels) | |||||
| # # create a random mapping old label -> new label | |||||
| # node_mapping = dict(zip(G.nodes(), np.random.RandomState(seed=random_state).permutation(G.nodes()))) | |||||
| # # build a new graph | |||||
| # G_new = nx.relabel_nodes(G, node_mapping) | |||||
| return G_new | |||||
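A quick illustration of nx_permute_nodes (a sketch; the exact ordering depends on the seed):

    import networkx as nx

    G = nx.path_graph(4)
    G_perm = nx_permute_nodes(G, random_state=0)

    print(list(G.nodes))       # [0, 1, 2, 3]
    print(list(G_perm.nodes))  # the same labels in a permuted insertion order
    # the edge set (ignoring reported orientation) is unchanged:
    print({frozenset(e) for e in G_perm.edges} == {frozenset(e) for e in G.edges})  # True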
| #%% | |||||
| def dummy_node(): | def dummy_node(): | ||||
| @@ -2,7 +2,7 @@ numpy>=1.16.2 | |||||
| scipy>=1.1.0 | scipy>=1.1.0 | ||||
| matplotlib>=3.1.0 | matplotlib>=3.1.0 | ||||
| networkx>=2.2 | networkx>=2.2 | ||||
| scikit-learn>=0.20.0 | |||||
| scikit-learn>=1.1.0 | |||||
| tabulate>=0.8.2 | tabulate>=0.8.2 | ||||
| tqdm>=4.26.0 | tqdm>=4.26.0 | ||||
| control>=0.8.2 # for generalized random walk kernels only. | control>=0.8.2 # for generalized random walk kernels only. | ||||
| @@ -1,8 +1,8 @@ | |||||
| numpy>=1.16.2 | numpy>=1.16.2 | ||||
| scipy>=1.1.0 | scipy>=1.1.0 | ||||
| matplotlib>=3.0.0 | |||||
| matplotlib>=3.1.0 | |||||
| networkx>=2.2 | networkx>=2.2 | ||||
| scikit-learn>=0.20.0 | |||||
| scikit-learn>=1.1.0 | |||||
| tabulate>=0.8.2 | tabulate>=0.8.2 | ||||
| tqdm>=4.26.0 | tqdm>=4.26.0 | ||||
| control>=0.8.2 # for generalized random walk kernels only. | control>=0.8.2 # for generalized random walk kernels only. | ||||