@@ -1,147 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Nov  2 16:17:01 2020
-
-@author: ljia
-"""
-# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution from a given number of repeats is computed.
-
-import os
-import multiprocessing
-import pickle
-import logging
-from gklearn.ged.util import compute_geds
-import time
-from utils import get_dataset
-import sys
-from group_results import group_trials
-
-
-def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial):
-
-    save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
-
-    # Return if the file exists.
-    if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
-        return None, None
-
-    """**2. Set parameters.**"""
-
-    # Parameters for GED computation.
-    ged_options = {'method': 'BIPARTITE',  # use the BIPARTITE heuristic.
-                   # 'initialization_method': 'RANDOM',  # or 'NODE', etc. (for GEDEnv)
-                   'lsape_model': 'ECBP',  #
-                   # ?? when bigger than 1, the method is considered mIPFP.
-                   # The actual number of computed solutions might be smaller than the specified value.
-                   'max_num_solutions': max_num_solutions,
-                   'edit_cost': 'CONSTANT',  # use CONSTANT cost.
-                   'greedy_method': 'BASIC',  #
-                   # The distance between non-symbolic node/edge labels is computed by euclidean distance.
-                   'attr_distance': 'euclidean',
-                   'optimal': True,  # if TRUE, the option --greedy-method has no effect.
-                   # Number of parallel threads. Has no effect if mpg_options['parallel'] = False.
-                   'threads': multiprocessing.cpu_count(),
-                   'centrality_method': 'NONE',
-                   'centrality_weight': 0.7,
-                   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
-                   }
-
-    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
-    # edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
-    # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))
-
-    options = ged_options.copy()
-    options['edit_cost_constants'] = edit_cost_constants
-    options['node_labels'] = dataset.node_labels
-    options['edge_labels'] = dataset.edge_labels
-    options['node_attrs'] = dataset.node_attrs
-    options['edge_attrs'] = dataset.edge_attrs
-    parallel = True  # if num_solutions == 1 else False
-
-    """**5. Compute GED matrix.**"""
-    ged_mat = 'error'
-    runtime = 0
-    try:
-        time0 = time.time()
-        ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True)
-        runtime = time.time() - time0
-    except Exception as exp:
-        print('An exception occurred when running this experiment:')
-        LOG_FILENAME = save_dir + 'error.txt'
-        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
-        logging.exception(save_file_suffix)
-        print(repr(exp))
-
-    """**6. Get results.**"""
-    with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
-        pickle.dump(ged_mat, f)
-    with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
-        pickle.dump(runtime, f)
-
-    return ged_mat, runtime
-
-
-def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio):
-    # Return if the group file exists.
-    name_middle = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.'
-    name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy'
-    if os.path.isfile(name_group):
-        return
-
-    ged_mats = []
-    runtimes = []
-    for trial in range(1, 101):
-        print()
-        print('Trial:', trial)
-        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial)
-        ged_mats.append(ged_mat)
-        runtimes.append(runtime)
-
-    # Group trials and remove single files.
-    name_prefix = 'ged_matrix' + name_middle
-    group_trials(save_dir, name_prefix, True, True, False)
-    name_prefix = 'runtime' + name_middle
-    group_trials(save_dir, name_prefix, True, True, False)
-
-
-def results_for_a_dataset(ds_name):
-    """**1. Get dataset.**"""
-    dataset = get_dataset(ds_name)
-
-    for max_num_solutions in mnum_solutions_list:
-        print()
-        print('Max # of solutions:', max_num_solutions)
-        for ratio in ratio_list:
-            print()
-            print('Ratio:', ratio)
-            save_trials_as_group(dataset, ds_name, max_num_solutions, ratio)
-
-
-def get_param_lists(ds_name):
-    if ds_name == 'AIDS_symb':
-        mnum_solutions_list = [1, 20, 40, 60, 80, 100]
-        ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
-    else:
-        mnum_solutions_list = [1, 20, 40, 60, 80, 100]
-        ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
-
-    return mnum_solutions_list, ratio_list
-
-
-if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        ds_name_list = sys.argv[1:]
-    else:
-        ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
-
-    save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/'
-    os.makedirs(save_dir, exist_ok=True)
-    os.makedirs(save_dir + 'groups/', exist_ok=True)
-
-    for ds_name in ds_name_list:
-        print()
-        print('Dataset:', ds_name)
-        mnum_solutions_list, ratio_list = get_param_lists(ds_name)
-        results_for_a_dataset(ds_name)
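
For reference, the ratio in the deleted script above scales only the three node edit costs while the three edge costs stay at 1; a minimal illustration (the ratio value is assumed):

    ratio = 0.5
    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
    print(edit_cost_constants)  # [0.5, 0.5, 0.5, 1, 1, 1]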
@@ -13,7 +13,7 @@ import pickle
 import logging
 from gklearn.ged.util import compute_geds
 import time
-from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation
+from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids
 import sys
 from group_results import group_trials, check_group_existence, update_group_marker
@@ -125,9 +125,10 @@ def get_param_lists(ds_name, mode='test'):
     elif mode == 'simple':
         from sklearn.model_selection import ParameterGrid
-        param_grid = ParameterGrid([
-            {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]},
-            {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}])
+        param_grid = mix_param_grids([list(ParameterGrid([
+            {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])),
+            list(ParameterGrid([
+            {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))])
         # print(list(param_grid))

     if ds_name == 'AIDS_symb':
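
For reference, sklearn's ParameterGrid (as used in the param_grid construction above) expands each dict of value lists into the list of all combinations; a tiny illustration:

    from sklearn.model_selection import ParameterGrid

    print(list(ParameterGrid([{'num_solutions': [1, 2], 'ratio': [10]}])))
    # [{'num_solutions': 1, 'ratio': 10}, {'num_solutions': 2, 'ratio': 10}]

mix_param_grids (added later in this diff) then interleaves the two expanded lists so the num_solutions sweep and the ratio sweep are explored in parallel.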
@@ -148,7 +149,7 @@ if __name__ == '__main__':
     # ds_name_list = ['MUTAG'] # 'Alkane_unlabeled']
     # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

-    save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/'
+    save_dir = 'outputs/CRIANN/edit_costs.real_data.num_sols.ratios.IPFP/'
     os.makedirs(save_dir, exist_ok=True)
     os.makedirs(save_dir + 'groups/', exist_ok=True)
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Nov  2 16:17:01 2020
+
+@author: ljia
+"""
+# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution from a given number of repeats is computed.
+
+import os
+import multiprocessing
+import pickle
+import logging
+from gklearn.ged.util import compute_geds
+import time
+from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids
+import sys
+from group_results import group_trials, check_group_existence, update_group_marker
+
+
+def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
+
+    save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
+
+    # Return if the file exists.
+    if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
+        return None, None
+
+    """**2. Set parameters.**"""
+
+    # Parameters for GED computation.
+    ged_options = {'method': 'BIPARTITE',  # use the BIPARTITE heuristic.
+                   # 'initialization_method': 'RANDOM',  # or 'NODE', etc. (for GEDEnv)
+                   'lsape_model': 'ECBP',  #
+                   # ?? when bigger than 1, the method is considered mIPFP.
+                   # The actual number of computed solutions might be smaller than the specified value.
+                   'max_num_solutions': 1,  # @ max_num_solutions,
+                   'edit_cost': 'CONSTANT',  # use CONSTANT cost.
+                   'greedy_method': 'BASIC',  #
+                   # The distance between non-symbolic node/edge labels is computed by euclidean distance.
+                   'attr_distance': 'euclidean',
+                   'optimal': True,  # if TRUE, the option --greedy-method has no effect.
+                   # Number of parallel threads. Has no effect if mpg_options['parallel'] = False.
+                   'threads': multiprocessing.cpu_count(),
+                   'centrality_method': 'NONE',
+                   'centrality_weight': 0.7,
+                   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
+                   }
+
+    edit_cost_constants = set_edit_cost_consts(ratio,
+                                               node_labeled=len(dataset.node_labels),
+                                               edge_labeled=len(dataset.edge_labels),
+                                               mode='uniform')
+    # edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
+    # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))
+
+    options = ged_options.copy()
+    options['edit_cost_constants'] = edit_cost_constants
+    options['node_labels'] = dataset.node_labels
+    options['edge_labels'] = dataset.edge_labels
+    options['node_attrs'] = dataset.node_attrs
+    options['edge_attrs'] = dataset.edge_attrs
+    parallel = True  # if num_solutions == 1 else False
+
+    """**5. Compute GED matrix.**"""
+    ged_mat = 'error'
+    runtime = 0
+    try:
+        time0 = time.time()
+        ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs,
+                                                                options=options,
+                                                                repeats=num_solutions,
+                                                                permute_nodes=True,
+                                                                random_state=None,
+                                                                parallel=parallel,
+                                                                verbose=True)
+        runtime = time.time() - time0
+    except Exception as exp:
+        print('An exception occurred when running this experiment:')
+        LOG_FILENAME = save_dir + 'error.txt'
+        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
+        logging.exception(save_file_suffix)
+        print(repr(exp))
+
+    """**6. Get results.**"""
+    with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
+        pickle.dump(ged_mat, f)
+    with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
+        pickle.dump(runtime, f)
+
+    return ged_mat, runtime
+
+
+def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
+    # Return if the group file exists.
+    name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.'
+    name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy'
+    if check_group_existence(name_group):
+        return
+
+    ged_mats = []
+    runtimes = []
+    num_trials = 100
+    for trial in range(1, num_trials + 1):
+        print()
+        print('Trial:', trial)
+        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial)
+        ged_mats.append(ged_mat)
+        runtimes.append(runtime)
+
+    # Group trials and remove single files.
+    # @todo: if the program stops between the following lines, then there may be errors.
+    name_prefix = 'ged_matrix' + name_middle
+    group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
+    name_prefix = 'runtime' + name_middle
+    group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
+    update_group_marker(name_group)
+
+
+def results_for_a_dataset(ds_name):
+    """**1. Get dataset.**"""
+    dataset = get_dataset(ds_name)
+
+    for params in list(param_grid):
+        print()
+        print(params)
+        save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio'])
+
+
+def get_param_lists(ds_name, mode='test'):
+    if mode == 'test':
+        num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
+        ratio_list = [10]
+        return num_solutions_list, ratio_list
+    elif mode == 'simple':
+        from sklearn.model_selection import ParameterGrid
+        param_grid = mix_param_grids([list(ParameterGrid([
+            {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])),
+            list(ParameterGrid([
+            {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))])
+        # print(list(param_grid))
+
+    if ds_name == 'AIDS_symb':
+        num_solutions_list = [1, 20, 40, 60, 80, 100]
+        ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
+    else:
+        num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]  # [1, 20, 40, 60, 80, 100]
+        ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1]
+
+    return param_grid
+
+
+if __name__ == '__main__':
+    if len(sys.argv) > 1:
+        ds_name_list = sys.argv[1:]
+    else:
+        ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
+        # ds_name_list = ['MUTAG'] # 'Alkane_unlabeled']
+        # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
+
+    save_dir = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/'
+    os.makedirs(save_dir, exist_ok=True)
+    os.makedirs(save_dir + 'groups/', exist_ok=True)
+
+    for ds_name in ds_name_list:
+        print()
+        print('Dataset:', ds_name)
+        param_grid = get_param_lists(ds_name, mode='simple')
+        results_for_a_dataset(ds_name)
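
Each trial is pickled individually before being grouped; a minimal sketch of inspecting one saved trial (the file name is an assumed example following the save_file_suffix scheme above):

    import pickle

    fn = ('outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/'
          'ged_matrix.MUTAG.num_sols_10.ratio_10.00.trial_1.pkl')
    with open(fn, 'rb') as f:
        ged_mat = pickle.load(f)
    # ged_mat is an n-by-n matrix of pairwise GEDs, or the string 'error'
    # if the trial raised an exception (see the except branch above).
    print(ged_mat)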
@@ -32,6 +32,7 @@ def check_group_existence(file_name):

 def update_group_marker(file_name):
+    # @todo: possible error when several tasks are using this file at the same time.
     path, name = os.path.split(file_name)
     marker_fn = os.path.join(path, 'group_names_finished.pkl')
     if os.path.isfile(marker_fn):
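
One possible way to address the @todo above is to hold an exclusive advisory lock around the read-modify-write of the marker file; a POSIX-only sketch, assuming the marker pickle stores a collection of finished group names (the function name is hypothetical):

    import fcntl
    import os
    import pickle

    def update_group_marker_locked(file_name):  # hypothetical variant
        path, name = os.path.split(file_name)
        marker_fn = os.path.join(path, 'group_names_finished.pkl')
        with open(marker_fn + '.lock', 'w') as lock_f:
            fcntl.flock(lock_f, fcntl.LOCK_EX)  # block until no other task holds the lock
            names = set()
            if os.path.isfile(marker_fn):
                with open(marker_fn, 'rb') as f:
                    names = set(pickle.load(f))
            names.add(name)
            with open(marker_fn, 'wb') as f:
                pickle.dump(names, f)
        # the lock is released when lock_f is closed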
@@ -9,36 +9,45 @@ import os
 import re

+cur_path = os.path.dirname(os.path.abspath(__file__))
+

 def get_job_script(arg):
     script = r"""
 #!/bin/bash
 #SBATCH --exclusive
 #SBATCH --job-name="st.""" + arg + r""".bp"
-#SBATCH --partition=tlong
+#SBATCH --partition=court
 #SBATCH --mail-type=ALL
 #SBATCH --mail-user=jajupmochi@gmail.com
-#SBATCH --output="outputs/output_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt"
-#SBATCH --error="errors/error_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt"
+#SBATCH --output="outputs/output_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt"
+#SBATCH --error="errors/error_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt"
 #
 #SBATCH --ntasks=1
 #SBATCH --nodes=1
 #SBATCH --cpus-per-task=1
-#SBATCH --time=300:00:00
+#SBATCH --time=48:00:00
 #SBATCH --mem-per-cpu=4000

 srun hostname
-srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability
-srun python3 edit_costs.max_nums_sols.ratios.bipartite.py """ + arg
+cd """ + cur_path + r"""
+echo Working directory : $PWD
+echo Local work dir : $LOCAL_WORK_DIR
+python3 edit_costs.real_data.nums_sols.ratios.bipartite.py """ + arg
     script = script.strip()
     script = re.sub('\n\t+', '\n', script)
     script = re.sub('\n +', '\n', script)

     return script


 if __name__ == '__main__':
-    ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
-    for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]:
+    os.makedirs('outputs/', exist_ok=True)
+    os.makedirs('errors/', exist_ok=True)
+    ds_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
+    for ds_name in [ds_list[i] for i in [0, 1, 2, 3, 4]]:
         job_script = get_job_script(ds_name)
         command = 'sbatch <<EOF\n' + job_script + '\nEOF'
         # print(command)
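
The two re.sub calls above strip any leading tabs or spaces so that every line of the generated script reaches sbatch flush-left; a quick illustration with an assumed snippet:

    import re

    script = '\n\t\t#SBATCH --ntasks=1\n\t\tsrun hostname\n    echo done'
    script = script.strip()
    script = re.sub('\n\t+', '\n', script)  # drop leading tabs on each line
    script = re.sub('\n +', '\n', script)   # drop leading spaces on each line
    print(script)
    # #SBATCH --ntasks=1
    # srun hostname
    # echo done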
@@ -325,6 +325,22 @@ def dichotomous_permutation(arr, layer=0):

 # 	return new_arr


+def mix_param_grids(list_of_grids):
+    mixed_grids = []
+    not_finished = [True] * len(list_of_grids)
+    idx = 0
+    while sum(not_finished) > 0:
+        for g_idx, grid in enumerate(list_of_grids):
+            if idx < len(grid):
+                mixed_grids.append(grid[idx])
+            else:
+                not_finished[g_idx] = False
+        idx += 1
+
+    return mixed_grids
+
+
 if __name__ == '__main__':
     root_dir = 'outputs/CRIANN/'
     # for dir_ in sorted(os.listdir(root_dir)):
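
Grounded in the implementation above: mix_param_grids interleaves the input grids round-robin by index, so the early entries of every grid are explored first. For example:

    grids = [[{'num_solutions': 1}, {'num_solutions': 2}, {'num_solutions': 3}],
             [{'ratio': 0.1}]]
    print(mix_param_grids(grids))
    # [{'num_solutions': 1}, {'ratio': 0.1}, {'num_solutions': 2}, {'num_solutions': 3}]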
@@ -337,4 +353,4 @@ if __name__ == '__main__':
             # get_relative_errors(save_dir)
         # except Exception as exp:
         # 	print('An exception occurred when running this experiment:')
-        # 	print(repr(exp))
+        # 	print(repr(exp))
@@ -64,10 +64,12 @@ def pairwise_ged(g1, g2, options={}, sort=True, repeats=1, parallel=False, verbose=True):
     g = listID[0]
     h = listID[1]
     dis_min = np.inf
+    # print('------------------------------------------')
     for i in range(0, repeats):
         ged_env.run_method(g, h)
         upper = ged_env.get_upper_bound(g, h)
         dis = upper
+        # print(dis)
         if dis < dis_min:
             dis_min = dis
             pi_forward = ged_env.get_forward_map(g, h)
@@ -169,12 +171,100 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True):
     return ged_vec, ged_mat, n_edit_operations


-def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, n_jobs=None, verbose=True):
+#%%
+
+
+def compute_geds(graphs,
+                 options={},
+                 sort=True,
+                 repeats=1,
+                 permute_nodes=False,
+                 random_state=None,
+                 parallel=False,
+                 n_jobs=None,
+                 verbose=True):
+    """Compute graph edit distance matrix using GEDLIB.
+    """
+    if permute_nodes:
+        return _compute_geds_with_permutation(graphs,
+                                              options=options,
+                                              sort=sort,
+                                              repeats=repeats,
+                                              random_state=random_state,
+                                              parallel=parallel,
+                                              n_jobs=n_jobs,
+                                              verbose=verbose)
+    else:
+        return _compute_geds_without_permutation(graphs,
+                                                 options=options,
+                                                 sort=sort,
+                                                 repeats=repeats,
+                                                 parallel=parallel,
+                                                 n_jobs=n_jobs,
+                                                 verbose=verbose)
+
+
+#%%
+
+
+def _compute_geds_with_permutation(graphs,
+                                   options={},
+                                   sort=True,
+                                   repeats=1,
+                                   random_state=None,
+                                   parallel=False,
+                                   n_jobs=None,
+                                   verbose=True):
+    from gklearn.utils.utils import nx_permute_nodes
+
+    # Initialize variables.
+    ged_mat_optim = np.full((len(graphs), len(graphs)), np.inf)
+    np.fill_diagonal(ged_mat_optim, 0)
+    len_itr = int(len(graphs) * (len(graphs) - 1) / 2)
+    ged_vec = [0] * len_itr
+    n_edit_operations = [0] * len_itr
+
+    # For each repeat:
+    for i in range(0, repeats):
+        # Permute nodes.
+        graphs_pmut = [nx_permute_nodes(g, random_state=random_state) for g in graphs]
+
+        out = _compute_geds_without_permutation(graphs_pmut,
+                                                options=options,
+                                                sort=sort,
+                                                repeats=1,
+                                                parallel=parallel,
+                                                n_jobs=n_jobs,
+                                                verbose=verbose)
+
+        # Compare current results with the best ones.
+        idx_cnt = 0
+        for i in range(len(graphs)):
+            for j in range(i + 1, len(graphs)):
+                if out[1][i, j] < ged_mat_optim[i, j]:
+                    ged_mat_optim[i, j] = out[1][i, j]
+                    ged_mat_optim[j, i] = out[1][j, i]
+                    ged_vec[idx_cnt] = out[0][idx_cnt]
+                    n_edit_operations[idx_cnt] = out[2][idx_cnt]
+                idx_cnt += 1
+
+    return ged_vec, ged_mat_optim, n_edit_operations
+
+
+def _compute_geds_without_permutation(graphs,
+                                      options={},
+                                      sort=True,
+                                      repeats=1,
+                                      parallel=False,
+                                      n_jobs=None,
+                                      verbose=True):
     from gklearn.gedlib import librariesImport, gedlibpy

     # initialize ged env.
     ged_env = gedlibpy.GEDEnv()
     ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants'])
     for g in graphs:
         ged_env.add_nx_graph(g, '')
     listID = ged_env.get_all_graph_ids()
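
A usage sketch of the refactored signature (graphs and options are assumed to be prepared as in the experiment scripts above). Note that a fixed random_state makes nx_permute_nodes draw the same permutation in every repeat, which is why the experiment script passes random_state=None:

    ged_vec, ged_mat, n_ops = compute_geds(graphs,
                                           options=options,
                                           repeats=5,
                                           permute_nodes=True,
                                           random_state=None,
                                           parallel=False,
                                           verbose=False)
    # With permute_nodes=True, each repeat permutes the node order and the
    # element-wise minimum over repeats is kept in ged_mat.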
@@ -266,6 +356,11 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats):
         dis = upper

         # make the map label correct (label remove map as np.inf)
+        # Attention: using node indices instead of NetworkX node labels (as
+        # implemented here) may cause several issues:
+        # - It fails if NetworkX node labels are not consecutive integers;
+        # - It returns wrong mappings if nodes are permuted (e.g., by using
+        #   `gklearn.utils.utils.nx_permute_nodes()`).
         nodes1 = [n for n in g1.nodes()]
         nodes2 = [n for n in g2.nodes()]
         nb1 = nx.number_of_nodes(g1)
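
A small illustration of the caveat above (toy graph assumed): NetworkX node labels need not be consecutive integers, so a node's position in the node list and its label can diverge:

    import networkx as nx

    g = nx.Graph()
    g.add_nodes_from(['u', 'v'])    # labels are strings, not 0..n-1
    nodes = [n for n in g.nodes()]  # position -> label: ['u', 'v']
    assert nodes[0] == 'u'          # index 0 holds label 'u', not a node named 0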
@@ -278,46 +373,57 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats):
             pi_forward_min = pi_forward
             pi_backward_min = pi_backward
+            # print('-----')
+            # print(pi_forward_min)
+            # print(pi_backward_min)

     return dis_min, pi_forward_min, pi_backward_min


-def label_costs_to_matrix(costs, nb_labels):
-    """Reform a label cost vector to a matrix.
-
-    Parameters
-    ----------
-    costs : numpy.array
-        The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs.
-    nb_labels : integer
-        Number of labels.
-
-    Returns
-    -------
-    cost_matrix : numpy.array.
-        The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData.
-    """
-    # Initialize label cost matrix.
-    cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1))
-    i = 0
-    # Costs of insertions.
-    for col in range(1, nb_labels + 1):
-        cost_matrix[0, col] = costs[i]
-        i += 1
-    # Costs of deletions.
-    for row in range(1, nb_labels + 1):
-        cost_matrix[row, 0] = costs[i]
-        i += 1
-    # Costs of substitutions.
-    for row in range(1, nb_labels + 1):
-        for col in range(row + 1, nb_labels + 1):
-            cost_matrix[row, col] = costs[i]
-            cost_matrix[col, row] = costs[i]
-            i += 1
-    return cost_matrix
-
-
-def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
+#%%
+
+
+def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
+    """Calculate the number of occurrences of each edit operation in a given
+    edit path.
+
+    Parameters
+    ----------
+    g1 : TYPE
+        DESCRIPTION.
+    g2 : TYPE
+        DESCRIPTION.
+    forward_map : TYPE
+        DESCRIPTION.
+    backward_map : TYPE
+        DESCRIPTION.
+    edit_cost : TYPE, optional
+        DESCRIPTION. The default is None.
+    is_cml : TYPE, optional
+        DESCRIPTION. The default is False.
+    **kwargs : TYPE
+        DESCRIPTION.
+
+    Raises
+    ------
+    Exception
+        DESCRIPTION.
+
+    Returns
+    -------
+    TYPE
+        DESCRIPTION.
+
+    Notes
+    -----
+    Attention: when implementing a function to get the numbers of edit
+    operations, make sure that:
+    - It does not fail if NetworkX node labels are not consecutive integers;
+    - It returns correct results if nodes are permuted (e.g., by using
+      `gklearn.utils.utils.nx_permute_nodes()`).
+    Generally speaking, this means you need to distinguish the NetworkX label
+    of a node from the position (index) of that node in the node list.
+    """
     if is_cml:
         if edit_cost == 'CONSTANT':
             node_labels = kwargs.get('node_labels', [])
@@ -611,6 +717,48 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
     return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es


+#%%
+
+
+def label_costs_to_matrix(costs, nb_labels):
+    """Reform a label cost vector to a matrix.
+
+    Parameters
+    ----------
+    costs : numpy.array
+        The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs.
+    nb_labels : integer
+        Number of labels.
+
+    Returns
+    -------
+    cost_matrix : numpy.array.
+        The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData.
+    """
+    # Initialize label cost matrix.
+    cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1))
+    i = 0
+    # Costs of insertions.
+    for col in range(1, nb_labels + 1):
+        cost_matrix[0, col] = costs[i]
+        i += 1
+    # Costs of deletions.
+    for row in range(1, nb_labels + 1):
+        cost_matrix[row, 0] = costs[i]
+        i += 1
+    # Costs of substitutions.
+    for row in range(1, nb_labels + 1):
+        for col in range(row + 1, nb_labels + 1):
+            cost_matrix[row, col] = costs[i]
+            cost_matrix[col, row] = costs[i]
+            i += 1
+
+    return cost_matrix
+
+
+#%%
+
+
 def ged_options_to_string(options):
     opt_str = ' '
     for key, val in options.items():
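
A worked example of label_costs_to_matrix for nb_labels = 2, following the implementation above (row/column 0 is the dummy label):

    import numpy as np

    # costs = [ins_1, ins_2, del_1, del_2, sub_12]
    costs = np.array([1., 2., 3., 4., 5.])
    # label_costs_to_matrix(costs, 2) returns:
    # array([[0., 1., 2.],
    #        [3., 0., 5.],
    #        [4., 5., 0.]])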
@@ -7,6 +7,9 @@ from enum import Enum, unique
 # from tqdm import tqdm


+#%%
+
+
 def getSPLengths(G1):
     sp = nx.shortest_path(G1)
     distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
@@ -286,81 +289,146 @@ def direct_product_graph(G1, G2, node_labels, edge_labels):
     return gt


-def graph_deepcopy(G):
-    """Deep copy a graph, including deep copy of all nodes, edges and
-    attributes of the graph, nodes and edges.
-
-    Note
-    ----
-    It is the same as the NetworkX function graph.copy(), as far as I know.
-    """
-    # add graph attributes.
-    labels = {}
-    for k, v in G.graph.items():
-        labels[k] = deepcopy(v)
-    if G.is_directed():
-        G_copy = nx.DiGraph(**labels)
-    else:
-        G_copy = nx.Graph(**labels)
-
-    # add nodes
-    for nd, attrs in G.nodes(data=True):
-        labels = {}
-        for k, v in attrs.items():
-            labels[k] = deepcopy(v)
-        G_copy.add_node(nd, **labels)
-
-    # add edges.
-    for nd1, nd2, attrs in G.edges(data=True):
-        labels = {}
-        for k, v in attrs.items():
-            labels[k] = deepcopy(v)
-        G_copy.add_edge(nd1, nd2, **labels)
-
-    return G_copy
-
-
-def graph_isIdentical(G1, G2):
-    """Check if two graphs are identical, including: same nodes, edges, node
-    labels/attributes, edge labels/attributes.
-
-    Notes
-    -----
-    1. The type of graphs has to be the same.
-    2. Global/Graph attributes are neglected as they may contain names for graphs.
-    """
-    # check nodes.
-    nlist1 = [n for n in G1.nodes(data=True)]
-    nlist2 = [n for n in G2.nodes(data=True)]
-    if not nlist1 == nlist2:
-        return False
-    # check edges.
-    elist1 = [n for n in G1.edges(data=True)]
-    elist2 = [n for n in G2.edges(data=True)]
-    if not elist1 == elist2:
-        return False
-    # check graph attributes.
-    return True
-
-
-def get_node_labels(Gn, node_label):
-    """Get node labels of dataset Gn.
-    """
-    nl = set()
-    for G in Gn:
-        nl = nl | set(nx.get_node_attributes(G, node_label).values())
-    return nl
-
-
-def get_edge_labels(Gn, edge_label):
-    """Get edge labels of dataset Gn.
-    """
-    el = set()
-    for G in Gn:
-        el = el | set(nx.get_edge_attributes(G, edge_label).values())
-    return el
+def find_paths(G, source_node, length):
+    """Find all paths of a certain length that start from a source node.
+    A recursive depth-first search is applied.
+
+    Parameters
+    ----------
+    G : NetworkX graphs
+        The graph in which paths are searched.
+    source_node : integer
+        The number of the node from which all paths start.
+    length : integer
+        The length of paths.
+
+    Return
+    ------
+    path : list of list
+        List of paths retrieved, where each path is represented by a list of nodes.
+    """
+    if length == 0:
+        return [[source_node]]
+    path = [[source_node] + path for neighbor in G[source_node] \
+            for path in find_paths(G, neighbor, length - 1) if source_node not in path]
+    return path
+
+
+def find_all_paths(G, length, is_directed):
+    """Find all paths of a certain length in a graph. A recursive depth-first
+    search is applied.
+
+    Parameters
+    ----------
+    G : NetworkX graphs
+        The graph in which paths are searched.
+    length : integer
+        The length of paths.
+
+    Return
+    ------
+    path : list of list
+        List of paths retrieved, where each path is represented by a list of nodes.
+    """
+    all_paths = []
+    for node in G:
+        all_paths.extend(find_paths(G, node, length))
+
+    if not is_directed:
+        # For each path, two representations are retrieved from its two
+        # extremities. Remove one of them.
+        all_paths_r = [path[::-1] for path in all_paths]
+        for idx, path in enumerate(all_paths[:-1]):
+            for path2 in all_paths_r[idx+1::]:
+                if path == path2:
+                    all_paths[idx] = []
+                    break
+        all_paths = list(filter(lambda a: a != [], all_paths))
+
+    return all_paths
+
+
+# @todo: use it in ShortestPath.
+def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]):
+    """Compute kernels between each pair of vertices in two graphs.
+
+    Parameters
+    ----------
+    g1, g2 : NetworkX graph
+        The kernels between pairs of vertices in these two graphs are computed.
+    node_kernels : dict
+        A dictionary of kernel functions for nodes, including 3 items: 'symb'
+        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
+        for both labels. The first 2 functions take two node labels as
+        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
+        non-symbolic label for each of the two nodes. Each label is in the form
+        of a 2-D array (n_samples, n_features). Each function returns a number
+        as the kernel value. Ignored when nodes are unlabeled. This argument
+        is designed for the conjugate gradient method and fixed-point iterations.
+    node_labels : list, optional
+        The list of the name strings of the node labels. The default is [].
+    node_attrs : list, optional
+        The list of the name strings of the node attributes. The default is [].
+
+    Returns
+    -------
+    vk_dict : dict
+        Vertex kernels keyed by vertices.
+
+    Notes
+    -----
+    This function is used by ``gklearn.kernels.FixedPoint'' and
+    ``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1].
+
+    References
+    ----------
+    .. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang.
+       Parallelization of shortest path graph kernels on multi-core cpus and gpus.
+       Proceedings of the Programmability Issues for Heterogeneous Multicores
+       (MultiProg), Vienna, Austria, 2014.
+    """
+    vk_dict = {}  # shortest path matrices dict
+    if len(node_labels) > 0:
+        # node symb and non-symb labeled
+        if len(node_attrs) > 0:
+            kn = node_kernels['mix']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    n1_labels = [n1[1][nl] for nl in node_labels]
+                    n2_labels = [n2[1][nl] for nl in node_labels]
+                    n1_attrs = [n1[1][na] for na in node_attrs]
+                    n2_attrs = [n2[1][na] for na in node_attrs]
+                    vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
+        # node symb labeled
+        else:
+            kn = node_kernels['symb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    n1_labels = [n1[1][nl] for nl in node_labels]
+                    n2_labels = [n2[1][nl] for nl in node_labels]
+                    vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
+    else:
+        # node non-symb labeled
+        if len(node_attrs) > 0:
+            kn = node_kernels['nsymb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    n1_attrs = [n1[1][na] for na in node_attrs]
+                    n2_attrs = [n2[1][na] for na in node_attrs]
+                    vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
+        # node unlabeled
+        else:
+            pass  # @todo: add edge weights.
+            # for e1 in g1.edges(data=True):
+            # 	for e2 in g2.edges(data=True):
+            # 		if e1[2]['cost'] == e2[2]['cost']:
+            # 			kernel += 1
+            # return kernel
+
+    return vk_dict
+
+
+#%%
+
+
 def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs):
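
A minimal sketch of calling compute_vertex_kernels with a toy delta kernel (the lambda below is an illustrative stand-in, not one of gklearn's built-in kernel functions):

    import networkx as nx

    g1 = nx.Graph()
    g1.add_node(0, atom='C')
    g2 = nx.Graph()
    g2.add_node(0, atom='C')
    g2.add_node(1, atom='O')
    node_kernels = {'symb': lambda l1, l2: float(l1 == l2),
                    'nsymb': None, 'mix': None}  # only 'symb' is used here
    vk = compute_vertex_kernels(g1, g2, node_kernels, node_labels=['atom'])
    # vk == {(0, 0): 1.0, (0, 1): 0.0}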
@@ -513,79 +581,6 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save=''):
     print('\ncomplete.')


-def find_paths(G, source_node, length):
-    """Find all paths of a certain length that start from a source node.
-    A recursive depth-first search is applied.
-
-    Parameters
-    ----------
-    G : NetworkX graphs
-        The graph in which paths are searched.
-    source_node : integer
-        The number of the node from which all paths start.
-    length : integer
-        The length of paths.
-
-    Return
-    ------
-    path : list of list
-        List of paths retrieved, where each path is represented by a list of nodes.
-    """
-    if length == 0:
-        return [[source_node]]
-    path = [[source_node] + path for neighbor in G[source_node] \
-            for path in find_paths(G, neighbor, length - 1) if source_node not in path]
-    return path
-
-
-def find_all_paths(G, length, is_directed):
-    """Find all paths of a certain length in a graph. A recursive depth-first
-    search is applied.
-
-    Parameters
-    ----------
-    G : NetworkX graphs
-        The graph in which paths are searched.
-    length : integer
-        The length of paths.
-
-    Return
-    ------
-    path : list of list
-        List of paths retrieved, where each path is represented by a list of nodes.
-    """
-    all_paths = []
-    for node in G:
-        all_paths.extend(find_paths(G, node, length))
-
-    if not is_directed:
-        # For each path, two representations are retrieved from its two
-        # extremities. Remove one of them.
-        all_paths_r = [path[::-1] for path in all_paths]
-        for idx, path in enumerate(all_paths[:-1]):
-            for path2 in all_paths_r[idx+1::]:
-                if path == path2:
-                    all_paths[idx] = []
-                    break
-        all_paths = list(filter(lambda a: a != [], all_paths))
-
-    return all_paths
-
-
-def get_mlti_dim_node_attrs(G, attr_names):
-    attributes = []
-    for nd, attrs in G.nodes(data=True):
-        attributes.append(tuple(attrs[aname] for aname in attr_names))
-    return attributes
-
-
-def get_mlti_dim_edge_attrs(G, attr_names):
-    attributes = []
-    for ed, attrs in G.edges(data=True):
-        attributes.append(tuple(attrs[aname] for aname in attr_names))
-    return attributes
-
-
 def normalize_gram_matrix(gram_matrix):
     diag = gram_matrix.diagonal().copy()
     old_settings = np.seterr(invalid='raise')  # Catch FloatingPointError: invalid value encountered in sqrt.
@@ -621,84 +616,162 @@ def compute_distance_matrix(gram_matrix):
     return dis_mat, dis_max, dis_min, dis_mean


-# @todo: use it in ShortestPath.
-def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]):
-    """Compute kernels between each pair of vertices in two graphs.
-
-    Parameters
-    ----------
-    g1, g2 : NetworkX graph
-        The kernels between pairs of vertices in these two graphs are computed.
-    node_kernels : dict
-        A dictionary of kernel functions for nodes, including 3 items: 'symb'
-        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
-        for both labels. The first 2 functions take two node labels as
-        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
-        non-symbolic label for each of the two nodes. Each label is in the form
-        of a 2-D array (n_samples, n_features). Each function returns a number
-        as the kernel value. Ignored when nodes are unlabeled. This argument
-        is designed for the conjugate gradient method and fixed-point iterations.
-    node_labels : list, optional
-        The list of the name strings of the node labels. The default is [].
-    node_attrs : list, optional
-        The list of the name strings of the node attributes. The default is [].
-
-    Returns
-    -------
-    vk_dict : dict
-        Vertex kernels keyed by vertices.
-
-    Notes
-    -----
-    This function is used by ``gklearn.kernels.FixedPoint'' and
-    ``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1].
-
-    References
-    ----------
-    .. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang.
-       Parallelization of shortest path graph kernels on multi-core cpus and gpus.
-       Proceedings of the Programmability Issues for Heterogeneous Multicores
-       (MultiProg), Vienna, Austria, 2014.
-    """
-    vk_dict = {}  # shortest path matrices dict
-    if len(node_labels) > 0:
-        # node symb and non-symb labeled
-        if len(node_attrs) > 0:
-            kn = node_kernels['mix']
-            for n1 in g1.nodes(data=True):
-                for n2 in g2.nodes(data=True):
-                    n1_labels = [n1[1][nl] for nl in node_labels]
-                    n2_labels = [n2[1][nl] for nl in node_labels]
-                    n1_attrs = [n1[1][na] for na in node_attrs]
-                    n2_attrs = [n2[1][na] for na in node_attrs]
-                    vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
-        # node symb labeled
-        else:
-            kn = node_kernels['symb']
-            for n1 in g1.nodes(data=True):
-                for n2 in g2.nodes(data=True):
-                    n1_labels = [n1[1][nl] for nl in node_labels]
-                    n2_labels = [n2[1][nl] for nl in node_labels]
-                    vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
-    else:
-        # node non-symb labeled
-        if len(node_attrs) > 0:
-            kn = node_kernels['nsymb']
-            for n1 in g1.nodes(data=True):
-                for n2 in g2.nodes(data=True):
-                    n1_attrs = [n1[1][na] for na in node_attrs]
-                    n2_attrs = [n2[1][na] for na in node_attrs]
-                    vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
-        # node unlabeled
-        else:
-            pass  # @todo: add edge weights.
-            # for e1 in g1.edges(data=True):
-            # 	for e2 in g2.edges(data=True):
-            # 		if e1[2]['cost'] == e2[2]['cost']:
-            # 			kernel += 1
-            # return kernel
-
-    return vk_dict
+#%%
+
+
+def graph_deepcopy(G):
+    """Deep copy a graph, including deep copy of all nodes, edges and
+    attributes of the graph, nodes and edges.
+
+    Note
+    ----
+    - It is the same as the NetworkX function graph.copy(), as far as I know.
+    - This function only supports Networkx.Graph and Networkx.DiGraph.
+    """
+    # add graph attributes.
+    labels = {}
+    for k, v in G.graph.items():
+        labels[k] = deepcopy(v)
+    if G.is_directed():
+        G_copy = nx.DiGraph(**labels)
+    else:
+        G_copy = nx.Graph(**labels)
+
+    # add nodes
+    for nd, attrs in G.nodes(data=True):
+        labels = {}
+        for k, v in attrs.items():
+            labels[k] = deepcopy(v)
+        G_copy.add_node(nd, **labels)
+
+    # add edges.
+    for nd1, nd2, attrs in G.edges(data=True):
+        labels = {}
+        for k, v in attrs.items():
+            labels[k] = deepcopy(v)
+        G_copy.add_edge(nd1, nd2, **labels)
+
+    return G_copy
+
+
+def graph_isIdentical(G1, G2):
+    """Check if two graphs are identical, including: same nodes, edges, node
+    labels/attributes, edge labels/attributes.
+
+    Notes
+    -----
+    1. The type of graphs has to be the same.
+    2. Global/Graph attributes are neglected as they may contain names for graphs.
+    """
+    # check nodes.
+    nlist1 = [n for n in G1.nodes(data=True)]
+    nlist2 = [n for n in G2.nodes(data=True)]
+    if not nlist1 == nlist2:
+        return False
+    # check edges.
+    elist1 = [n for n in G1.edges(data=True)]
+    elist2 = [n for n in G2.edges(data=True)]
+    if not elist1 == elist2:
+        return False
+    # check graph attributes.
+    return True
+
+
+def get_node_labels(Gn, node_label):
+    """Get node labels of dataset Gn.
+    """
+    nl = set()
+    for G in Gn:
+        nl = nl | set(nx.get_node_attributes(G, node_label).values())
+    return nl
+
+
+def get_edge_labels(Gn, edge_label):
+    """Get edge labels of dataset Gn.
+    """
+    el = set()
+    for G in Gn:
+        el = el | set(nx.get_edge_attributes(G, edge_label).values())
+    return el
+
+
+def get_mlti_dim_node_attrs(G, attr_names):
+    attributes = []
+    for nd, attrs in G.nodes(data=True):
+        attributes.append(tuple(attrs[aname] for aname in attr_names))
+    return attributes
+
+
+def get_mlti_dim_edge_attrs(G, attr_names):
+    attributes = []
+    for ed, attrs in G.edges(data=True):
+        attributes.append(tuple(attrs[aname] for aname in attr_names))
+    return attributes
+
+
+def nx_permute_nodes(G, random_state=None):
+    """Permute node indices in a NetworkX graph.
+
+    Parameters
+    ----------
+    G : TYPE
+        DESCRIPTION.
+    random_state : TYPE, optional
+        DESCRIPTION. The default is None.
+
+    Returns
+    -------
+    G_new : TYPE
+        DESCRIPTION.
+
+    Notes
+    -----
+    - This function only supports Networkx.Graph and Networkx.DiGraph.
+    """
+    # @todo: relabel node with integers? (in case something went wrong...)
+    # Add graph attributes.
+    labels = {}
+    for k, v in G.graph.items():
+        labels[k] = deepcopy(v)
+    if G.is_directed():
+        G_new = nx.DiGraph(**labels)
+    else:
+        G_new = nx.Graph(**labels)
+
+    # Create a random mapping old node indices <-> new indices.
+    nb_nodes = nx.number_of_nodes(G)
+    indices_orig = range(nb_nodes)
+    idx_mapping = np.random.RandomState(seed=random_state).permutation(indices_orig)
+
+    # Add nodes.
+    nodes_orig = list(G.nodes)
+    for i_orig in range(nb_nodes):
+        i_new = idx_mapping[i_orig]
+        labels = {}
+        for k, v in G.nodes[nodes_orig[i_new]].items():
+            labels[k] = deepcopy(v)
+        G_new.add_node(nodes_orig[i_new], **labels)
+
+    # Add edges.
+    for nd1, nd2, attrs in G.edges(data=True):
+        labels = {}
+        for k, v in attrs.items():
+            labels[k] = deepcopy(v)
+        G_new.add_edge(nd1, nd2, **labels)
+
+    # # create a random mapping old label -> new label
+    # node_mapping = dict(zip(G.nodes(), np.random.RandomState(seed=random_state).permutation(G.nodes())))
+    # # build a new graph
+    # G_new = nx.relabel_nodes(G, node_mapping)
+
+    return G_new
+
+
+#%%
+
+
 def dummy_node():
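
A small usage sketch of nx_permute_nodes (toy graph assumed): the node and edge sets are preserved and only the storage order of the nodes is shuffled:

    import networkx as nx

    g = nx.path_graph(5)
    g_perm = nx_permute_nodes(g, random_state=42)
    assert set(g.nodes()) == set(g_perm.nodes())
    assert set(g.edges()) == set(g_perm.edges())
    print(list(g_perm.nodes()))  # a permuted order, e.g. [1, 4, 2, 0, 3]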