add helper function generate_median_preimage_by_class in gklearn.preimage.utils (tag: v0.2.0)
@@ -105,7 +105,7 @@ A comparison of performances of graph kernels on benchmark datasets can be found
 ## Authors

-* [Linlin Jia](https://github.com/jajupmochi), LITIS, INSA Rouen Normandie
+* [Linlin Jia](https://jajupmochi.github.io/), LITIS, INSA Rouen Normandie
 * [Benoit Gaüzère](http://pagesperso.litislab.fr/~bgauzere/#contact_en), LITIS, INSA Rouen Normandie
 * [Paul Honeine](http://honeine.fr/paul/Welcome.html), LITIS, Université de Rouen Normandie
@@ -17,6 +17,7 @@ from gklearn.ged.util import compute_geds, ged_options_to_string
 from gklearn.ged.median import MedianGraphEstimator
 from gklearn.ged.median import constant_node_costs, mge_options_to_string
 from gklearn.gedlib import librariesImport, gedlibpy
+from gklearn.utils import Timer
 # from gklearn.utils.dataset import Dataset

 class MedianPreimageGenerator(PreimageGenerator):
@@ -29,10 +30,13 @@ class MedianPreimageGenerator(PreimageGenerator):
 		self.__mge_options = {}
 		self.__fit_method = 'k-graphs'
 		self.__init_ecc = None
-		self.__max_itrs = 100
 		self.__parallel = True
 		self.__n_jobs = multiprocessing.cpu_count()
 		self.__ds_name = None
+		self.__time_limit_in_sec = 0
+		self.__max_itrs = 100
+		self.__max_itrs_without_update = 3
+		self.__epsilon_ratio = 0.01
 		# values to compute.
 		self.__edit_cost_constants = []
 		self.__runtime_precompute_gm = None
@@ -41,11 +45,15 @@ class MedianPreimageGenerator(PreimageGenerator):
 		self.__runtime_total = None
 		self.__set_median = None
 		self.__gen_median = None
+		self.__best_from_dataset = None
 		self.__sod_set_median = None
 		self.__sod_gen_median = None
 		self.__k_dis_set_median = None
 		self.__k_dis_gen_median = None
 		self.__k_dis_dataset = None
+		self.__itrs = 0
+		self.__converged = False
+		self.__num_updates_ecc = 0

 	def set_options(self, **kwargs):
@@ -57,10 +65,13 @@ class MedianPreimageGenerator(PreimageGenerator):
 		self.__fit_method = kwargs.get('fit_method', 'k-graphs')
 		self.__init_ecc = kwargs.get('init_ecc', None)
 		self.__edit_cost_constants = kwargs.get('edit_cost_constants', [])
-		self.__max_itrs = kwargs.get('max_itrs', 100)
 		self.__parallel = kwargs.get('parallel', True)
 		self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
 		self.__ds_name = kwargs.get('ds_name', None)
+		self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
+		self.__max_itrs = kwargs.get('max_itrs', 100)
+		self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3)
+		self.__epsilon_ratio = kwargs.get('epsilon_ratio', 0.01)

 	def run(self):
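For orientation, a minimal sketch of how the new stopping-related options might be passed (the option names are exactly those read by `set_options` above; the values are illustrative):

```python
from gklearn.preimage import MedianPreimageGenerator

mpg = MedianPreimageGenerator()
mpg.set_options(fit_method='k-graphs',
                time_limit_in_sec=0,        # 0 disables the wall-clock limit
                max_itrs=100,               # hard cap on optimization iterations
                max_itrs_without_update=3,  # stop after 3 stagnant iterations
                epsilon_ratio=0.01)         # tolerance for the convergence checks
```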
@@ -75,7 +86,6 @@ class MedianPreimageGenerator(PreimageGenerator):
 		self.__runtime_precompute_gm = end_precompute_gm - start

 		# 2. optimize edit cost constants.
-#		self.__optimize_edit_cost_constants(dataset=dataset, Gn=Gn, Kmatrix_median=Kmatrix_median)
 		self.__optimize_edit_cost_constants()
 		end_optimize_ec = time.time()
 		self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm
@@ -108,28 +118,47 @@ class MedianPreimageGenerator(PreimageGenerator):
 		if self._verbose:
 			print()
 			print('================================================================================')
-			print('The optimized edit cost constants: ', self.__edit_cost_constants)
-			print('SOD of the set median: ', self.__sod_set_median)
-			print('SOD of the generalized median: ', self.__sod_gen_median)
+			print('Finished generalization of preimages.')
+			print('--------------------------------------------------------------------------------')
+			print('The optimized edit cost constants:', self.__edit_cost_constants)
+			print('SOD of the set median:', self.__sod_set_median)
+			print('SOD of the generalized median:', self.__sod_gen_median)
 			print('Distance in kernel space for set median:', self.__k_dis_set_median)
 			print('Distance in kernel space for generalized median:', self.__k_dis_gen_median)
 			print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
-			print('Time to pre-compute Gram matrix: ', self.__runtime_precompute_gm)
-			print('Time to optimize edit costs: ', self.__runtime_optimize_ec)
-			print('Time to generate pre-images: ', self.__runtime_generate_preimage)
-			print('Total time: ', self.__runtime_total)
+			print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
+			print('Time to optimize edit costs:', self.__runtime_optimize_ec)
+			print('Time to generate pre-images:', self.__runtime_generate_preimage)
+			print('Total time:', self.__runtime_total)
+			print('Total number of iterations for optimizing:', self.__itrs)
+			print('Total number of updating edit costs:', self.__num_updates_ecc)
+			print('Did the optimization of edit costs converge:', self.__converged)
 			print('================================================================================')

 		# collect return values.
 		# return (sod_sm, sod_gm), \
 		#	   (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \
 		#	   (time_fitting, time_generating)

+	def get_results(self):
+		results = {}
+		results['edit_cost_constants'] = self.__edit_cost_constants
+		results['runtime_precompute_gm'] = self.__runtime_precompute_gm
+		results['runtime_optimize_ec'] = self.__runtime_optimize_ec
+		results['runtime_generate_preimage'] = self.__runtime_generate_preimage
+		results['runtime_total'] = self.__runtime_total
+		results['sod_set_median'] = self.__sod_set_median
+		results['sod_gen_median'] = self.__sod_gen_median
+		results['k_dis_set_median'] = self.__k_dis_set_median
+		results['k_dis_gen_median'] = self.__k_dis_gen_median
+		results['k_dis_dataset'] = self.__k_dis_dataset
+		results['itrs'] = self.__itrs
+		results['converged'] = self.__converged
+		results['num_updates_ecc'] = self.__num_updates_ecc
+		return results
+
-#	def __optimize_edit_cost_constants(self, dataset=None, Gn=None, Kmatrix_median=None):
 	def __optimize_edit_cost_constants(self):
 		"""fit edit cost constants.
 		"""
@@ -177,8 +206,6 @@ class MedianPreimageGenerator(PreimageGenerator):
 					self.__init_ecc = [3, 3, 1, 3, 3, 1]
 			# optimize on the k-graph subset.
 			self.__optimize_ecc_by_kernel_distances()
-#			fit_GED_to_kernel_distance(Gn_median,
-#									   dataset=dataset, Kmatrix=Kmatrix_median)
 		elif self.__fit_method == 'whole-dataset':
 			if self.__init_ecc is None:
 				if self.__ged_options['edit_cost'] == 'LETTER':
@@ -189,15 +216,11 @@ class MedianPreimageGenerator(PreimageGenerator):
 					self.__init_ecc = [3, 3, 1, 3, 3, 1]
 			# optimize on the whole set.
 			self.__optimize_ecc_by_kernel_distances()
-#			fit_GED_to_kernel_distance(Gn, dataset=dataset)
 		elif self.__fit_method == 'precomputed':
 			pass

-	def __optimize_ecc_by_kernel_distances(self):
-#	def fit_GED_to_kernel_distance(Gn, Kmatrix=None,
-#								   parallel=True):
+	def __optimize_ecc_by_kernel_distances(self):
 		# compute distances in feature space.
 		dis_k_mat, _, _, _ = self.__graph_kernel.compute_distance_matrix()
 		dis_k_vec = []
@@ -222,20 +245,25 @@ class MedianPreimageGenerator(PreimageGenerator):
 		nb_cost_mat = np.array(n_edit_operations)
 		nb_cost_mat_list = [nb_cost_mat]
 		if self._verbose >= 2:
-			print('edit_cost_constants:', self.__edit_cost_constants)
-			print('residual_list:', residual_list)
-
-		for itr in range(self.__max_itrs):
+			print('Current edit cost constants:', self.__edit_cost_constants)
+			print('Residual list:', residual_list)
+
+		# run iterations from the initial edit costs.
+		self.__converged = False
+		itrs_without_update = 0
+		self.__itrs = 0
+		self.__num_updates_ecc = 0
+		timer = Timer(self.__time_limit_in_sec)
+		while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update):
 			if self._verbose >= 2:
-				print('\niteration', itr)
+				print('\niteration', self.__itrs)
 			time0 = time.time()
-			# "fit" geds to distances in feature space by tuning edit costs using the
-			# Least Squares Method.
-			np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm',
-					 nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
-					 n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
-					 ged_mat=ged_mat)
-			self.__edit_cost_constants, residual = self.__update_ecc(nb_cost_mat, dis_k_vec)
+			# "fit" geds to distances in feature space by tuning edit costs using the Least Squares Method.
+#			np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm',
+#					 nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
+#					 n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
+#					 ged_mat=ged_mat)
+			self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec)
 			for i in range(len(self.__edit_cost_constants)):
 				if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9:
 					self.__edit_cost_constants[i] = 0
@@ -254,12 +282,59 @@ class MedianPreimageGenerator(PreimageGenerator):
 			edit_cost_list.append(self.__edit_cost_constants)
 			nb_cost_mat = np.array(n_edit_operations)
 			nb_cost_mat_list.append(nb_cost_mat)
+
+			# check convergence.
+			ec_changed = False
+			for i, cost in enumerate(self.__edit_cost_constants):
+#				if cost == 0:
+#					if edit_cost_list[-2][i] > self.__epsilon_ratio:
+#						ec_changed = True
+#						break
+#				elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ratio:
+#					ec_changed = True
+#					break
+				if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ratio:
+					ec_changed = True
+					break
+			residual_changed = False
+			if residual_list[-1] == 0:
+				if residual_list[-2] > self.__epsilon_ratio:
+					residual_changed = True
+			elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_ratio:
+				residual_changed = True
+			self.__converged = not (ec_changed or residual_changed)
+			if self.__converged:
+				itrs_without_update += 1
+			else:
+				itrs_without_update = 0
+				self.__num_updates_ecc += 1
+
+			# print current states.
 			if self._verbose >= 2:
-				print('edit_cost_constants:', self.__edit_cost_constants)
-				print('residual_list:', residual_list)
-
-#		return residual_list, edit_cost_list, dis_k_mat, ged_mat, \
-#			time_list, nb_cost_mat_list
+				print()
+				print('-------------------------------------------------------------------------')
+				print('States of iteration', str(self.__itrs))
+				print('-------------------------------------------------------------------------')
+#				print('Time spent:', self.__runtime_optimize_ec)
+				print('Total number of iterations for optimizing:', self.__itrs)
+				print('Total number of updating edit costs:', self.__num_updates_ecc)
+				print('Did the optimization of edit costs converge:', self.__converged)
+				print('Did the edit costs change:', ec_changed)
+				print('Did the residual change:', residual_changed)
+				print('Iterations without update:', itrs_without_update)
+				print('Current edit cost constants:', self.__edit_cost_constants)
+				print('Residual list:', residual_list)
+				print('-------------------------------------------------------------------------')
+
+			self.__itrs += 1
+
+	def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
+		if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
+#			if self.__state == AlgorithmState.TERMINATED:
+#				self.__state = AlgorithmState.INITIALIZED
			return True
+		return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)

 	def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'):
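For clarity, the stopping rule above can be read in isolation: the loop ends unconditionally on time-out or when the iteration cap is hit, and otherwise ends once the edit costs converge or stagnate for too long. A standalone sketch of the same logic (names mirror the diff; a negative cap disables the corresponding check):

```python
def termination_criterion_met(converged, timer, itr, itrs_without_update,
                              max_itrs=100, max_itrs_without_update=3):
    # hard stops: wall-clock limit or iteration cap (negative cap = disabled).
    if timer.expired() or (max_itrs >= 0 and itr >= max_itrs):
        return True
    # soft stops: convergence, or too many iterations in a row without
    # any update to the edit cost constants.
    return converged or (max_itrs_without_update >= 0
                         and itrs_without_update > max_itrs_without_update)
```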
@@ -591,6 +666,7 @@ class MedianPreimageGenerator(PreimageGenerator):
 													  gram_with_gm, withterm3=False))
 		idx_k_dis_median_set_min = np.argmin(k_dis_median_set)
 		self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min]
+		self.__best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy()

 		if self._verbose >= 2:
 			print()
@@ -599,8 +675,6 @@ class MedianPreimageGenerator(PreimageGenerator):
 			print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
 			print('distance in kernel space for each graph in median set:', k_dis_median_set)

-#		return dis_k_sm, dis_k_gm, k_dis_median_set, dis_k_gi_min, idx_dis_k_gi_min

 	def __set_graph_kernel_by_name(self):
 		if self.kernel_options['name'] == 'structuralspkernel':
@@ -670,5 +744,20 @@ class MedianPreimageGenerator(PreimageGenerator):
 		return self.__init_ecc

 	@init_ecc.setter
-	def fit_method(self, value):
-		self.__init_ecc = value
+	def init_ecc(self, value):
+		self.__init_ecc = value
+
+	@property
+	def set_median(self):
+		return self.__set_median
+
+	@property
+	def gen_median(self):
+		return self.__gen_median
+
+	@property
+	def best_from_dataset(self):
+		return self.__best_from_dataset
@@ -20,9 +20,12 @@ def test_median_preimage_generator():
 	mpg = MedianPreimageGenerator()
 	mpg_options = {'fit_method': 'k-graphs',
 				   'init_ecc': [3, 3, 1, 3, 3],
-				   'max_itrs': 6,
 				   'ds_name': 'Letter-high',
 				   'parallel': True,
+				   'time_limit_in_sec': 0,
+				   'max_itrs': 100,
+				   'max_itrs_without_update': 3,
+				   'epsilon_ratio': 0.01,
 				   'verbose': 2}
 	mpg.set_options(**mpg_options)
 	mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
@@ -19,146 +19,408 @@ from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct, po
 from gklearn.kernels.structuralspKernel import structuralspkernel
 from gklearn.kernels.treeletKernel import treeletkernel
 from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
+from gklearn.utils import Dataset
+import csv
+import matplotlib.pyplot as plt
+import networkx as nx
+
+def generate_median_preimage_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, dir_save=''):
+	from gklearn.preimage import MedianPreimageGenerator
+	from gklearn.utils import split_dataset_by_target
+	from gklearn.utils.graphfiles import saveGXL
+
+	# 1. get dataset.
+	print('getting dataset...')
+	dataset_all = Dataset()
+	dataset_all.load_predefined_dataset(ds_name)
+	datasets = split_dataset_by_target(dataset_all)
+#	dataset.cut_graphs(range(0, 10))
+
+	if save_results:
+		# create result files.
+		print('creating output files...')
+		fn_output_detail, fn_output_summary = __init_output_file(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)
+
+	sod_sm_list = []
+	sod_gm_list = []
+	dis_k_sm_list = []
+	dis_k_gm_list = []
+	dis_k_gi_min_list = []
+	time_precompute_gm_list = []
+	time_optimize_ec_list = []
+	time_generate_list = []
+	time_total_list = []
+	itrs_list = []
+	converged_list = []
+	num_updates_ecc_list = []
+	nb_sod_sm2gm = [0, 0, 0]
+	nb_dis_k_sm2gm = [0, 0, 0]
+	nb_dis_k_gi2sm = [0, 0, 0]
+	nb_dis_k_gi2gm = [0, 0, 0]
+#	repeats_better_sod_sm2gm = []
+#	repeats_better_dis_k_sm2gm = []
+#	repeats_better_dis_k_gi2sm = []
+#	repeats_better_dis_k_gi2gm = []
+
+	print('start generating preimage for each class of target...')
+	for dataset in datasets:
+		print('\ntarget =', dataset.targets[0], '\n')
+		num_graphs = len(dataset.graphs)
+		if num_graphs < 2:
+			print('\nnumber of graphs = ', num_graphs, ', skip.\n')
+			continue
+
+		# 2. set parameters.
+		print('1. initializing mpg and setting parameters...')
+		mpg = MedianPreimageGenerator()
+		mpg.dataset = dataset
+		mpg.set_options(**mpg_options.copy())
+		mpg.kernel_options = kernel_options.copy()
+		mpg.ged_options = ged_options.copy()
+		mpg.mge_options = mge_options.copy()
+
+		# 3. compute median preimage.
+		print('2. computing median preimage...')
+		mpg.run()
+		results = mpg.get_results()
+
+		# write result detail.
+		if save_results:
+			print('writing results to files...')
+			sod_sm2gm = get_relations(np.sign(results['sod_gen_median'] - results['sod_set_median']))
+			dis_k_sm2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_set_median']))
+			dis_k_gi2sm = get_relations(np.sign(results['k_dis_set_median'] - results['k_dis_dataset']))
+			dis_k_gi2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_dataset']))
+
+			f_detail = open(dir_save + fn_output_detail, 'a')
+			csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
+				ged_options['edit_cost'], ged_options['method'],
+				ged_options['attr_distance'], mpg_options['fit_method'],
+				num_graphs, dataset.targets[0], 1,
+				results['sod_set_median'], results['sod_gen_median'],
+				results['k_dis_set_median'], results['k_dis_gen_median'],
+				results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
+				dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'],
+				results['runtime_precompute_gm'], results['runtime_optimize_ec'],
+				results['runtime_generate_preimage'], results['runtime_total'],
+				results['itrs'], results['converged'],
+				results['num_updates_ecc']])
+			f_detail.close()
+
+			# compute result summary.
+			sod_sm_list.append(results['sod_set_median'])
+			sod_gm_list.append(results['sod_gen_median'])
+			dis_k_sm_list.append(results['k_dis_set_median'])
+			dis_k_gm_list.append(results['k_dis_gen_median'])
+			dis_k_gi_min_list.append(results['k_dis_dataset'])
+			time_precompute_gm_list.append(results['runtime_precompute_gm'])
+			time_optimize_ec_list.append(results['runtime_optimize_ec'])
+			time_generate_list.append(results['runtime_generate_preimage'])
+			time_total_list.append(results['runtime_total'])
+			itrs_list.append(results['itrs'])
+			converged_list.append(results['converged'])
+			num_updates_ecc_list.append(results['num_updates_ecc'])
+			# # SOD SM -> GM
+			if results['sod_set_median'] > results['sod_gen_median']:
+				nb_sod_sm2gm[0] += 1
+#				repeats_better_sod_sm2gm.append(1)
+			elif results['sod_set_median'] == results['sod_gen_median']:
+				nb_sod_sm2gm[1] += 1
+			elif results['sod_set_median'] < results['sod_gen_median']:
+				nb_sod_sm2gm[2] += 1
+			# # dis_k SM -> GM
+			if results['k_dis_set_median'] > results['k_dis_gen_median']:
+				nb_dis_k_sm2gm[0] += 1
+#				repeats_better_dis_k_sm2gm.append(1)
+			elif results['k_dis_set_median'] == results['k_dis_gen_median']:
+				nb_dis_k_sm2gm[1] += 1
+			elif results['k_dis_set_median'] < results['k_dis_gen_median']:
+				nb_dis_k_sm2gm[2] += 1
+			# # dis_k gi -> SM
+			if results['k_dis_dataset'] > results['k_dis_set_median']:
+				nb_dis_k_gi2sm[0] += 1
+#				repeats_better_dis_k_gi2sm.append(1)
+			elif results['k_dis_dataset'] == results['k_dis_set_median']:
+				nb_dis_k_gi2sm[1] += 1
+			elif results['k_dis_dataset'] < results['k_dis_set_median']:
+				nb_dis_k_gi2sm[2] += 1
+			# # dis_k gi -> GM
+			if results['k_dis_dataset'] > results['k_dis_gen_median']:
+				nb_dis_k_gi2gm[0] += 1
+#				repeats_better_dis_k_gi2gm.append(1)
+			elif results['k_dis_dataset'] == results['k_dis_gen_median']:
+				nb_dis_k_gi2gm[1] += 1
+			elif results['k_dis_dataset'] < results['k_dis_gen_median']:
+				nb_dis_k_gi2gm[2] += 1
+
+			# write result summary for each class.
+			f_summary = open(dir_save + fn_output_summary, 'a')
+			csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
+				ged_options['edit_cost'], ged_options['method'],
+				ged_options['attr_distance'], mpg_options['fit_method'],
+				num_graphs, dataset.targets[0],
+				results['sod_set_median'], results['sod_gen_median'],
+				results['k_dis_set_median'], results['k_dis_gen_median'],
+				results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
+				dis_k_gi2sm, dis_k_gi2gm,
+				results['runtime_precompute_gm'], results['runtime_optimize_ec'],
+				results['runtime_generate_preimage'], results['runtime_total'],
+				results['itrs'], results['converged'],
+				results['num_updates_ecc'], nb_sod_sm2gm,
+				nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
+			f_summary.close()
+
+		# save median graphs.
+		if save_medians:
+			fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
+			saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
+					node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+					node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
+			fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
+			saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default',
+					node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+					node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
+			fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
+			saveGXL(mpg.best_from_dataset, fn_best_dataset + '.gxl', method='default',
+					node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+					node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
+
+		# plot median graphs.
+		if plot_medians and save_medians:
+			if ds_name == 'Letter-high' or ds_name == 'Letter-med' or ds_name == 'Letter-low':
+				draw_Letter_graph(mpg.set_median, fn_pre_sm)
+				draw_Letter_graph(mpg.gen_median, fn_pre_gm)
+				draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)
+
+	# write result summary for the whole dataset.
+	if save_results:
+		sod_sm_mean = np.mean(sod_sm_list)
+		sod_gm_mean = np.mean(sod_gm_list)
+		dis_k_sm_mean = np.mean(dis_k_sm_list)
+		dis_k_gm_mean = np.mean(dis_k_gm_list)
+		dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
+		time_precompute_gm_mean = np.mean(time_precompute_gm_list)
+		time_optimize_ec_mean = np.mean(time_optimize_ec_list)
+		time_generate_mean = np.mean(time_generate_list)
+		time_total_mean = np.mean(time_total_list)
+		itrs_mean = np.mean(itrs_list)
+		num_converged = np.sum(converged_list)
+		num_updates_ecc_mean = np.mean(num_updates_ecc_list)
+		sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean))
+		dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
+		dis_k_gi2sm_mean = get_relations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
+		dis_k_gi2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
+		f_summary = open(dir_save + fn_output_summary, 'a')
+		csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
+			ged_options['edit_cost'], ged_options['method'],
+			ged_options['attr_distance'], mpg_options['fit_method'],
+			num_graphs, 'all',
+			sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
+			dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
+			dis_k_gi2sm_mean, dis_k_gi2gm_mean,
+			time_precompute_gm_mean, time_optimize_ec_mean,
+			time_generate_mean, time_total_mean, itrs_mean,
+			num_converged, num_updates_ecc_mean])
+		f_summary.close()
+
+	print('\ncomplete.')
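For orientation, a hedged sketch of how the new helper might be invoked. The `mpg_options` values mirror the updated test above; the kernel/GED/MGE option values are illustrative placeholders, and only the keys the code actually reads (`kernel_options['name']`, `ged_options['edit_cost']`, `ged_options['method']`, `ged_options['attr_distance']`) are taken from this diff:

```python
from gklearn.preimage.utils import generate_median_preimage_by_class

mpg_options = {'fit_method': 'k-graphs', 'init_ecc': [3, 3, 1, 3, 3],
               'ds_name': 'Letter-high', 'parallel': True,
               'time_limit_in_sec': 0, 'max_itrs': 100,
               'max_itrs_without_update': 3, 'epsilon_ratio': 0.01,
               'verbose': 2}
kernel_options = {'name': 'structuralspkernel'}  # kernel-specific keys omitted
ged_options = {'edit_cost': 'LETTER', 'method': 'IPFP',   # method/attr_distance
               'attr_distance': 'euclidean'}              # values are illustrative
mge_options = {}  # estimator defaults; illustrative

generate_median_preimage_by_class('Letter-high', mpg_options, kernel_options,
                                  ged_options, mge_options, dir_save='results/')
```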
+
+def __init_output_file(ds_name, gkernel, fit_method, dir_output):
+#	fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
+	fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
+	f_detail = open(dir_output + fn_output_detail, 'a')
+	csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost',
+		'GED method', 'attr distance', 'fit method', 'k',
+		'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+		'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+		'dis_k gi -> GM', 'edit cost constants', 'time precompute gm',
+		'time optimize ec', 'time generate preimage', 'time total',
+		'itrs', 'converged', 'num updates ecc'])
+	f_detail.close()
+
+#	fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
+	fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
+	f_summary = open(dir_output + fn_output_summary, 'a')
+	csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost',
+		'GED method', 'attr distance', 'fit method', 'k',
+		'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+		'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+		'dis_k gi -> GM', 'time precompute gm', 'time optimize ec',
+		'time generate preimage', 'time total', 'itrs', 'num converged',
+		'num updates ecc', '# SOD SM -> GM', '# dis_k SM -> GM',
+		'# dis_k gi -> SM', '# dis_k gi -> GM'])
+#		'repeats better SOD SM -> GM',
+#		'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
+#		'repeats better dis_k gi -> GM'])
+	f_summary.close()
+
+	return fn_output_detail, fn_output_summary
+
+def get_relations(sign):
+	if sign == -1:
+		return 'better'
+	elif sign == 0:
+		return 'same'
+	elif sign == 1:
+		return 'worse'
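The sign convention reads left to right: `get_relations(np.sign(sod_gm - sod_sm))` is `'better'` when the generalized median improves on (is smaller than) the set median. A quick sketch:

```python
>>> import numpy as np
>>> get_relations(np.sign(0.8 - 1.2))  # generalized median has lower SOD
'better'
>>> get_relations(np.sign(1.2 - 0.8))
'worse'
```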
+
+# draw the current median graph.
+def draw_Letter_graph(graph, file_prefix):
+	plt.figure()
+	pos = {}
+	for n in graph.nodes:
+		pos[n] = np.array([float(graph.node[n]['x']), float(graph.node[n]['y'])])
+	nx.draw_networkx(graph, pos)
+	plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
+#	plt.show()
+	plt.clf()
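One compatibility note, as an aside rather than part of the commit: `Graph.node` was deprecated in NetworkX 2.0 and removed in 2.4, so on newer NetworkX the position lookup above would need `graph.nodes`; a sketch of the equivalent:

```python
# equivalent position extraction on NetworkX >= 2.4, where Graph.node was removed
pos = {n: np.array([float(data['x']), float(data['y'])])
       for n, data in graph.nodes(data=True)}
```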

 def remove_edges(Gn):
 	for G in Gn:
 		for _, _, attrs in G.edges(data=True):
 			attrs.clear()

 def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
 	term1 = Kmatrix[idx_g, idx_g]
 	term2 = 0
 	for i, a in enumerate(alpha):
 		term2 += a * Kmatrix[idx_g, idx_gi[i]]
 	term2 *= 2
 	if withterm3 == False:
 		for i1, a1 in enumerate(alpha):
 			for i2, a2 in enumerate(alpha):
 				term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
 	return np.sqrt(term1 - term2 + term3)

 def compute_k_dis(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
 	term1 = Kmatrix[idx_g, idx_g]
 	term2 = 0
 	for i, a in enumerate(alpha):
 		term2 += a * Kmatrix[idx_g, idx_gi[i]]
 	term2 *= 2
 	if withterm3 == False:
 		for i1, a1 in enumerate(alpha):
 			for i2, a2 in enumerate(alpha):
 				term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
 	return np.sqrt(term1 - term2 + term3)
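`dis_gstar` and `compute_k_dis` are verbatim copies of each other; both evaluate the kernel-space distance between a graph g and the weighted combination of the graphs indexed by `idx_gi` (with the quadratic term optionally passed in precomputed as `term3`):

```latex
d(g, \bar{g}) = \sqrt{\, k(g, g) \;-\; 2 \sum_{i} \alpha_i \, k(g, g_i)
	\;+\; \sum_{i_1} \sum_{i_2} \alpha_{i_1} \alpha_{i_2} \, k(g_{i_1}, g_{i_2}) \,}
```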

 def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='imap_unordered'):
 	if graph_kernel == 'marginalizedkernel':
 		Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label,
 										p_quit=0.03, n_iteration=10, remove_totters=False,
 										n_jobs=multiprocessing.cpu_count(), verbose=verbose)
 	elif graph_kernel == 'untilhpathkernel':
 		Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
 									  depth=7, k_func='MinMax', compute_method='trie',
 									  parallel=parallel,
 									  n_jobs=multiprocessing.cpu_count(), verbose=verbose)
 	elif graph_kernel == 'spkernel':
 		mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
 		Kmatrix = np.empty((len(Gn), len(Gn)))
#		Kmatrix[:] = np.nan
 		Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels=
 								   {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
 								   n_jobs=multiprocessing.cpu_count(), verbose=verbose)
#		for i, row in enumerate(idx):
#			for j, col in enumerate(idx):
#				Kmatrix[row, col] = Kmatrix_tmp[i, j]
 	elif graph_kernel == 'structuralspkernel':
 		mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
 		sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
 		Kmatrix, _ = structuralspkernel(Gn, node_label=node_label,
 										edge_label=edge_label, node_kernels=sub_kernels,
 										edge_kernels=sub_kernels,
 										parallel=parallel, n_jobs=multiprocessing.cpu_count(),
 										verbose=verbose)
 	elif graph_kernel == 'treeletkernel':
 		pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
#		pkernel = functools.partial(gaussiankernel, gamma=1e-6)
 		mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
 		Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
 								   sub_kernel=pkernel, parallel=parallel,
 								   n_jobs=multiprocessing.cpu_count(), verbose=verbose)
 	elif graph_kernel == 'weisfeilerlehmankernel':
 		Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
 											height=4, base_kernel='subtree', parallel=None,
 											n_jobs=multiprocessing.cpu_count(), verbose=verbose)

 	# normalization
 	Kmatrix_diag = Kmatrix.diagonal().copy()
 	for i in range(len(Kmatrix)):
 		for j in range(i, len(Kmatrix)):
 			Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
 			Kmatrix[j][i] = Kmatrix[i][j]
 	return Kmatrix
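The trailing loop is the standard cosine normalization of a Gram matrix, K[i, j] / sqrt(K[i, i] * K[j, j]). A vectorized equivalent, offered as a sketch rather than part of the diff:

```python
import numpy as np

def normalize_gram(K):
    # cosine-normalize a Gram matrix: K[i, j] / sqrt(K[i, i] * K[j, j])
    d = np.sqrt(np.diag(K))
    return K / np.outer(d, d)
```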

 def gram2distances(Kmatrix):
 	dmatrix = np.zeros((len(Kmatrix), len(Kmatrix)))
 	for i1 in range(len(Kmatrix)):
 		for i2 in range(len(Kmatrix)):
 			dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2]
 	dmatrix = np.sqrt(dmatrix)
 	return dmatrix

 def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None,
 						   gkernel=None, verbose=True):
 	dis_mat = np.empty((len(Gn), len(Gn)))
 	if Kmatrix is None:
 		Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose)
 	for i in range(len(Gn)):
 		for j in range(i, len(Gn)):
 			dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
 			if dis < 0:
 				if dis > -1e-10:
 					dis = 0
 				else:
 					raise ValueError('The distance is negative.')
 			dis_mat[i, j] = np.sqrt(dis)
 			dis_mat[j, i] = dis_mat[i, j]
 	dis_max = np.max(np.max(dis_mat))
 	dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
 	dis_mean = np.mean(np.mean(dis_mat))
 	return dis_mat, dis_max, dis_min, dis_mean

 def get_same_item_indices(ls):
 	"""Get the indices of the same items in a list. Return a dict keyed by items.
 	"""
 	idx_dict = {}
 	for idx, item in enumerate(ls):
 		if item in idx_dict:
 			idx_dict[item].append(idx)
 		else:
 			idx_dict[item] = [idx]
 	return idx_dict
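`generate_median_preimage_by_class` relies on this helper (via `split_dataset_by_target` below) to group graph indices by target; its behavior in one line:

```python
>>> get_same_item_indices(['A', 'E', 'A', 'I', 'E'])
{'A': [0, 2], 'E': [1, 4], 'I': [3]}
```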

 def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
 												  node_label=None, edge_label=None):
 	dis_k_all = []  # distance between g_star and each graph.
 	alpha = [1 / len(Gn)] * len(Gn)
 	if Kmatrix is None:
 		Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
 	term3 = 0
 	for i1, a1 in enumerate(alpha):
 		for i2, a2 in enumerate(alpha):
 			term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
 	for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
 		dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
 		dis_all.append(dtemp)
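As committed, this function looks unfinished: `idx_gi`, `Gn_init`, and `dis_all` are never defined, `dis_k_all` is never used, and nothing is returned. A hedged sketch of what a self-contained version might look like, assuming the median is taken over all of `Gn` (the function name and `k` selection are this sketch's own, not the repo's):

```python
def knn_to_median_in_kernel_space(Gn, Kmatrix, k=5):
    # distance of every graph to the uniform mean of Gn in kernel space.
    n = len(Gn)
    alpha = [1 / n] * n
    idx_gi = list(range(n))
    term3 = sum(a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
                for i1, a1 in enumerate(alpha)
                for i2, a2 in enumerate(alpha))
    dis_all = [dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
               for ig in range(n)]
    # indices of the k graphs nearest to the median in kernel space.
    return sorted(range(n), key=lambda i: dis_all[i])[:k]
```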

 def normalize_distance_matrix(D):
 	max_value = np.amax(D)
 	min_value = np.amin(D)
 	return (D - min_value) / (max_value - min_value)
@@ -15,5 +15,5 @@ __date__ = "November 2017"
 # from utils import graphfiles
 # from utils import utils
-from gklearn.utils.dataset import Dataset
+from gklearn.utils.dataset import Dataset, split_dataset_by_target
 from gklearn.utils.timer import Timer
@@ -8,6 +8,7 @@ Created on Thu Mar 26 18:48:27 2020
 import numpy as np
 import networkx as nx
 from gklearn.utils.graphfiles import loadDataset
+import os

 class Dataset(object):
@@ -15,7 +16,7 @@ class Dataset(object):
 	def __init__(self, filename=None, filename_y=None, extra_params=None):
 		if filename is None:
 			self.__graphs = None
-			self.__target = None
+			self.__targets = None
 			self.__node_labels = None
 			self.__edge_labels = None
 			self.__node_attrs = None
| @@ -50,33 +51,40 @@ class Dataset(object): | |||||
| def load_dataset(self, filename, filename_y=None, extra_params=None): | def load_dataset(self, filename, filename_y=None, extra_params=None): | ||||
| self.__graphs, self.__target = loadDataset(filename, filename_y=filename_y, extra_params=extra_params) | |||||
| self.__graphs, self.__targets = loadDataset(filename, filename_y=filename_y, extra_params=extra_params) | |||||
| self.set_labels_attrs() | |||||
| def load_graphs(self, graphs, targets=None): | |||||
| self.__graphs = graphs | |||||
| self.__targets = targets | |||||
| self.set_labels_attrs() | self.set_labels_attrs() | ||||
| def load_predefined_dataset(self, ds_name): | def load_predefined_dataset(self, ds_name): | ||||
| current_path = os.path.dirname(os.path.realpath(__file__)) + '/' | |||||
| if ds_name == 'Letter-high': # node non-symb | if ds_name == 'Letter-high': # node non-symb | ||||
| ds_file = '../../datasets/Letter-high/Letter-high_A.txt' | |||||
| self.__graphs, self.__target = loadDataset(ds_file) | |||||
| ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt' | |||||
| self.__graphs, self.__targets = loadDataset(ds_file) | |||||
| elif ds_name == 'Letter-med': # node non-symb | elif ds_name == 'Letter-med': # node non-symb | ||||
| ds_file = '../../datasets/Letter-high/Letter-med_A.txt' | |||||
| self.__graphs, self.__target = loadDataset(ds_file) | |||||
| ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt' | |||||
| self.__graphs, self.__targets = loadDataset(ds_file) | |||||
| elif ds_name == 'Letter-low': # node non-symb | elif ds_name == 'Letter-low': # node non-symb | ||||
| ds_file = '../../datasets/Letter-high/Letter-low_A.txt' | |||||
| self.__graphs, self.__target = loadDataset(ds_file) | |||||
| ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt' | |||||
| self.__graphs, self.__targets = loadDataset(ds_file) | |||||
| elif ds_name == 'Fingerprint': | elif ds_name == 'Fingerprint': | ||||
| ds_file = '../../datasets/Fingerprint/Fingerprint_A.txt' | |||||
| self.__graphs, self.__target = loadDataset(ds_file) | |||||
| ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' | |||||
| self.__graphs, self.__targets = loadDataset(ds_file) | |||||
| elif ds_name == 'SYNTHETIC': | elif ds_name == 'SYNTHETIC': | ||||
| pass | pass | ||||
| elif ds_name == 'SYNTHETICnew': | elif ds_name == 'SYNTHETICnew': | ||||
| ds_file = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' | |||||
| self.__graphs, self.__target = loadDataset(ds_file) | |||||
| ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' | |||||
| self.__graphs, self.__targets = loadDataset(ds_file) | |||||
| elif ds_name == 'Synthie': | elif ds_name == 'Synthie': | ||||
| pass | pass | ||||
| elif ds_name == 'COIL-DEL': | elif ds_name == 'COIL-DEL': | ||||
| ds_file = '../../datasets/COIL-DEL/COIL-DEL_A.txt' | |||||
| self.__graphs, self.__target = loadDataset(ds_file) | |||||
| ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' | |||||
| self.__graphs, self.__targets = loadDataset(ds_file) | |||||
| elif ds_name == 'COIL-RAG': | elif ds_name == 'COIL-RAG': | ||||
| pass | pass | ||||
| elif ds_name == 'COLORS-3': | elif ds_name == 'COLORS-3': | ||||
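The `current_path` prefix makes `load_predefined_dataset` resolve paths relative to `gklearn/utils/dataset.py` instead of the caller's working directory. A quick sketch of the effect (assumes the bundled `datasets/` directory is present in the checkout):

```python
from gklearn.utils import Dataset

ds = Dataset()
ds.load_predefined_dataset('Letter-high')  # works from any working directory now
print(len(ds.graphs), ds.targets[:5])
```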
@@ -514,7 +522,7 @@ class Dataset(object):
 	def __get_class_num(self):
-		return len(set(self.__target))
+		return len(set(self.__targets))

 	def __get_node_attr_dim(self):
@@ -529,6 +537,11 @@ class Dataset(object):
 	def graphs(self):
 		return self.__graphs

+	@property
+	def targets(self):
+		return self.__targets
+
 	@property
 	def node_labels(self):
@@ -547,4 +560,19 @@ class Dataset(object):
 	@property
 	def edge_attrs(self):
 		return self.__edge_attrs
+
+
+def split_dataset_by_target(dataset):
+	from gklearn.preimage.utils import get_same_item_indices
+
+	graphs = dataset.graphs
+	targets = dataset.targets
+	datasets = []
+	idx_targets = get_same_item_indices(targets)
+	for key, val in idx_targets.items():
+		sub_graphs = [graphs[i] for i in val]
+		sub_dataset = Dataset()
+		sub_dataset.load_graphs(sub_graphs, [key] * len(val))
+		datasets.append(sub_dataset)
+	return datasets
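A quick sketch of the new splitter in use (dataset name as in the diff): each sub-dataset holds the graphs of a single target class, which is exactly how `generate_median_preimage_by_class` iterates over classes.

```python
from gklearn.utils import Dataset, split_dataset_by_target

dataset_all = Dataset()
dataset_all.load_predefined_dataset('Letter-high')
for sub in split_dataset_by_target(dataset_all):
    print(sub.targets[0], len(sub.graphs))  # one class per sub-dataset
```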