@@ -1,17 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 19 18:17:38 2020

@author: ljia
"""
from enum import Enum, auto


class AlgorithmState(Enum):
    """Can be used to specify the state of an algorithm.
    """
    CALLED = auto()  # The algorithm has been called.
    INITIALIZED = auto()  # The algorithm has been initialized.
    CONVERGED = auto()  # The algorithm has converged.
    TERMINATED = auto()  # The algorithm has terminated.
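# A minimal usage sketch (not part of the original file): the enum is meant to
# be stored on an algorithm object and compared against as a run progresses.
if __name__ == '__main__':
    state = AlgorithmState.CALLED
    state = AlgorithmState.INITIALIZED
    assert state != AlgorithmState.CONVERGED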
@@ -1,134 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 20 11:09:04 2020

@author: ljia
"""
import re


def convert_function(cpp_code):
#    f_cpp = open('cpp_code.cpp', 'r')
##    f_cpp = open('cpp_ext/src/median_graph_estimator.ipp', 'r')
#    cpp_code = f_cpp.read()
    python_code = cpp_code.replace('else if (', 'elif ')
    python_code = python_code.replace('if (', 'if ')
    python_code = python_code.replace('else {', 'else:')
    python_code = python_code.replace(') {', ':')
    python_code = python_code.replace(';\n', '\n')
    python_code = re.sub('\n(.*)}\n', '\n\n', python_code)
#    python_code = python_code.replace('}\n', '')
    python_code = python_code.replace('throw', 'raise')
    python_code = python_code.replace('error', 'Exception')
    python_code = python_code.replace('"', '\'')
    python_code = python_code.replace('\\\'', '"')
    python_code = python_code.replace('try {', 'try:')
    python_code = python_code.replace('true', 'True')
    python_code = python_code.replace('false', 'False')
    python_code = python_code.replace('catch (...', 'except')
#    python_code = re.sub('std::string\(\'(.*)\'\)', '$1', python_code)

    return python_code

#    # python_code = python_code.replace('}\n', '')
#    python_code = python_code.replace('option.first', 'opt_name')
#    python_code = python_code.replace('option.second', 'opt_val')
#    python_code = python_code.replace('ged::Error', 'Exception')
#    python_code = python_code.replace('std::string(\'Invalid argument "\')', '\'Invalid argument "\'')
#    f_cpp.close()
#    f_python = open('python_code.py', 'w')
#    f_python.write(python_code)
#    f_python.close()
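# A quick, self-contained check (not in the original file) of what
# convert_function produces on a tiny C++ fragment.
def _demo_convert_function():
    cpp = 'if (a == b) {\n\tthrow error("mismatch");\n}\n'
    assert convert_function(cpp) == "if a == b:\n\traise Exception('mismatch')\n\n"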
def convert_function_comment(cpp_fun_cmt, param_types):
    cpp_fun_cmt = cpp_fun_cmt.replace('\t', '')
    cpp_fun_cmt = cpp_fun_cmt.replace('\n * ', ' ')

    # split the input comment according to key words.
    param_split = None
    note = None
    # wrap the brief in a list so that cmt_split[-1] is always a string, even
    # when no '@param' is present (indexing the bare string would only yield
    # its last character and the '@note' check below would never match).
    cmt_split = [cpp_fun_cmt.split('@brief')[1]]
    brief = cmt_split[0]
    if '@param' in cmt_split[0]:
        cmt_split = cmt_split[0].split('@param')
        brief = cmt_split[0]
        param_split = cmt_split[1:]
    if '@note' in cmt_split[-1]:
        note_split = cmt_split[-1].split('@note')
        if param_split is not None:
            param_split.pop()
            param_split.append(note_split[0])
        else:
            brief = note_split[0]
        note = note_split[1]

    # get parameters.
    if param_split is not None:
        for idx, param in enumerate(param_split):
            _, param_name, param_desc = param.split(' ', 2)
            param_name = function_comment_strip(param_name, ' *\n\t/')
            param_desc = function_comment_strip(param_desc, ' *\n\t/')
            param_split[idx] = (param_name, param_desc)

    # strip comments.
    brief = function_comment_strip(brief, ' *\n\t/')
    if note is not None:
        note = function_comment_strip(note, ' *\n\t/')

    # construct the Python function comment.
    python_fun_cmt = '"""'
    python_fun_cmt += brief + '\n'
    if param_split is not None and len(param_split) > 0:
        python_fun_cmt += '\nParameters\n----------'
        for idx, param in enumerate(param_split):
            python_fun_cmt += '\n' + param[0] + ' : ' + param_types[idx]
            python_fun_cmt += '\n\t' + param[1] + '\n'
    if note is not None:
        python_fun_cmt += '\nNote\n----\n' + note + '\n'
    python_fun_cmt += '"""'

    return python_fun_cmt


def function_comment_strip(comment, bad_chars):
    head_removed, tail_removed = False, False
    while not head_removed or not tail_removed:
        if comment[0] in bad_chars:
            comment = comment[1:]
            head_removed = False
        else:
            head_removed = True
        if comment[-1] in bad_chars:
            comment = comment[:-1]
            tail_removed = False
        else:
            tail_removed = True

    return comment
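# A small check (not in the original file): bad_chars are trimmed from both
# ends of the comment, one character at a time, but preserved in the middle.
def _demo_function_comment_strip():
    assert function_comment_strip(' * The sum of distances. */', ' *\n\t/') \
        == 'The sum of distances.'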
if __name__ == '__main__':
#    python_code = convert_function("""
#        if (print_to_stdout_ == 2) {
#            std::cout << "\n===========================================================\n";
#            std::cout << "Block gradient descent for initial median " << median_pos + 1 << " of " << medians.size() << ".\n";
#            std::cout << "-----------------------------------------------------------\n";
#        }
#        """)

    python_fun_cmt = convert_function_comment("""
    /*!
     * @brief Returns the sum of distances.
     * @param[in] state The state of the estimator.
     * @return The sum of distances of the median when the estimator was in the state @p state during the last call to run().
     */
    """, ['string', 'string'])
@@ -1,170 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan  9 11:54:32 2020

@author: ljia
"""
import numpy as np
import random
import csv

from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs


def find_best_k():
    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
#    Gn = Gn[0:50]
    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    ds_name = 'mono'
    dir_output = 'results/test_find_best_k/'
    repeats = 50
    k_list = range(2, 11)
    fit_method = 'k-graphs'
    # fitted on the whole dataset - treelet - mono
    edit_costs = [0.1268873773592978, 0.004084633224249829, 0.0897581955378986, 0.15328856114451297, 0.3109956881625734, 0.0]

    # create result files.
    fn_output_detail = 'results_detail.' + fit_method + '.csv'
    f_detail = open(dir_output + fn_output_detail, 'a')
    csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
              'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
              'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
              'dis_k gi -> GM'])
    f_detail.close()
    fn_output_summary = 'results_summary.csv'
    f_summary = open(dir_output + fn_output_summary, 'a')
    csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
              'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
              'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
              'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
              '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
              'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
              'repeats better dis_k gi -> GM'])
    f_summary.close()

    random.seed(1)
    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)

    for k in k_list:
        print('\n--------- k =', k, '----------')

        sod_sm_list = []
        sod_gm_list = []
        dis_k_sm_list = []
        dis_k_gm_list = []
        dis_k_gi_min_list = []
        nb_sod_sm2gm = [0, 0, 0]
        nb_dis_k_sm2gm = [0, 0, 0]
        nb_dis_k_gi2sm = [0, 0, 0]
        nb_dis_k_gi2gm = [0, 0, 0]
        repeats_better_sod_sm2gm = []
        repeats_better_dis_k_sm2gm = []
        repeats_better_dis_k_gi2sm = []
        repeats_better_dis_k_gi2gm = []

        for repeat in range(repeats):
            print('\nrepeat =', repeat)
            random.seed(rdn_seed_list[repeat])
            median_set_idx = random.sample(range(0, len(Gn)), k)
            print('median set: ', median_set_idx)

            sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
                = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
                                             fit_method='k-graphs',
                                             edit_costs=edit_costs,
                                             group_min=median_set_idx,
                                             parallel=False)

            # write result detail.
            sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
            dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
            dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
            dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
            f_detail = open(dir_output + fn_output_detail, 'a')
            csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, repeat,
                      median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm,
                      dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
                      dis_k_gi2gm])
            f_detail.close()

            # compute result summary.
            sod_sm_list.append(sod_sm)
            sod_gm_list.append(sod_gm)
            dis_k_sm_list.append(dis_k_sm)
            dis_k_gm_list.append(dis_k_gm)
            dis_k_gi_min_list.append(dis_k_gi_min)
            # SOD SM -> GM
            if sod_sm > sod_gm:
                nb_sod_sm2gm[0] += 1
                repeats_better_sod_sm2gm.append(repeat)
            elif sod_sm == sod_gm:
                nb_sod_sm2gm[1] += 1
            elif sod_sm < sod_gm:
                nb_sod_sm2gm[2] += 1
            # dis_k SM -> GM
            if dis_k_sm > dis_k_gm:
                nb_dis_k_sm2gm[0] += 1
                repeats_better_dis_k_sm2gm.append(repeat)
            elif dis_k_sm == dis_k_gm:
                nb_dis_k_sm2gm[1] += 1
            elif dis_k_sm < dis_k_gm:
                nb_dis_k_sm2gm[2] += 1
            # dis_k gi -> SM
            if dis_k_gi_min > dis_k_sm:
                nb_dis_k_gi2sm[0] += 1
                repeats_better_dis_k_gi2sm.append(repeat)
            elif dis_k_gi_min == dis_k_sm:
                nb_dis_k_gi2sm[1] += 1
            elif dis_k_gi_min < dis_k_sm:
                nb_dis_k_gi2sm[2] += 1
            # dis_k gi -> GM
            if dis_k_gi_min > dis_k_gm:
                nb_dis_k_gi2gm[0] += 1
                repeats_better_dis_k_gi2gm.append(repeat)
            elif dis_k_gi_min == dis_k_gm:
                nb_dis_k_gi2gm[1] += 1
            elif dis_k_gi_min < dis_k_gm:
                nb_dis_k_gi2gm[2] += 1

        # write result summary.
        sod_sm_mean = np.mean(sod_sm_list)
        sod_gm_mean = np.mean(sod_gm_list)
        dis_k_sm_mean = np.mean(dis_k_sm_list)
        dis_k_gm_mean = np.mean(dis_k_gm_list)
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        f_summary = open(dir_output + fn_output_summary, 'a')
        csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k,
                  sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
                  dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
                  dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
                  nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
                  repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
                  repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
        f_summary.close()

    print('\ncomplete.')
    return


def getRelations(sign):
    if sign == -1:
        return 'better'
    elif sign == 0:
        return 'same'
    elif sign == 1:
        return 'worse'
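# A tiny illustration (not in the original file) of the sign convention:
# np.sign(new - old) is -1 when the new value is smaller, so the change is
# rated 'better' for distance-like quantities such as the SOD.
def _demo_get_relations():
    assert getRelations(np.sign(8.0 - 10.0)) == 'better'
    assert getRelations(np.sign(10.0 - 10.0)) == 'same'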
if __name__ == '__main__':
    find_best_k()
@@ -1,430 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 16 14:20:06 2019

@author: ljia
"""
import numpy as np
from tqdm import tqdm
from itertools import combinations_with_replacement, combinations
import multiprocessing
from multiprocessing import Pool
from functools import partial
import time
import random
import sys

from scipy import optimize
from scipy.optimize import minimize
import cvxpy as cp

from gklearn.preimage.ged import GED, get_nb_edit_operations, get_nb_edit_operations_letter, get_nb_edit_operations_nonsymbolic
from gklearn.preimage.utils import kernel_distance_matrix


def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
                               params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
                                           'method': 'IPFP', 'stabilizer': None},
                               init_costs=[3, 3, 1, 3, 3, 1],
                               dataset='monoterpenoides', Kmatrix=None,
                               parallel=True):
#    dataset = dataset.lower()

    # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
#    random.seed(1)
#    cost_rdm = random.sample(range(1, 10), 6)
#    init_costs = cost_rdm + [0]
#    init_costs = cost_rdm
#    init_costs = [3, 3, 1, 3, 3, 1]
#    init_costs = [i * 0.01 for i in cost_rdm] + [0]
#    init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
#    init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
#    init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
#    idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]

    # compute distances in feature space.
    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
                                                Kmatrix=Kmatrix, gkernel=gkernel)
    dis_k_vec = []
    for i in range(len(dis_k_mat)):
#        for j in range(i, len(dis_k_mat)):
        for j in range(i + 1, len(dis_k_mat)):
            dis_k_vec.append(dis_k_mat[i, j])
    dis_k_vec = np.array(dis_k_vec)

    # init ged.
    print('\ninitial:')
    time0 = time.time()
    params_ged['dataset'] = dataset
    params_ged['edit_cost_constant'] = init_costs
    ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
                                                            parallel=parallel)
    residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
    time_list = [time.time() - time0]
    edit_cost_list = [init_costs]
    nb_cost_mat = np.array(n_edit_operations)
    nb_cost_mat_list = [nb_cost_mat]
    print('edit_costs:', init_costs)
    print('residual_list:', residual_list)

    for itr in range(itr_max):
        print('\niteration', itr)
        time0 = time.time()
        # "fit" geds to distances in feature space by tuning edit costs using
        # the Least Squares Method.
        np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm',
                 nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
                 n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
                 ged_mat=ged_mat)
        edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec,
                                                dataset=dataset, cost=params_ged['cost'])
        for i in range(len(edit_costs_new)):
            if -1e-9 <= edit_costs_new[i] <= 1e-9:
                edit_costs_new[i] = 0
            if edit_costs_new[i] < 0:
                raise ValueError('The edit cost is negative.')
#        for i in range(len(edit_costs_new)):
#            if edit_costs_new[i] < 0:
#                edit_costs_new[i] = 0

        # compute new GEDs and numbers of edit operations.
        params_ged['edit_cost_constant'] = edit_costs_new  # np.array([edit_costs_new[0], edit_costs_new[1], 0.75])
        ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
                                                           parallel=parallel)
        residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
        time_list.append(time.time() - time0)
        edit_cost_list.append(edit_costs_new)
        nb_cost_mat = np.array(n_edit_operations)
        nb_cost_mat_list.append(nb_cost_mat)
        print('edit_costs:', edit_costs_new)
        print('residual_list:', residual_list)

    return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
        time_list, nb_cost_mat_list


def compute_geds(Gn, params_ged, parallel=False):
    edit_cost_name = params_ged['cost']
    if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2':
        get_nb_eo = get_nb_edit_operations_letter
    elif edit_cost_name == 'NON_SYMBOLIC':
        get_nb_eo = get_nb_edit_operations_nonsymbolic
    else:
        get_nb_eo = get_nb_edit_operations
    ged_mat = np.zeros((len(Gn), len(Gn)))
    if parallel:
#        print('parallel')
#        len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
        len_itr = int(len(Gn) * (len(Gn) - 1) / 2)
        ged_vec = [0 for i in range(len_itr)]
        n_edit_operations = [0 for i in range(len_itr)]
#        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        itr = combinations(range(0, len(Gn)), 2)
        n_jobs = multiprocessing.cpu_count()
        if len_itr < 100 * n_jobs:
            chunksize = int(len_itr / n_jobs) + 1
        else:
            chunksize = 100
        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare
        do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo)
        pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
        iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
                        desc='computing GEDs', file=sys.stdout)
#        iterator = pool.imap_unordered(do_partial, itr, chunksize)
        for i, j, dis, n_eo_tmp in iterator:
            idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2)
            ged_vec[idx_itr] = dis
            ged_mat[i][j] = dis
            ged_mat[j][i] = dis
            n_edit_operations[idx_itr] = n_eo_tmp
#            print('\n-------------------------------------------')
#            print(i, j, idx_itr, dis)
        pool.close()
        pool.join()
    else:
        ged_vec = []
        n_edit_operations = []
        for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
#        for i in range(len(Gn)):
            for j in range(i + 1, len(Gn)):
                dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged)
                ged_vec.append(dis)
                ged_mat[i][j] = dis
                ged_mat[j][i] = dis
                n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward)
                n_edit_operations.append(n_eo_tmp)

    return ged_vec, ged_mat, n_edit_operations


def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr):
    i = itr[0]
    j = itr[1]
    dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged, get_nb_eo)
    return i, j, dis, n_eo_tmp


def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo):
    dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
    n_eo_tmp = get_nb_eo(g1, g2, pi_forward, pi_backward)  # [0,0,0,0,0,0]
    return dis, n_eo_tmp
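# A sanity sketch (not in the original file) for the idx_itr formula used in
# compute_geds: it maps each pair (i, j) with i < j to its position in
# combinations(range(n), 2), i.e., the flattened strict upper triangle.
def _demo_triangle_index(n=5):
    for idx, (i, j) in enumerate(combinations(range(n), 2)):
        assert idx == int(n * i + j - (i + 1) * (i + 2) / 2)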
def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides',
                 cost='CONSTANT', rw_constraints='inequality'):
#    if dataset == 'Letter-high':
    if cost == 'LETTER':
        pass
#        # method 1: set alpha automatically, just tune c_vir and c_eir by
#        # LMS using cvxpy.
#        alpha = 0.5
#        coeff = 100  # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec)
##        if np.count_nonzero(nb_cost_mat[:,4]) == 0:
##            alpha = 0.75
##        else:
##            alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0])
##            alpha = alpha * 0.99
#        param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1])
#        param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5])
#        nb_cost_mat_new = np.column_stack((param_vir, param_eir))
#        dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3]
#
#        x = cp.Variable(nb_cost_mat_new.shape[1])
#        cost = cp.sum_squares(nb_cost_mat_new * x - dis_new)
#        constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
#        prob = cp.Problem(cp.Minimize(cost), constraints)
#        prob.solve()
#        edit_costs_new = x.value
#        edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha])
#        residual = np.sqrt(prob.value)

#        # method 2: tune c_vir, c_eir and alpha by nonlinear programming by
#        # scipy.optimize.minimize.
#        w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
#        w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
#        w2 = nb_cost_mat[:,3]
#        w3 = dis_k_vec
#        func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
#                                     + w2 * x[2] - w3 * x[3]) ** 2)
#        bounds = ((0, None), (0., None), (0.5, 0.5), (0, None))
#        res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds)
#        edit_costs_new = res.x[0:3]
#        residual = res.fun

        # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy.

#        # method 4: tune c_vir, c_eir and alpha by QP function
#        # scipy.optimize.least_squares. An initial guess is required.
#        w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
#        w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
#        w2 = nb_cost_mat[:,3]
#        w3 = dis_k_vec
#        func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
#                          + w2 * x[2] - w3 * x[3]) ** 2
#        res = optimize.root(func, [0.9, 1.7, 0.75, 100])
#        edit_costs_new = res.x
#        residual = None
    elif cost == 'LETTER2':
#        # 1. if c_vi != c_vr, c_ei != c_er.
#        nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
#        x = cp.Variable(nb_cost_mat_new.shape[1])
#        cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
##        # 1.1 no constraints.
##        constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
#        # 1.2 c_vs <= c_vi + c_vr.
#        constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
#                       np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
##        # 2. if c_vi == c_vr, c_ei == c_er.
##        nb_cost_mat_new = nb_cost_mat[:,[0,3,4]]
##        nb_cost_mat_new[:,0] += nb_cost_mat[:,1]
##        nb_cost_mat_new[:,2] += nb_cost_mat[:,5]
##        x = cp.Variable(nb_cost_mat_new.shape[1])
##        cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
##        # 2.1 no constraints.
##        constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
###        # 2.2 c_vs <= c_vi + c_vr.
###        constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
###                       np.array([2.0, -1.0, 0.0]).T@x >= 0.0]
#
#        prob = cp.Problem(cp.Minimize(cost_fun), constraints)
#        prob.solve()
#        edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
#        edit_costs_new = np.array(edit_costs_new)
#        residual = np.sqrt(prob.value)
        if rw_constraints == 'inequality':
            # c_vs <= c_vi + c_vr.
            nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4, 5]]
            x = cp.Variable(nb_cost_mat_new.shape[1])
            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
            constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
                           np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            try:
                prob.solve(verbose=True)
            except MemoryError as error0:
                print('\nUsing solver "OSQP" caused a memory error.')
                print('the original error message is\n', error0)
                print('solver status: ', prob.status)
                print('trying solver "CVXOPT" instead...\n')
                try:
                    prob.solve(solver=cp.CVXOPT, verbose=True)
                except Exception as error1:
                    print('\nAn error occurred when using solver "CVXOPT".')
                    print('the original error message is\n', error1)
                    print('solver status: ', prob.status)
                    print('trying solver "MOSEK" instead. Notice this solver is commercial and a licence is required.\n')
                    prob.solve(solver=cp.MOSEK, verbose=True)
                else:
                    print('solver status: ', prob.status)
            else:
                print('solver status: ', prob.status)
            print()
            edit_costs_new = x.value
            residual = np.sqrt(prob.value)
        elif rw_constraints == '2constraints':
            # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
            nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4, 5]]
            x = cp.Variable(nb_cost_mat_new.shape[1])
            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
            constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                           np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
                           np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
                           np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            prob.solve()
            edit_costs_new = x.value
            residual = np.sqrt(prob.value)
        elif rw_constraints == 'no-constraint':
            # no constraint.
            nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4, 5]]
            x = cp.Variable(nb_cost_mat_new.shape[1])
            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
            constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            prob.solve()
            edit_costs_new = x.value
            residual = np.sqrt(prob.value)
#        elif method == 'inequality_modified':
#            # c_vs <= c_vi + c_vr.
#            nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
#            x = cp.Variable(nb_cost_mat_new.shape[1])
#            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
#            constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
#                           np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
#            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
#            prob.solve()
#            # use same costs for insertion and removal rather than the fitted costs.
#            edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
#            edit_costs_new = np.array(edit_costs_new)
#            residual = np.sqrt(prob.value)
    elif cost == 'NON_SYMBOLIC':
        is_n_attr = np.count_nonzero(nb_cost_mat[:, 2])
        is_e_attr = np.count_nonzero(nb_cost_mat[:, 5])

        if dataset == 'SYNTHETICnew':
#            nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
            nb_cost_mat_new = nb_cost_mat[:, [2, 3, 4]]
            x = cp.Variable(nb_cost_mat_new.shape[1])
            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
#            constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
#                           np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
#            constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]]
            constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])],
                           np.array([0.0, 1.0, -1.0]).T@x == 0.0]
            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            prob.solve()
#            print(x.value)
            edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value,
                                             np.array([0.0])))
            residual = np.sqrt(prob.value)
        elif rw_constraints == 'inequality':
            # c_vs <= c_vi + c_vr.
            if is_n_attr and is_e_attr:
                nb_cost_mat_new = nb_cost_mat[:, [0, 1, 2, 3, 4, 5]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                               np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
                               np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                prob.solve()
                edit_costs_new = x.value
                residual = np.sqrt(prob.value)
            elif is_n_attr and not is_e_attr:
                nb_cost_mat_new = nb_cost_mat[:, [0, 1, 2, 3, 4]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
                               np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                prob.solve()
                print(x.value)
                edit_costs_new = np.concatenate((x.value, np.array([0.0])))
                residual = np.sqrt(prob.value)
            elif not is_n_attr and is_e_attr:
                nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4, 5]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                               np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                prob.solve()
                edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
                residual = np.sqrt(prob.value)
            else:
                nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                prob.solve()
                edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
                                                 x.value[2:], np.array([0.0])))
                residual = np.sqrt(prob.value)
    else:
#        # method 1: simple least square method.
#        edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
#                                                         rcond=None)

#        # method 2: least square method with x_i >= 0.
#        edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)

        # method 3: solve as a quadratic program with constraints.
#        P = np.dot(nb_cost_mat.T, nb_cost_mat)
#        q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
#        G = -1 * np.identity(nb_cost_mat.shape[1])
#        h = np.array([0 for i in range(nb_cost_mat.shape[1])])
#        A = np.array([1 for i in range(nb_cost_mat.shape[1])])
#        b = 1
#        x = cp.Variable(nb_cost_mat.shape[1])
#        prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
#                          [G@x <= h])
#        prob.solve()
#        edit_costs_new = x.value
#        residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)

#        G = -1 * np.identity(nb_cost_mat.shape[1])
#        h = np.array([0 for i in range(nb_cost_mat.shape[1])])
        x = cp.Variable(nb_cost_mat.shape[1])
        cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
        constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
#                       np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
                       np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
                       np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
        prob = cp.Problem(cp.Minimize(cost_fun), constraints)
        prob.solve()
        edit_costs_new = x.value
        residual = np.sqrt(prob.value)

        # method 4:

    return edit_costs_new, residual
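# A toy example (not in the original file) of the core idea behind
# update_costs: a non-negative least-squares fit of edit costs x so that
# nb_cost_mat @ x approximates the kernel distances. `@` replaces the older
# cvxpy `*` syntax used above; the numbers are made up for illustration.
def _demo_update_costs_lsq():
    nb_cost_mat = np.array([[1.0, 0.0], [0.0, 2.0], [1.0, 1.0]])
    dis_k_vec = np.array([2.0, 2.0, 3.0])
    x = cp.Variable(2)
    prob = cp.Problem(cp.Minimize(cp.sum_squares(nb_cost_mat @ x - dis_k_vec)),
                      [x >= 0.0])
    prob.solve()
    return x.value  # approximately [2.0, 1.0]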
if __name__ == '__main__':
    print('check test_fitDistance.py')
@@ -1,467 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 17 18:44:59 2019

@author: ljia
"""
import numpy as np
import networkx as nx
from tqdm import tqdm
import sys
import multiprocessing
from multiprocessing import Pool
from functools import partial

#from gedlibpy_linlin import librariesImport, gedlibpy
from gklearn.gedlib import librariesImport, gedlibpy


def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP',
        edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50):
    """
    Compute GED for 2 graphs.
    """
#    dataset = dataset.lower()
    if lib == 'gedlibpy':
        gedlibpy.restart_env()
        gedlibpy.add_nx_graph(convertGraph(g1, cost), "")
        gedlibpy.add_nx_graph(convertGraph(g2, cost), "")
        listID = gedlibpy.get_all_graph_ids()
        gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
        gedlibpy.init()
        gedlibpy.set_method(method, algo_options)
        gedlibpy.init_method()

        g = listID[0]
        h = listID[1]
        if stabilizer is None:
            gedlibpy.run_method(g, h)
            pi_forward = gedlibpy.get_forward_map(g, h)
            pi_backward = gedlibpy.get_backward_map(g, h)
            upper = gedlibpy.get_upper_bound(g, h)
            lower = gedlibpy.get_lower_bound(g, h)
        elif stabilizer == 'mean':
            # @todo: to be finished...
            upper_list = [np.inf] * repeat
            for itr in range(repeat):
                gedlibpy.run_method(g, h)
                upper_list[itr] = gedlibpy.get_upper_bound(g, h)
                pi_forward = gedlibpy.get_forward_map(g, h)
                pi_backward = gedlibpy.get_backward_map(g, h)
                lower = gedlibpy.get_lower_bound(g, h)
            upper = np.mean(upper_list)
        elif stabilizer == 'median':
            if repeat % 2 == 0:
                repeat += 1
            upper_list = [np.inf] * repeat
            pi_forward_list = [0] * repeat
            pi_backward_list = [0] * repeat
            for itr in range(repeat):
                gedlibpy.run_method(g, h)
                upper_list[itr] = gedlibpy.get_upper_bound(g, h)
                pi_forward_list[itr] = gedlibpy.get_forward_map(g, h)
                pi_backward_list[itr] = gedlibpy.get_backward_map(g, h)
                lower = gedlibpy.get_lower_bound(g, h)
            upper = np.median(upper_list)
            idx_median = upper_list.index(upper)
            pi_forward = pi_forward_list[idx_median]
            pi_backward = pi_backward_list[idx_median]
        elif stabilizer == 'min':
            upper = np.inf
            for itr in range(repeat):
                gedlibpy.run_method(g, h)
                upper_tmp = gedlibpy.get_upper_bound(g, h)
                if upper_tmp < upper:
                    upper = upper_tmp
                    pi_forward = gedlibpy.get_forward_map(g, h)
                    pi_backward = gedlibpy.get_backward_map(g, h)
                    lower = gedlibpy.get_lower_bound(g, h)
                if upper == 0:
                    break
        elif stabilizer == 'max':
            upper = 0
            for itr in range(repeat):
                gedlibpy.run_method(g, h)
                upper_tmp = gedlibpy.get_upper_bound(g, h)
                if upper_tmp > upper:
                    upper = upper_tmp
                    pi_forward = gedlibpy.get_forward_map(g, h)
                    pi_backward = gedlibpy.get_backward_map(g, h)
                    lower = gedlibpy.get_lower_bound(g, h)
        elif stabilizer == 'gaussian':
            # @todo: to be finished...
            pass

        dis = upper
    elif lib == 'gedlib-bash':
        import time
        import random
        import os
        from gklearn.utils.graphfiles import saveDataset

        tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/'
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
        fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, int(1e9)))
        xparams = {'method': 'gedlib', 'graph_dir': fn_collection}
        saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml',
                    filename=fn_collection, xparams=xparams)

        command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
        command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
        command += 'export LD_LIBRARY_PATH\n'
        command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n'
        command += './ged_for_python_bash monoterpenoides ' + fn_collection \
            + ' \'' + algo_options + '\' '
        for ec in edit_cost_constant:
            command += str(ec) + ' '
#        output = os.system(command)
        stream = os.popen(command)
        output = stream.readlines()
#        print(output)

        dis = float(output[0].strip())
        runtime = float(output[1].strip())
        size_forward = int(output[2].strip())
        pi_forward = [int(item.strip()) for item in output[3:3+size_forward]]
        pi_backward = [int(item.strip()) for item in output[3+size_forward:]]
#        print(dis)
#        print(runtime)
#        print(size_forward)
#        print(pi_forward)
#        print(pi_backward)

    # make the map label correct (label remove map as np.inf)
    nodes1 = [n for n in g1.nodes()]
    nodes2 = [n for n in g2.nodes()]
    nb1 = nx.number_of_nodes(g1)
    nb2 = nx.number_of_nodes(g2)
    pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
    pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
#    print(pi_forward)

    return dis, pi_forward, pi_backward


def convertGraph(G, cost):
    """Convert a graph to the proper NetworkX format that can be
    recognized by library gedlibpy.
    """
    G_new = nx.Graph()
    if cost == 'LETTER' or cost == 'LETTER2':
        for nd, attrs in G.nodes(data=True):
            G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
                           y=str(attrs['attributes'][1]))
        for nd1, nd2, attrs in G.edges(data=True):
            G_new.add_edge(str(nd1), str(nd2))
    elif cost == 'NON_SYMBOLIC':
        for nd, attrs in G.nodes(data=True):
            G_new.add_node(str(nd))
            for a_name in G.graph['node_attrs']:
                G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
        for nd1, nd2, attrs in G.edges(data=True):
            G_new.add_edge(str(nd1), str(nd2))
            for a_name in G.graph['edge_attrs']:
                G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
    else:
        for nd, attrs in G.nodes(data=True):
            G_new.add_node(str(nd), chem=attrs['atom'])
        for nd1, nd2, attrs in G.edges(data=True):
            G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
#            G_new.add_edge(str(nd1), str(nd2))

    return G_new
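# A quick illustration (not in the original file): convertGraph stringifies
# node ids and maps chemical labels to the attribute names gedlibpy expects
# ('atom' -> 'chem', 'bond_type' -> 'valence') under the default cost models.
def _demo_convert_graph():
    g = nx.Graph()
    g.add_node(0, atom='C')
    g.add_node(1, atom='O')
    g.add_edge(0, 1, bond_type='1')
    g_new = convertGraph(g, 'CHEM_1')
    assert g_new.nodes['0']['chem'] == 'C'
    assert g_new.edges['0', '1']['valence'] == '1'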
def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',
          edit_cost_constant=[], stabilizer='min', repeat=50):
    """
    Compute GEDs for a group of graphs.
    """
    # @todo: this function seems unfinished: g1 and g2 below are undefined,
    # and only two graphs would be compared.
    if lib == 'gedlibpy':
        def convertGraph(G):
            """Convert a graph to the proper NetworkX format that can be
            recognized by library gedlibpy.
            """
            G_new = nx.Graph()
            for nd, attrs in G.nodes(data=True):
                G_new.add_node(str(nd), chem=attrs['atom'])
            for nd1, nd2, attrs in G.edges(data=True):
#                G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
                G_new.add_edge(str(nd1), str(nd2))

            return G_new

        gedlibpy.restart_env()
        gedlibpy.add_nx_graph(convertGraph(g1), "")
        gedlibpy.add_nx_graph(convertGraph(g2), "")
        listID = gedlibpy.get_all_graph_ids()
        gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
        gedlibpy.init()
        gedlibpy.set_method(method, "")
        gedlibpy.init_method()

        g = listID[0]
        h = listID[1]
        if stabilizer is None:
            gedlibpy.run_method(g, h)
            pi_forward = gedlibpy.get_forward_map(g, h)
            pi_backward = gedlibpy.get_backward_map(g, h)
            upper = gedlibpy.get_upper_bound(g, h)
            lower = gedlibpy.get_lower_bound(g, h)
        elif stabilizer == 'min':
            upper = np.inf
            for itr in range(repeat):
                gedlibpy.run_method(g, h)
                upper_tmp = gedlibpy.get_upper_bound(g, h)
                if upper_tmp < upper:
                    upper = upper_tmp
                    pi_forward = gedlibpy.get_forward_map(g, h)
                    pi_backward = gedlibpy.get_backward_map(g, h)
                    lower = gedlibpy.get_lower_bound(g, h)
                if upper == 0:
                    break

        dis = upper

        # make the map label correct (label remove map as np.inf)
        nodes1 = [n for n in g1.nodes()]
        nodes2 = [n for n in g2.nodes()]
        nb1 = nx.number_of_nodes(g1)
        nb2 = nx.number_of_nodes(g2)
        pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
        pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]

    return dis, pi_forward, pi_backward


def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy',
               'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [],
               'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1',
               'stabilizer': None}, parallel=False):
    if parallel:
        len_itr = int(len(Gn))
        pi_forward_list = [[] for i in range(len_itr)]
        dis_list = [0 for i in range(len_itr)]
        itr = range(0, len_itr)
        n_jobs = multiprocessing.cpu_count()
        if len_itr < 100 * n_jobs:
            chunksize = int(len_itr / n_jobs) + 1
        else:
            chunksize = 100
        def init_worker(gn_toshare, gn_median_toshare):
            global G_gn, G_gn_median
            G_gn = gn_toshare
            G_gn_median = gn_median_toshare
        do_partial = partial(_compute_ged_median, params_ged)
        pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median))
        if verbose:
            iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
                            desc='computing GEDs', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(do_partial, itr, chunksize)
        for i, dis_sum, pi_forward in iterator:
            pi_forward_list[i] = pi_forward
            dis_list[i] = dis_sum
#            print('\n-------------------------------------------')
#            print(i, j, idx_itr, dis)
        pool.close()
        pool.join()
    else:
        dis_list = []
        pi_forward_list = []
        for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
                           file=sys.stdout) if verbose else enumerate(Gn):
            dis_sum = 0
            pi_forward_list.append([])
            for G_p in Gn_median:
                dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
                                                               **params_ged)
                pi_forward_list[idx].append(pi_tmp_forward)
                dis_sum += dis_tmp
            dis_list.append(dis_sum)

    return dis_list, pi_forward_list


def _compute_ged_median(params_ged, itr):
#    print(itr)
    dis_sum = 0
    pi_forward = []
    for G_p in G_gn_median:
        dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p,
                                                       **params_ged)
        pi_forward.append(pi_tmp_forward)
        dis_sum += dis_tmp

    return itr, dis_sum, pi_forward


def get_nb_edit_operations(g1, g2, forward_map, backward_map):
    """Compute the number of each kind of edit operation.
    """
    n_vi = 0
    n_vr = 0
    n_vs = 0
    n_ei = 0
    n_er = 0
    n_es = 0

    nodes1 = [n for n in g1.nodes()]
    for i, map_i in enumerate(forward_map):
        if map_i == np.inf:
            n_vr += 1
        elif g1.nodes[nodes1[i]]['atom'] != g2.nodes[map_i]['atom']:
            n_vs += 1
    for map_i in backward_map:
        if map_i == np.inf:
            n_vi += 1

#    idx_nodes1 = range(0, len(node1))

    edges1 = [e for e in g1.edges()]
    nb_edges2_cnted = 0
    for n1, n2 in edges1:
        idx1 = nodes1.index(n1)
        idx2 = nodes1.index(n2)
        # one of the nodes is removed, thus the edge is removed.
        if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf:
            n_er += 1
        # corresponding edge is in g2.
        elif (forward_map[idx1], forward_map[idx2]) in g2.edges():
            nb_edges2_cnted += 1
            # edge labels are different.
            if g2.edges[forward_map[idx1], forward_map[idx2]]['bond_type'] \
                    != g1.edges[n1, n2]['bond_type']:
                n_es += 1
        elif (forward_map[idx2], forward_map[idx1]) in g2.edges():
            nb_edges2_cnted += 1
            # edge labels are different.
            if g2.edges[forward_map[idx2], forward_map[idx1]]['bond_type'] \
                    != g1.edges[n1, n2]['bond_type']:
                n_es += 1
        # corresponding nodes are in g2, however the edge is removed.
        else:
            n_er += 1
    n_ei = nx.number_of_edges(g2) - nb_edges2_cnted

    return n_vi, n_vr, n_vs, n_ei, n_er, n_es
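# A consistency note (not in the original file): with constant edit costs
# c = (c_vi, c_vr, c_vs, c_ei, c_er, c_es), the cost of the edit path induced
# by a node map is the dot product of c with the counts returned above; the
# cost-fitting code relies on exactly this linearity, e.g.
#
#     n_eo = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
#     ged_induced = np.dot(edit_cost_constant, n_eo)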
def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map):
    """Compute the number of each kind of edit operation for the LETTER costs.
    """
    n_vi = 0
    n_vr = 0
    n_vs = 0
    sod_vs = 0
    n_ei = 0
    n_er = 0

    nodes1 = [n for n in g1.nodes()]
    for i, map_i in enumerate(forward_map):
        if map_i == np.inf:
            n_vr += 1
        else:
            n_vs += 1
            diff_x = float(g1.nodes[nodes1[i]]['x']) - float(g2.nodes[map_i]['x'])
            diff_y = float(g1.nodes[nodes1[i]]['y']) - float(g2.nodes[map_i]['y'])
            sod_vs += np.sqrt(np.square(diff_x) + np.square(diff_y))
    for map_i in backward_map:
        if map_i == np.inf:
            n_vi += 1

#    idx_nodes1 = range(0, len(node1))

    edges1 = [e for e in g1.edges()]
    nb_edges2_cnted = 0
    for n1, n2 in edges1:
        idx1 = nodes1.index(n1)
        idx2 = nodes1.index(n2)
        # one of the nodes is removed, thus the edge is removed.
        if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf:
            n_er += 1
        # corresponding edge is in g2. Edge label is not considered.
        elif (forward_map[idx1], forward_map[idx2]) in g2.edges() or \
                (forward_map[idx2], forward_map[idx1]) in g2.edges():
            nb_edges2_cnted += 1
        # corresponding nodes are in g2, however the edge is removed.
        else:
            n_er += 1
    n_ei = nx.number_of_edges(g2) - nb_edges2_cnted

    return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er


def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map):
    """Compute the number of each kind of edit operation for non-symbolic
    node and edge attributes.
    """
    n_vi = 0
    n_vr = 0
    n_vs = 0
    sod_vs = 0
    n_ei = 0
    n_er = 0
    n_es = 0
    sod_es = 0

    nodes1 = [n for n in g1.nodes()]
    for i, map_i in enumerate(forward_map):
        if map_i == np.inf:
            n_vr += 1
        else:
            n_vs += 1
            sum_squares = 0
            for a_name in g1.graph['node_attrs']:
                diff = float(g1.nodes[nodes1[i]][a_name]) - float(g2.nodes[map_i][a_name])
                sum_squares += np.square(diff)
            sod_vs += np.sqrt(sum_squares)
    for map_i in backward_map:
        if map_i == np.inf:
            n_vi += 1

#    idx_nodes1 = range(0, len(node1))

    edges1 = [e for e in g1.edges()]
    for n1, n2 in edges1:
        idx1 = nodes1.index(n1)
        idx2 = nodes1.index(n2)
        n1_g2 = forward_map[idx1]
        n2_g2 = forward_map[idx2]
        # one of the nodes is removed, thus the edge is removed.
        if n1_g2 == np.inf or n2_g2 == np.inf:
            n_er += 1
        # corresponding edge is in g2.
        elif (n1_g2, n2_g2) in g2.edges():
            n_es += 1
            sum_squares = 0
            for a_name in g1.graph['edge_attrs']:
                diff = float(g1.edges[n1, n2][a_name]) - float(g2.edges[n1_g2, n2_g2][a_name])
                sum_squares += np.square(diff)
            sod_es += np.sqrt(sum_squares)
        elif (n2_g2, n1_g2) in g2.edges():
            n_es += 1
            sum_squares = 0
            for a_name in g1.graph['edge_attrs']:
                diff = float(g1.edges[n2, n1][a_name]) - float(g2.edges[n2_g2, n1_g2][a_name])
                sum_squares += np.square(diff)
            sod_es += np.sqrt(sum_squares)
        # corresponding nodes are in g2, however the edge is removed.
        else:
            n_er += 1
    n_ei = nx.number_of_edges(g2) - n_es

    return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es


if __name__ == '__main__':
    print('check test_ged.py')
| @@ -1,775 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Fri Apr 26 11:49:12 2019 | |||
| Iterative alternate minimizations using GED. | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| import random | |||
| import networkx as nx | |||
| from tqdm import tqdm | |||
| from gklearn.utils.graphdataset import get_dataset_attributes | |||
| from gklearn.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels | |||
| from gklearn.preimage.ged import GED, ged_median | |||
| def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| epsilon=0.001, node_label='atom', edge_label='bond_type', | |||
| connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, | |||
| allBestEdges=False, allBestOutput=False, | |||
| params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', | |||
| 'edit_cost_constant': [], 'stabilizer': None, | |||
| 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}): | |||
| """See my name, then you know what I do. | |||
| """ | |||
| # Gn_median = Gn_median[0:10] | |||
| # Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median] | |||
| node_ir = np.inf # corresponding to the node remove and insertion. | |||
| label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable. | |||
| ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate, | |||
| attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'], | |||
| edge_label=edge_label) | |||
| node_label_set = get_node_labels(Gn_median, node_label) | |||
| edge_label_set = get_edge_labels(Gn_median, edge_label) | |||
| def generate_graph(G, pi_p_forward): | |||
| G_new_list = [G.copy()] # all "best" graphs generated in this iteration. | |||
| # nx.draw_networkx(G) | |||
| # import matplotlib.pyplot as plt | |||
| # plt.show() | |||
| # print(pi_p_forward) | |||
| # update vertex labels. | |||
| # pre-compute h_i0 for each label. | |||
| # for label in get_node_labels(Gn, node_label): | |||
| # print(label) | |||
| # for nd in G.nodes(data=True): | |||
| # pass | |||
| if not ds_attrs['node_attr_dim']: # labels are symbolic | |||
| for ndi, (nd, _) in enumerate(G.nodes(data=True)): | |||
| h_i0_list = [] | |||
| label_list = [] | |||
| for label in node_label_set: | |||
| h_i0 = 0 | |||
| for idx, g in enumerate(Gn_median): | |||
| pi_i = pi_p_forward[idx][ndi] | |||
| if pi_i != node_ir and g.nodes[pi_i][node_label] == label: | |||
| h_i0 += 1 | |||
| h_i0_list.append(h_i0) | |||
| label_list.append(label) | |||
| # case when the node is to be removed. | |||
| if removeNodes: | |||
| h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above. | |||
| for idx, g in enumerate(Gn_median): | |||
| pi_i = pi_p_forward[idx][ndi] | |||
| if pi_i == node_ir: | |||
| h_i0_remove += 1 | |||
| h_i0_list.append(h_i0_remove) | |||
| label_list.append(label_r) | |||
| # get the best labels. | |||
| idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() | |||
| if allBestNodes: # choose all best graphs. | |||
| nlabel_best = [label_list[idx] for idx in idx_max] | |||
| # generate "best" graphs with regard to "best" node labels. | |||
| G_new_list_nd = [] | |||
| for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now. | |||
| for nl in nlabel_best: | |||
| g_tmp = g.copy() | |||
| if nl == label_r: | |||
| g_tmp.remove_node(nd) | |||
| else: | |||
| g_tmp.nodes[nd][node_label] = nl | |||
| G_new_list_nd.append(g_tmp) | |||
| # nx.draw_networkx(g_tmp) | |||
| # import matplotlib.pyplot as plt | |||
| # plt.show() | |||
| # print(g_tmp.nodes(data=True)) | |||
| # print(g_tmp.edges(data=True)) | |||
| G_new_list = [ggg.copy() for ggg in G_new_list_nd] | |||
| else: | |||
| # choose one of the best randomly. | |||
| idx_rdm = random.randint(0, len(idx_max) - 1) | |||
| best_label = label_list[idx_max[idx_rdm]] | |||
| h_i0_max = h_i0_list[idx_max[idx_rdm]] | |||
| g_new = G_new_list[0] | |||
| if best_label == label_r: | |||
| g_new.remove_node(nd) | |||
| else: | |||
| g_new.nodes[nd][node_label] = best_label | |||
| G_new_list = [g_new] | |||
| else: # labels are non-symbolic | |||
| for ndi, (nd, _) in enumerate(G.nodes(data=True)): | |||
| Si_norm = 0 | |||
| phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) | |||
| for idx, g in enumerate(Gn_median): | |||
| pi_i = pi_p_forward[idx][ndi] | |||
| if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? | |||
| Si_norm += 1 | |||
| phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) | |||
| phi_i_bar /= Si_norm | |||
| G_new_list[0].nodes[nd]['attributes'] = phi_i_bar | |||
| # for g in G_new_list: | |||
| # import matplotlib.pyplot as plt | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # update edge labels and adjacency matrix. | |||
| if ds_attrs['edge_labeled']: | |||
| G_new_list_edge = [] | |||
| for g_new in G_new_list: | |||
| nd_list = [n for n in g_new.nodes()] | |||
| g_tmp_list = [g_new.copy()] | |||
| for nd1i in range(nx.number_of_nodes(g_new)): | |||
| nd1 = nd_list[nd1i]# @todo: not just edges, but all pairs of nodes | |||
| for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)): | |||
| nd2 = nd_list[nd2i] | |||
| # for nd1, nd2, _ in g_new.edges(data=True): | |||
| h_ij0_list = [] | |||
| label_list = [] | |||
| for label in edge_label_set: | |||
| h_ij0 = 0 | |||
| for idx, g in enumerate(Gn_median): | |||
| pi_i = pi_p_forward[idx][nd1i] | |||
| pi_j = pi_p_forward[idx][nd2i] | |||
| h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and | |||
| g.has_edge(pi_i, pi_j) and | |||
| g.edges[pi_i, pi_j][edge_label] == label) | |||
| h_ij0 += h_ij0_p | |||
| h_ij0_list.append(h_ij0) | |||
| label_list.append(label) | |||
| # get the best labels. | |||
| idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() | |||
| if allBestEdges: # choose all best graphs. | |||
| elabel_best = [label_list[idx] for idx in idx_max] | |||
| h_ij0_max = [h_ij0_list[idx] for idx in idx_max] | |||
| # generate "best" graphs with regard to "best" node labels. | |||
| G_new_list_ed = [] | |||
| for g_tmp in g_tmp_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now. | |||
| for idxl, el in enumerate(elabel_best): | |||
| g_tmp_copy = g_tmp.copy() | |||
| # check whether a_ij is 0 or 1. | |||
| sij_norm = 0 | |||
| for idx, g in enumerate(Gn_median): | |||
| pi_i = pi_p_forward[idx][nd1i] | |||
| pi_j = pi_p_forward[idx][nd2i] | |||
| if g.has_node(pi_i) and g.has_node(pi_j) and \ | |||
| g.has_edge(pi_i, pi_j): | |||
| sij_norm += 1 | |||
| if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \ | |||
| sij_norm * (1 - (c_er + c_ei) / c_es): | |||
| if not g_tmp_copy.has_edge(nd1, nd2): | |||
| g_tmp_copy.add_edge(nd1, nd2) | |||
| g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl] | |||
| else: | |||
| if g_tmp_copy.has_edge(nd1, nd2): | |||
| g_tmp_copy.remove_edge(nd1, nd2) | |||
| G_new_list_ed.append(g_tmp_copy) | |||
| g_tmp_list = [ggg.copy() for ggg in G_new_list_ed] | |||
| else: # choose one of the best randomly. | |||
| idx_rdm = random.randint(0, len(idx_max) - 1) | |||
| best_label = label_list[idx_max[idx_rdm]] | |||
| h_ij0_max = h_ij0_list[idx_max[idx_rdm]] | |||
| # check whether a_ij is 0 or 1. | |||
| sij_norm = 0 | |||
| for idx, g in enumerate(Gn_median): | |||
| pi_i = pi_p_forward[idx][nd1i] | |||
| pi_j = pi_p_forward[idx][nd2i] | |||
| if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): | |||
| sij_norm += 1 | |||
| if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): | |||
| if not g_new.has_edge(nd1, nd2): | |||
| g_new.add_edge(nd1, nd2) | |||
| g_new.edges[nd1, nd2][edge_label] = best_label | |||
| else: | |||
| # elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): | |||
| if g_new.has_edge(nd1, nd2): | |||
| g_new.remove_edge(nd1, nd2) | |||
| g_tmp_list = [g_new] | |||
| G_new_list_edge += g_tmp_list | |||
| G_new_list = [ggg.copy() for ggg in G_new_list_edge] | |||
| else: # if edges are unlabeled | |||
| # @todo: is this even right? G or g_tmp? check if the new one is right | |||
| # @todo: works only for undirected graphs. | |||
| for g_tmp in G_new_list: | |||
| nd_list = [n for n in g_tmp.nodes()] | |||
| for nd1i in range(nx.number_of_nodes(g_tmp)): | |||
| nd1 = nd_list[nd1i] | |||
| for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)): | |||
| nd2 = nd_list[nd2i] | |||
| sij_norm = 0 | |||
| for idx, g in enumerate(Gn_median): | |||
| pi_i = pi_p_forward[idx][nd1i] | |||
| pi_j = pi_p_forward[idx][nd2i] | |||
| if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): | |||
| sij_norm += 1 | |||
| if sij_norm > len(Gn_median) * c_er / (c_er + c_ei): | |||
| # @todo: should we consider if nd1 and nd2 in g_tmp? | |||
| # or just add the edge anyway? | |||
| if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \ | |||
| and not g_tmp.has_edge(nd1, nd2): | |||
| g_tmp.add_edge(nd1, nd2) | |||
| else: # @todo: which to use? | |||
| # elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei): | |||
| if g_tmp.has_edge(nd1, nd2): | |||
| g_tmp.remove_edge(nd1, nd2) | |||
| # do not change anything when equal. | |||
| # for i, g in enumerate(G_new_list): | |||
| # import matplotlib.pyplot as plt | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # find the best graph generated in this iteration and update pi_p. | |||
| # @todo: should we update all graphs generated or just the best ones? | |||
| dis_list, pi_forward_list = ged_median(G_new_list, Gn_median, | |||
| params_ged=params_ged) | |||
| # @todo: should we remove the identical and connectivity check? | |||
| # Don't know which is faster. | |||
| if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: | |||
| G_new_list, idx_list = remove_duplicates(G_new_list) | |||
| pi_forward_list = [pi_forward_list[idx] for idx in idx_list] | |||
| dis_list = [dis_list[idx] for idx in idx_list] | |||
| # if connected == True: | |||
| # G_new_list, idx_list = remove_disconnected(G_new_list) | |||
| # pi_forward_list = [pi_forward_list[idx] for idx in idx_list] | |||
| # idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() | |||
| # dis_min = dis_list[idx_min_tmp_list[0]] | |||
| # pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list] | |||
| # G_new_list = [G_new_list[idx] for idx in idx_min_list] | |||
| # for g in G_new_list: | |||
| # import matplotlib.pyplot as plt | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| return G_new_list, pi_forward_list, dis_list | |||
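| # A minimal, self-contained sketch (not called above) of the edge decision | |||
| # rule applied in generate_graph: a candidate edge (nd1, nd2) is kept in the | |||
| # median when the frequency of its best label, h_ij0_max, exceeds a threshold | |||
| # derived from the edit costs; otherwise it is removed. Parameter names | |||
| # mirror the variables above; this is illustrative only. | |||
| def _edge_kept_in_median(h_ij0_max, sij_norm, num_graphs, c_er, c_ei, c_es): | |||
| """Return True if the candidate edge should exist in the median graph.""" | |||
| return h_ij0_max > num_graphs * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es) | |||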
| def best_median_graphs(Gn_candidate, pi_all_forward, dis_all): | |||
| idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist() | |||
| dis_min = dis_all[idx_min_list[0]] | |||
| pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list] | |||
| G_min_list = [Gn_candidate[idx] for idx in idx_min_list] | |||
| return G_min_list, pi_forward_min_list, dis_min | |||
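| # e.g. dis_all = [3.0, 1.5, 1.5] gives idx_min_list == [1, 2], so the two | |||
| # graphs with the smallest SOD (1.5) are returned. | |||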
| def iteration_proc(G, pi_p_forward, cur_sod): | |||
| G_list = [G] | |||
| pi_forward_list = [pi_p_forward] | |||
| old_sod = cur_sod * 2 | |||
| sod_list = [cur_sod] | |||
| dis_list = [cur_sod] | |||
| # iterations. | |||
| itr = 0 | |||
| # @todo: what if difference == 0? | |||
| # while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or | |||
| # np.abs(old_sod - cur_sod) == 0): | |||
| while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon: | |||
| # while itr < ite_max: | |||
| # for itr in range(0, 5): # the convergence condition? | |||
| print('itr_iam is', itr) | |||
| G_new_list = [] | |||
| pi_forward_new_list = [] | |||
| dis_new_list = [] | |||
| for idx, g in enumerate(G_list): | |||
| # label_set = get_node_labels(Gn_median + [g], node_label) | |||
| G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph( | |||
| g, pi_forward_list[idx]) | |||
| G_new_list += G_tmp_list | |||
| pi_forward_new_list += pi_forward_tmp_list | |||
| dis_new_list += dis_tmp_list | |||
| # @todo: need to remove duplicates here? | |||
| G_list = [ggg.copy() for ggg in G_new_list] | |||
| pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list] | |||
| dis_list = dis_new_list[:] | |||
| old_sod = cur_sod | |||
| cur_sod = np.min(dis_list) | |||
| sod_list.append(cur_sod) | |||
| itr += 1 | |||
| # @todo: do we return all graphs or the best ones? | |||
| # get the best ones of the generated graphs. | |||
| G_list, pi_forward_list, dis_min = best_median_graphs( | |||
| G_list, pi_forward_list, dis_list) | |||
| if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: | |||
| G_list, idx_list = remove_duplicates(G_list) | |||
| pi_forward_list = [pi_forward_list[idx] for idx in idx_list] | |||
| # dis_list = [dis_list[idx] for idx in idx_list] | |||
| # import matplotlib.pyplot as plt | |||
| # for g in G_list: | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| print('\nsods:', sod_list, '\n') | |||
| return G_list, pi_forward_list, dis_min, sod_list | |||
| def remove_duplicates(Gn): | |||
| """Remove duplicate graphs from list. | |||
| """ | |||
| Gn_new = [] | |||
| idx_list = [] | |||
| for idx, g in enumerate(Gn): | |||
| dupl = False | |||
| for g_new in Gn_new: | |||
| if graph_isIdentical(g_new, g): | |||
| dupl = True | |||
| break | |||
| if not dupl: | |||
| Gn_new.append(g) | |||
| idx_list.append(idx) | |||
| return Gn_new, idx_list | |||
| def remove_disconnected(Gn): | |||
| """Remove disconnected graphs from list. | |||
| """ | |||
| Gn_new = [] | |||
| idx_list = [] | |||
| for idx, g in enumerate(Gn): | |||
| if nx.is_connected(g): | |||
| Gn_new.append(g) | |||
| idx_list.append(idx) | |||
| return Gn_new, idx_list | |||
| ########################################################################### | |||
| # phase 1: initialize. | |||
| # compute set-median. | |||
| dis_min = np.inf | |||
| dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median, | |||
| params_ged=params_ged, parallel=True) | |||
| print('finished computing GEDs.') | |||
| # find all smallest distances. | |||
| if allBestInit: # try all best init graphs. | |||
| idx_min_list = range(len(dis_list)) | |||
| dis_min = dis_list | |||
| else: | |||
| idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() | |||
| dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list) | |||
| idx_min_rdm = random.randint(0, len(idx_min_list) - 1) | |||
| idx_min_list = [idx_min_list[idx_min_rdm]] | |||
| sod_set_median = np.min(dis_min) | |||
| # phase 2: iteration. | |||
| G_list = [] | |||
| dis_list = [] | |||
| pi_forward_list = [] | |||
| G_set_median_list = [] | |||
| # sod_list = [] | |||
| for idx_tmp, idx_min in enumerate(idx_min_list): | |||
| # print('idx_min is', idx_min) | |||
| G = Gn_candidate[idx_min].copy() | |||
| G_set_median_list.append(G.copy()) | |||
| # list of edit operations. | |||
| pi_p_forward = pi_forward_all[idx_min] | |||
| # pi_p_backward = pi_all_backward[idx_min] | |||
| Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G, | |||
| pi_p_forward, dis_min[idx_tmp]) | |||
| G_list += Gi_list | |||
| dis_list += [dis_i_min] * len(Gi_list) | |||
| pi_forward_list += pi_i_forward_list | |||
| if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: | |||
| G_list, idx_list = remove_duplicates(G_list) | |||
| dis_list = [dis_list[idx] for idx in idx_list] | |||
| pi_forward_list = [pi_forward_list[idx] for idx in idx_list] | |||
| if connected: | |||
| G_list_con, idx_list = remove_disconnected(G_list) | |||
| # if there are no connected graphs at all, keep the disconnected ones. | |||
| if len(G_list_con) > 0: # @todo: check whether this fallback is intended. | |||
| G_list = G_list_con | |||
| dis_list = [dis_list[idx] for idx in idx_list] | |||
| pi_forward_list = [pi_forward_list[idx] for idx in idx_list] | |||
| # import matplotlib.pyplot as plt | |||
| # for g in G_list: | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # get the best median graphs | |||
| G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs( | |||
| G_list, pi_forward_list, dis_list) | |||
| # for g in G_gen_median_list: | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| if not allBestOutput: | |||
| # randomly choose one graph. | |||
| idx_rdm = random.randint(0, len(G_gen_median_list) - 1) | |||
| G_gen_median_list = [G_gen_median_list[idx_rdm]] | |||
| return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median | |||
| def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT', initial_solutions=1, | |||
| dataset='monoterpenoides', | |||
| graph_dir=''): | |||
| """Compute the iam by c++ implementation (gedlib) through bash. | |||
| """ | |||
| import os | |||
| import time | |||
| def createCollectionFile(Gn_names, y, filename): | |||
| """Create collection file. | |||
| """ | |||
| dirname_ds = os.path.dirname(filename) | |||
| if dirname_ds != '': | |||
| dirname_ds += '/' | |||
| if not os.path.exists(dirname_ds): | |||
| os.makedirs(dirname_ds) | |||
| with open(filename + '.xml', 'w') as fgroup: | |||
| fgroup.write("<?xml version=\"1.0\"?>") | |||
| fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">") | |||
| fgroup.write("\n<GraphCollection>") | |||
| for idx, fname in enumerate(Gn_names): | |||
| fgroup.write("\n\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>") | |||
| fgroup.write("\n</GraphCollection>") | |||
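| # For illustration, createCollectionFile(['g1.gxl'], ['dummy'], 'out/collection') | |||
| # (hypothetical arguments) writes out/collection.xml containing: | |||
| # <?xml version="1.0"?> | |||
| # <!DOCTYPE GraphCollection SYSTEM "http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd"> | |||
| # <GraphCollection> | |||
| # 	<graph file="g1.gxl" class="dummy"/> | |||
| # </GraphCollection> | |||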
| tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/' | |||
| fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, int(1e9))) | |||
| createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection) | |||
| # fn_collection = tmp_dir + 'collection_for_debug' | |||
| # graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/gxl' | |||
| # if dataset == 'Letter-high' or dataset == 'Fingerprint': | |||
| # dataset = 'letter' | |||
| command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n' | |||
| command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' | |||
| command += 'export LD_LIBRARY_PATH\n' | |||
| command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n' | |||
| command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \ | |||
| + ' \'' + graph_dir + '\' ' + ' ' + cost + ' ' + str(initial_solutions) + ' ' | |||
| if edit_cost_constant is None: | |||
| command += 'None' | |||
| else: | |||
| for ec in edit_cost_constant: | |||
| command += str(ec) + ' ' | |||
| # output = os.system(command) | |||
| stream = os.popen(command) | |||
| output = stream.readlines() | |||
| # print(output) | |||
| sod_sm = float(output[0].strip()) | |||
| sod_gm = float(output[1].strip()) | |||
| fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' | |||
| fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' | |||
| return sod_sm, sod_gm, fname_sm, fname_gm | |||
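| # A hypothetical usage sketch for iam_bash (file names and graph_dir are made | |||
| # up; requires the compiled gedlib helper under cpp_ext/bin): | |||
| # sod_sm, sod_gm, fname_sm, fname_gm = iam_bash( | |||
| # 	['molecule1.gxl', 'molecule2.gxl'], [3, 3, 1, 3, 3, 1], | |||
| # 	dataset='monoterpenoides', graph_dir='/path/to/gxl/files') | |||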
| ############################################################################### | |||
| # Old implementations. | |||
| def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type', | |||
| connected=True): | |||
| """See my name, then you know what I do. | |||
| """ | |||
| # Gn = Gn[0:10] | |||
| Gn = [nx.convert_node_labels_to_integers(g) for g in Gn] | |||
| # phase 1: initialize. | |||
| # compute set-median. | |||
| dis_min = np.inf | |||
| pi_p = [] | |||
| pi_all = [] | |||
| for idx1, G_p in enumerate(Gn): | |||
| dist_sum = 0 | |||
| pi_all.append([]) | |||
| for idx2, G_p_prime in enumerate(Gn): | |||
| dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime) | |||
| pi_all[idx1].append(pi_tmp) | |||
| dist_sum += dist_tmp | |||
| if dist_sum < dis_min: | |||
| dis_min = dist_sum | |||
| G = G_p.copy() | |||
| idx_min = idx1 | |||
| # list of edit operations. | |||
| pi_p = pi_all[idx_min] | |||
| # phase 2: iteration. | |||
| ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'], | |||
| edge_label=edge_label) | |||
| for itr in range(0, 10): # @todo: the convergence condition? | |||
| G_new = G.copy() | |||
| # update vertex labels. | |||
| # pre-compute h_i0 for each label. | |||
| # for label in get_node_labels(Gn, node_label): | |||
| # print(label) | |||
| # for nd in G.nodes(data=True): | |||
| # pass | |||
| if not ds_attrs['node_attr_dim']: # labels are symbolic | |||
| for nd, _ in G.nodes(data=True): | |||
| h_i0_list = [] | |||
| label_list = [] | |||
| for label in get_node_labels(Gn, node_label): | |||
| h_i0 = 0 | |||
| for idx, g in enumerate(Gn): | |||
| pi_i = pi_p[idx][nd] | |||
| if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label: | |||
| h_i0 += 1 | |||
| h_i0_list.append(h_i0) | |||
| label_list.append(label) | |||
| # choose one of the best randomly. | |||
| idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() | |||
| idx_rdm = random.randint(0, len(idx_max) - 1) | |||
| G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]] | |||
| else: # labels are non-symbolic | |||
| for nd, _ in G.nodes(data=True): | |||
| Si_norm = 0 | |||
| phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) | |||
| for idx, g in enumerate(Gn): | |||
| pi_i = pi_p[idx][nd] | |||
| if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? | |||
| Si_norm += 1 | |||
| phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) | |||
| phi_i_bar /= Si_norm | |||
| G_new.nodes[nd]['attributes'] = phi_i_bar | |||
| # update edge labels and adjacency matrix. | |||
| if ds_attrs['edge_labeled']: | |||
| for nd1, nd2, _ in G.edges(data=True): | |||
| h_ij0_list = [] | |||
| label_list = [] | |||
| for label in get_edge_labels(Gn, edge_label): | |||
| h_ij0 = 0 | |||
| for idx, g in enumerate(Gn): | |||
| pi_i = pi_p[idx][nd1] | |||
| pi_j = pi_p[idx][nd2] | |||
| h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and | |||
| g.has_edge(pi_i, pi_j) and | |||
| g.edges[pi_i, pi_j][edge_label] == label) | |||
| h_ij0 += h_ij0_p | |||
| h_ij0_list.append(h_ij0) | |||
| label_list.append(label) | |||
| # choose one of the best randomly. | |||
| idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() | |||
| h_ij0_max = h_ij0_list[idx_max[0]] | |||
| idx_rdm = random.randint(0, len(idx_max) - 1) | |||
| best_label = label_list[idx_max[idx_rdm]] | |||
| # check whether a_ij is 0 or 1. | |||
| sij_norm = 0 | |||
| for idx, g in enumerate(Gn): | |||
| pi_i = pi_p[idx][nd1] | |||
| pi_j = pi_p[idx][nd2] | |||
| if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): | |||
| sij_norm += 1 | |||
| if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): | |||
| if not G_new.has_edge(nd1, nd2): | |||
| G_new.add_edge(nd1, nd2) | |||
| G_new.edges[nd1, nd2][edge_label] = best_label | |||
| else: | |||
| if G_new.has_edge(nd1, nd2): | |||
| G_new.remove_edge(nd1, nd2) | |||
| else: # if edges are unlabeled | |||
| for nd1, nd2, _ in G.edges(data=True): | |||
| sij_norm = 0 | |||
| for idx, g in enumerate(Gn): | |||
| pi_i = pi_p[idx][nd1] | |||
| pi_j = pi_p[idx][nd2] | |||
| if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): | |||
| sij_norm += 1 | |||
| if sij_norm > len(Gn) * c_er / (c_er + c_ei): | |||
| if not G_new.has_edge(nd1, nd2): | |||
| G_new.add_edge(nd1, nd2) | |||
| else: | |||
| if G_new.has_edge(nd1, nd2): | |||
| G_new.remove_edge(nd1, nd2) | |||
| G = G_new.copy() | |||
| # update pi_p | |||
| pi_p = [] | |||
| for idx1, G_p in enumerate(Gn): | |||
| dist_tmp, pi_tmp, _ = GED(G, G_p) | |||
| pi_p.append(pi_tmp) | |||
| return G | |||
| # --------------------------- These are tests --------------------------------# | |||
| def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1, | |||
| node_label='atom', edge_label='bond_type'): | |||
| """See my name, then you know what I do. | |||
| """ | |||
| # Gn = Gn[0:10] | |||
| Gn = [nx.convert_node_labels_to_integers(g) for g in Gn] | |||
| # phase 1: initialize. | |||
| # compute set-median. | |||
| dis_min = np.inf | |||
| # pi_p = [] | |||
| pi_all_forward = [] | |||
| pi_all_backward = [] | |||
| for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout): | |||
| dist_sum = 0 | |||
| pi_all_forward.append([]) | |||
| pi_all_backward.append([]) | |||
| for idx2, G_p_prime in enumerate(Gn): | |||
| dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime) | |||
| pi_all_forward[idx1].append(pi_tmp_forward) | |||
| pi_all_backward[idx1].append(pi_tmp_backward) | |||
| dist_sum += dist_tmp | |||
| if dist_sum <= dis_min: | |||
| dis_min = dist_sum | |||
| G = G_p.copy() | |||
| idx_min = idx1 | |||
| # list of edit operations. | |||
| pi_p_forward = pi_all_forward[idx_min] | |||
| pi_p_backward = pi_all_backward[idx_min] | |||
| # phase 2: iteration. | |||
| ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'], | |||
| edge_label=edge_label) | |||
| label_set = get_node_labels(Gn + [G], node_label) | |||
| for itr in range(0, 10): # @todo: the convergence condition? | |||
| G_new = G.copy() | |||
| # update vertex labels. | |||
| # pre-compute h_i0 for each label. | |||
| # for label in get_node_labels(Gn, node_label): | |||
| # print(label) | |||
| # for nd in G.nodes(data=True): | |||
| # pass | |||
| if not ds_attrs['node_attr_dim']: # labels are symbolic | |||
| for nd in G.nodes(): | |||
| h_i0_list = [] | |||
| label_list = [] | |||
| for label in label_set: | |||
| h_i0 = 0 | |||
| for idx, g in enumerate(Gn): | |||
| pi_i = pi_p_forward[idx][nd] | |||
| if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label: | |||
| h_i0 += 1 | |||
| h_i0_list.append(h_i0) | |||
| label_list.append(label) | |||
| # choose one of the best randomly. | |||
| idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() | |||
| idx_rdm = random.randint(0, len(idx_max) - 1) | |||
| G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]] | |||
| else: # labels are non-symbolic | |||
| for nd in G.nodes(): | |||
| Si_norm = 0 | |||
| phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) | |||
| for idx, g in enumerate(Gn): | |||
| pi_i = pi_p_forward[idx][nd] | |||
| if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? | |||
| Si_norm += 1 | |||
| phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) | |||
| phi_i_bar /= Si_norm | |||
| G_new.nodes[nd]['attributes'] = phi_i_bar | |||
| # update edge labels and adjacency matrix. | |||
| if ds_attrs['edge_labeled']: | |||
| for nd1, nd2, _ in G.edges(data=True): | |||
| h_ij0_list = [] | |||
| label_list = [] | |||
| for label in get_edge_labels(Gn, edge_label): | |||
| h_ij0 = 0 | |||
| for idx, g in enumerate(Gn): | |||
| pi_i = pi_p_forward[idx][nd1] | |||
| pi_j = pi_p_forward[idx][nd2] | |||
| h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and | |||
| g.has_edge(pi_i, pi_j) and | |||
| g.edges[pi_i, pi_j][edge_label] == label) | |||
| h_ij0 += h_ij0_p | |||
| h_ij0_list.append(h_ij0) | |||
| label_list.append(label) | |||
| # choose one of the best randomly. | |||
| idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() | |||
| h_ij0_max = h_ij0_list[idx_max[0]] | |||
| idx_rdm = random.randint(0, len(idx_max) - 1) | |||
| best_label = label_list[idx_max[idx_rdm]] | |||
| # check whether a_ij is 0 or 1. | |||
| sij_norm = 0 | |||
| for idx, g in enumerate(Gn): | |||
| pi_i = pi_p_forward[idx][nd1] | |||
| pi_j = pi_p_forward[idx][nd2] | |||
| if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): | |||
| sij_norm += 1 | |||
| if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): | |||
| if not G_new.has_edge(nd1, nd2): | |||
| G_new.add_edge(nd1, nd2) | |||
| G_new.edges[nd1, nd2][edge_label] = best_label | |||
| else: | |||
| if G_new.has_edge(nd1, nd2): | |||
| G_new.remove_edge(nd1, nd2) | |||
| else: # if edges are unlabeled | |||
| # @todo: works only for undirected graphs. | |||
| for nd1 in range(nx.number_of_nodes(G)): | |||
| for nd2 in range(nd1 + 1, nx.number_of_nodes(G)): | |||
| sij_norm = 0 | |||
| for idx, g in enumerate(Gn): | |||
| pi_i = pi_p_forward[idx][nd1] | |||
| pi_j = pi_p_forward[idx][nd2] | |||
| if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): | |||
| sij_norm += 1 | |||
| if sij_norm > len(Gn) * c_er / (c_er + c_ei): | |||
| if not G_new.has_edge(nd1, nd2): | |||
| G_new.add_edge(nd1, nd2) | |||
| elif sij_norm < len(Gn) * c_er / (c_er + c_ei): | |||
| if G_new.has_edge(nd1, nd2): | |||
| G_new.remove_edge(nd1, nd2) | |||
| # do not change anything when equal. | |||
| G = G_new.copy() | |||
| # update pi_p | |||
| pi_p_forward = [] | |||
| for G_p in Gn: | |||
| dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p) | |||
| pi_p_forward.append(pi_tmp_forward) | |||
| return G | |||
| ############################################################################### | |||
| if __name__ == '__main__': | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||
| # ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||
| # 'extra_params': {}} # node nsymb | |||
| # ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds', | |||
| # 'extra_params': {}} | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| iam(Gn) | |||
| @@ -1,114 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Fri Jan 10 13:22:04 2020 | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| #import matplotlib.pyplot as plt | |||
| from tqdm import tqdm | |||
| import random | |||
| #import csv | |||
| from shutil import copyfile | |||
| import os | |||
| import sys | |||
| from gklearn.preimage.iam import iam_bash | |||
| from gklearn.utils.graphfiles import loadDataset, loadGXL | |||
| from gklearn.preimage.ged import GED | |||
| from gklearn.preimage.utils import get_same_item_indices | |||
| def test_knn(): | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| # gkernel = 'treeletkernel' | |||
| # node_label = 'atom' | |||
| # edge_label = 'bond_type' | |||
| # ds_name = 'mono' | |||
| dir_output = 'results/knn/' | |||
| graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/' | |||
| k_nn = 1 | |||
| percent = 0.1 | |||
| repeats = 50 | |||
| edit_cost_constant = [3, 3, 1, 3, 3, 1] | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| sod_sm_list_list = [] | |||
| for repeat in range(0, repeats): | |||
| print('\n---------------------------------') | |||
| print('repeat =', repeat) | |||
| accuracy_sm_list = [] | |||
| accuracy_gm_list = [] | |||
| sod_sm_list = [] | |||
| sod_gm_list = [] | |||
| random.seed(repeat) | |||
| set_median_list = [] | |||
| gen_median_list = [] | |||
| train_y_set = [] | |||
| for y, values in y_idx.items(): | |||
| print('\ny =', y) | |||
| size_median_set = int(len(values) * percent) | |||
| median_set_idx = random.sample(values, size_median_set) | |||
| print('median set: ', median_set_idx) | |||
| # compute set median and gen median using IAM (C++ through bash). | |||
| # Gn_median = [Gn[idx] for idx in median_set_idx] | |||
| group_fnames = [Gn[g].graph['filename'] for g in median_set_idx] | |||
| sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant, | |||
| graph_dir=graph_dir) | |||
| print('sod_sm, sod_gm:', sod_sm, sod_gm) | |||
| sod_sm_list.append(sod_sm) | |||
| sod_gm_list.append(sod_gm) | |||
| fname_sm_new = dir_output + 'medians/set_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl' | |||
| copyfile(fname_sm, fname_sm_new) | |||
| fname_gm_new = dir_output + 'medians/gen_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl' | |||
| copyfile(fname_gm, fname_gm_new) | |||
| set_median_list.append(loadGXL(fname_sm_new)) | |||
| gen_median_list.append(loadGXL(fname_gm_new)) | |||
| train_y_set.append(int(y)) | |||
| print(sod_sm, sod_gm) | |||
| # do 1-nn. | |||
| test_y_set = [int(y) for y in y_all] | |||
| accuracy_sm = knn(set_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged') | |||
| accuracy_gm = knn(gen_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged') | |||
| accuracy_sm_list.append(accuracy_sm) | |||
| accuracy_gm_list.append(accuracy_gm) | |||
| print('current accuracy sm and gm:', accuracy_sm, accuracy_gm) | |||
| # output | |||
| accuracy_sm_mean = np.mean(accuracy_sm_list) | |||
| accuracy_gm_mean = np.mean(accuracy_gm_list) | |||
| print('\ntotal average accuracy sm and gm:', accuracy_sm_mean, accuracy_gm_mean) | |||
| def knn(train_set, train_y_set, test_set, test_y_set, k=1, distance='ged'): | |||
| if k != 1 or distance != 'ged': | |||
| raise NotImplementedError('only 1-nn with the GED distance is implemented.') | |||
| algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', | |||
| 'algo_options': algo_options, 'stabilizer': None} | |||
| accuracy = 0 | |||
| for idx_test, g_test in tqdm(enumerate(test_set), desc='computing 1-nn', | |||
| file=sys.stdout): | |||
| dis = np.inf | |||
| for idx_train, g_train in enumerate(train_set): | |||
| dis_cur, _, _ = GED(g_test, g_train, **params_ged) | |||
| if dis_cur < dis: | |||
| dis = dis_cur | |||
| test_y_cur = train_y_set[idx_train] | |||
| if test_y_cur == test_y_set[idx_test]: | |||
| accuracy += 1 | |||
| accuracy = accuracy / len(test_set) | |||
| return accuracy | |||
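| # A usage sketch mirroring test_knn() above: the medians act as the training | |||
| # set and the whole dataset as the test set (requires a working GED backend): | |||
| # accuracy = knn(set_median_list, train_y_set, Gn, test_y_set, k=1, distance='ged') | |||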
| if __name__ == '__main__': | |||
| test_knn() | |||
| @@ -1,6 +0,0 @@ | |||
| import sys | |||
| import pathlib | |||
| # insert gedlibpy library. | |||
| sys.path.insert(0, "../../../") | |||
| from gedlibpy import librariesImport, gedlibpy | |||
| @@ -1,218 +0,0 @@ | |||
| import sys | |||
| sys.path.insert(0, "../") | |||
| #import pathlib | |||
| import numpy as np | |||
| import networkx as nx | |||
| import time | |||
| from gedlibpy import librariesImport, gedlibpy | |||
| #import script | |||
| sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/") | |||
| import gklearn | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| def replace_graph_in_env(script, graph, old_id, label='median'): | |||
| """ | |||
| Replace a graph in script | |||
| If old_id is -1, add a new graph to the environment | |||
| """ | |||
| if(old_id > -1): | |||
| script.PyClearGraph(old_id) | |||
| new_id = script.PyAddGraph(label) | |||
| for i in graph.nodes(): | |||
| script.PyAddNode(new_id, str(i), graph.node[i]) # !! strings are required by gedlib | |||
| for e in graph.edges: | |||
| script.PyAddEdge(new_id, str(e[0]),str(e[1]), {}) | |||
| script.PyInitEnv() | |||
| script.PySetMethod("IPFP", "") | |||
| script.PyInitMethod() | |||
| return new_id | |||
| # Draw the current median | |||
| def draw_Letter_graph(graph, savepath=''): | |||
| import numpy as np | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| plt.figure() | |||
| pos = {} | |||
| for n in graph.nodes: | |||
| pos[n] = np.array([float(graph.node[n]['attributes'][0]), | |||
| float(graph.node[n]['attributes'][1])]) | |||
| nx.draw_networkx(graph, pos) | |||
| if savepath != '': | |||
| plt.savefig(savepath + str(time.time()) + '.eps', format='eps', dpi=300) | |||
| plt.show() | |||
| plt.clf() | |||
| #compute new mappings | |||
| def update_mappings(script,median_id,listID): | |||
| med_distances = {} | |||
| med_mappings = {} | |||
| sod = 0 | |||
| for i in range(0,len(listID)): | |||
| script.PyRunMethod(median_id,listID[i]) | |||
| med_distances[i] = script.PyGetUpperBound(median_id,listID[i]) | |||
| med_mappings[i] = script.PyGetForwardMap(median_id,listID[i]) | |||
| sod += med_distances[i] | |||
| return med_distances, med_mappings, sod | |||
| def calcul_Sij(all_mappings, all_graphs,i,j): | |||
| s_ij = 0 | |||
| for k in range(0,len(all_mappings)): | |||
| cur_graph = all_graphs[k] | |||
| cur_mapping = all_mappings[k] | |||
| size_graph = cur_graph.order() | |||
| if ((cur_mapping[i] < size_graph) and | |||
| (cur_mapping[j] < size_graph) and | |||
| cur_graph.has_edge(cur_mapping[i], cur_mapping[j])): | |||
| s_ij += 1 | |||
| return s_ij | |||
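| # Example: with two graphs that both contain the edge (0, 1) and mappings | |||
| # [0, 1] for each (both endpoints mapped inside the graph), | |||
| # calcul_Sij(all_mappings, all_graphs, 0, 1) returns 2. | |||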
| # def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings): | |||
| # from scipy.stats.mstats import gmean | |||
| # for i in median.nodes(): | |||
| # for k in listIdSet: | |||
| # vectors = [] #np.zeros((len(listIdSet),2)) | |||
| # if(k != median_id): | |||
| # phi_i = mappings[k][i] | |||
| # if(phi_i < dataset[k].order()): | |||
| # vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])]) | |||
| # new_labels = gmean(vectors) | |||
| # median.node[i]['x'] = str(new_labels[0]) | |||
| # median.node[i]['y'] = str(new_labels[1]) | |||
| # return median | |||
| def update_median_nodes(median,dataset,mappings): | |||
| #update node attributes | |||
| for i in median.nodes(): | |||
| nb_sub=0 | |||
| mean_label = {'x' : 0, 'y' : 0} | |||
| for k in range(0,len(mappings)): | |||
| phi_i = mappings[k][i] | |||
| if ( phi_i < dataset[k].order() ): | |||
| nb_sub += 1 | |||
| mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x']) | |||
| mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y']) | |||
| median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub)) | |||
| median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub)) | |||
| return median | |||
| def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425): | |||
| #for letter high, ceir = 1.7, alpha = 0.75 | |||
| size_dataset = len(dataset) | |||
| ratio_cei_cer = cer/(cei + cer) | |||
| threshold = size_dataset*ratio_cei_cer | |||
| order_graph_median = median.order() | |||
| for i in range(0,order_graph_median): | |||
| for j in range(i+1,order_graph_median): | |||
| s_ij = calcul_Sij(mappings,dataset,i,j) | |||
| if(s_ij > threshold): | |||
| median.add_edge(i,j) | |||
| else: | |||
| if(median.has_edge(i,j)): | |||
| median.remove_edge(i,j) | |||
| return median | |||
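| # With the defaults cei = cer = 0.425, ratio_cei_cer == 0.5, so an edge (i, j) | |||
| # is kept in the median exactly when s_ij > len(dataset) / 2, i.e. when the | |||
| # mapped edge is present in more than half of the graphs. | |||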
| def compute_median(script, listID, dataset, verbose=False): | |||
| """Compute a graph median of a dataset according to an environment. | |||
| Parameters | |||
| script : a gedlib-initialized environment | |||
| listID (list): a list of graph IDs in script encoding the dataset | |||
| dataset (list): the corresponding graphs in networkx format. We assume that | |||
| graph listID[i] corresponds to dataset[i] | |||
| Returns: | |||
| A networkx graph which is the median, with the corresponding SOD | |||
| """ | |||
| print(len(listID)) | |||
| median_set_index, median_set_sod = compute_median_set(script, listID) | |||
| print(median_set_index) | |||
| print(median_set_sod) | |||
| sods = [] | |||
| # Add the median to the environment | |||
| set_median = dataset[median_set_index].copy() | |||
| median = dataset[median_set_index].copy() | |||
| cur_med_id = replace_graph_in_env(script,median,-1) | |||
| med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) | |||
| sods.append(cur_sod) | |||
| if(verbose): | |||
| print(cur_sod) | |||
| ite_max = 50 | |||
| old_sod = cur_sod * 2 | |||
| ite = 0 | |||
| epsilon = 0.001 | |||
| best_median = None | |||
| while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )): | |||
| median = update_median_nodes(median,dataset, med_mappings) | |||
| median = update_median_edges(dataset,med_mappings,median) | |||
| cur_med_id = replace_graph_in_env(script,median,cur_med_id) | |||
| med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) | |||
| sods.append(cur_sod) | |||
| if(verbose): | |||
| print(cur_sod) | |||
| ite += 1 | |||
| return median, cur_sod, sods, set_median | |||
| # draw_Letter_graph(median)  # dead code: unreachable after the return above | |||
| def compute_median_set(script,listID): | |||
| 'Return the index of the set median in the dataset, together with its SOD' | |||
| # Compute the set median | |||
| N=len(listID) | |||
| map_id_to_index = {} | |||
| map_index_to_id = {} | |||
| for i in range(0,len(listID)): | |||
| map_id_to_index[listID[i]] = i | |||
| map_index_to_id[i] = listID[i] | |||
| distances = np.zeros((N,N)) | |||
| for i in listID: | |||
| for j in listID: | |||
| script.PyRunMethod(i,j) | |||
| distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j) | |||
| median_set_index = np.argmin(np.sum(distances,0)) | |||
| sod = np.min(np.sum(distances,0)) | |||
| return median_set_index, sod | |||
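| # Example: for pairwise upper bounds [[0, 2], [2, 0]], both column sums equal | |||
| # 2, so index 0 is returned together with sod == 2. | |||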
| if __name__ == "__main__": | |||
| # Load the dataset | |||
| script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml') | |||
| script.PySetEditCost("LETTER") | |||
| script.PyInitEnv() | |||
| script.PySetMethod("IPFP", "") | |||
| script.PyInitMethod() | |||
| dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl") | |||
| listID = script.PyGetAllGraphIds() | |||
| median, sod, sods, set_median = compute_median(script, listID, dataset, verbose=True) | |||
| print(sod) | |||
| draw_Letter_graph(median) | |||
| #if __name__ == '__main__': | |||
| # # test draw_Letter_graph | |||
| # ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||
| # 'extra_params': {}} # node nsymb | |||
| # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # print(y_all) | |||
| # for g in Gn: | |||
| # draw_Letter_graph(g) | |||
| @@ -1,201 +0,0 @@ | |||
| import sys | |||
| import pathlib | |||
| import numpy as np | |||
| import networkx as nx | |||
| import librariesImport | |||
| import script | |||
| sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/") | |||
| import gklearn | |||
| def replace_graph_in_env(script, graph, old_id, label='median'): | |||
| """ | |||
| Replace a graph in script | |||
| If old_id is -1, add a new graph to the environment | |||
| """ | |||
| if(old_id > -1): | |||
| script.PyClearGraph(old_id) | |||
| new_id = script.PyAddGraph(label) | |||
| for i in graph.nodes(): | |||
| script.PyAddNode(new_id, str(i), graph.node[i]) # !! strings are required by gedlib | |||
| for e in graph.edges: | |||
| script.PyAddEdge(new_id, str(e[0]),str(e[1]), {}) | |||
| script.PyInitEnv() | |||
| script.PySetMethod("IPFP", "") | |||
| script.PyInitMethod() | |||
| return new_id | |||
| # Draw the current median | |||
| def draw_Letter_graph(graph): | |||
| import numpy as np | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| plt.figure() | |||
| pos = {} | |||
| for n in graph.nodes: | |||
| pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])]) | |||
| nx.draw_networkx(graph,pos) | |||
| plt.show() | |||
| #compute new mappings | |||
| def update_mappings(script,median_id,listID): | |||
| med_distances = {} | |||
| med_mappings = {} | |||
| sod = 0 | |||
| for i in range(0,len(listID)): | |||
| script.PyRunMethod(median_id,listID[i]) | |||
| med_distances[i] = script.PyGetUpperBound(median_id,listID[i]) | |||
| med_mappings[i] = script.PyGetForwardMap(median_id,listID[i]) | |||
| sod += med_distances[i] | |||
| return med_distances, med_mappings, sod | |||
| def calcul_Sij(all_mappings, all_graphs,i,j): | |||
| s_ij = 0 | |||
| for k in range(0,len(all_mappings)): | |||
| cur_graph = all_graphs[k] | |||
| cur_mapping = all_mappings[k] | |||
| size_graph = cur_graph.order() | |||
| if ((cur_mapping[i] < size_graph) and | |||
| (cur_mapping[j] < size_graph) and | |||
| cur_graph.has_edge(cur_mapping[i], cur_mapping[j])): | |||
| s_ij += 1 | |||
| return s_ij | |||
| # def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings): | |||
| # from scipy.stats.mstats import gmean | |||
| # for i in median.nodes(): | |||
| # for k in listIdSet: | |||
| # vectors = [] #np.zeros((len(listIdSet),2)) | |||
| # if(k != median_id): | |||
| # phi_i = mappings[k][i] | |||
| # if(phi_i < dataset[k].order()): | |||
| # vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])]) | |||
| # new_labels = gmean(vectors) | |||
| # median.node[i]['x'] = str(new_labels[0]) | |||
| # median.node[i]['y'] = str(new_labels[1]) | |||
| # return median | |||
| def update_median_nodes(median,dataset,mappings): | |||
| #update node attributes | |||
| for i in median.nodes(): | |||
| nb_sub=0 | |||
| mean_label = {'x' : 0, 'y' : 0} | |||
| for k in range(0,len(mappings)): | |||
| phi_i = mappings[k][i] | |||
| if ( phi_i < dataset[k].order() ): | |||
| nb_sub += 1 | |||
| mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x']) | |||
| mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y']) | |||
| median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub)) | |||
| median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub)) | |||
| return median | |||
| def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425): | |||
| #for letter high, ceir = 1.7, alpha = 0.75 | |||
| size_dataset = len(dataset) | |||
| ratio_cei_cer = cer/(cei + cer) | |||
| threshold = size_dataset*ratio_cei_cer | |||
| order_graph_median = median.order() | |||
| for i in range(0,order_graph_median): | |||
| for j in range(i+1,order_graph_median): | |||
| s_ij = calcul_Sij(mappings,dataset,i,j) | |||
| if(s_ij > threshold): | |||
| median.add_edge(i,j) | |||
| else: | |||
| if(median.has_edge(i,j)): | |||
| median.remove_edge(i,j) | |||
| return median | |||
| def compute_median(script, listID, dataset, verbose=False): | |||
| """Compute a graph median of a dataset according to an environment. | |||
| Parameters | |||
| script : a gedlib-initialized environment | |||
| listID (list): a list of graph IDs in script encoding the dataset | |||
| dataset (list): the corresponding graphs in networkx format. We assume that | |||
| graph listID[i] corresponds to dataset[i] | |||
| Returns: | |||
| A networkx graph which is the median, with the corresponding SOD | |||
| """ | |||
| print(len(listID)) | |||
| median_set_index, median_set_sod = compute_median_set(script, listID) | |||
| print(median_set_index) | |||
| print(median_set_sod) | |||
| sods = [] | |||
| # Add the median to the environment | |||
| set_median = dataset[median_set_index].copy() | |||
| median = dataset[median_set_index].copy() | |||
| cur_med_id = replace_graph_in_env(script,median,-1) | |||
| med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) | |||
| sods.append(cur_sod) | |||
| if(verbose): | |||
| print(cur_sod) | |||
| ite_max = 50 | |||
| old_sod = cur_sod * 2 | |||
| ite = 0 | |||
| epsilon = 0.001 | |||
| best_median = None | |||
| while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )): | |||
| median = update_median_nodes(median,dataset, med_mappings) | |||
| median = update_median_edges(dataset,med_mappings,median) | |||
| cur_med_id = replace_graph_in_env(script,median,cur_med_id) | |||
| med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) | |||
| sods.append(cur_sod) | |||
| if(verbose): | |||
| print(cur_sod) | |||
| ite += 1 | |||
| return median, cur_sod, sods, set_median | |||
| # draw_Letter_graph(median)  # dead code: unreachable after the return above | |||
| def compute_median_set(script,listID): | |||
| 'Return the index of the set median in the dataset, together with its SOD' | |||
| # Compute the set median | |||
| N=len(listID) | |||
| map_id_to_index = {} | |||
| map_index_to_id = {} | |||
| for i in range(0,len(listID)): | |||
| map_id_to_index[listID[i]] = i | |||
| map_index_to_id[i] = listID[i] | |||
| distances = np.zeros((N,N)) | |||
| for i in listID: | |||
| for j in listID: | |||
| script.PyRunMethod(i,j) | |||
| distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j) | |||
| median_set_index = np.argmin(np.sum(distances,0)) | |||
| sod = np.min(np.sum(distances,0)) | |||
| return median_set_index, sod | |||
| if __name__ == "__main__": | |||
| # Load the dataset | |||
| script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml') | |||
| script.PySetEditCost("LETTER") | |||
| script.PyInitEnv() | |||
| script.PySetMethod("IPFP", "") | |||
| script.PyInitMethod() | |||
| dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl") | |||
| listID = script.PyGetAllGraphIds() | |||
| median, sod, sods, set_median = compute_median(script, listID, dataset, verbose=True) | |||
| print(sod) | |||
| draw_Letter_graph(median) | |||
| @@ -1,826 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Mon Mar 16 18:04:55 2020 | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| from gklearn.preimage.common_types import AlgorithmState | |||
| from gklearn.preimage import misc | |||
| from gklearn.preimage.timer import Timer | |||
| from gklearn.utils.utils import graph_isIdentical | |||
| import time | |||
| from tqdm import tqdm | |||
| import sys | |||
| import networkx as nx | |||
| class MedianGraphEstimator(object): | |||
| def __init__(self, ged_env, constant_node_costs): | |||
| """Constructor. | |||
| Parameters | |||
| ---------- | |||
| ged_env : gklearn.gedlib.gedlibpy.GEDEnv | |||
| Initialized GED environment. The edit costs must be set by the user. | |||
| constant_node_costs : Boolean | |||
| Set to True if the node relabeling costs are constant. | |||
| """ | |||
| self.__ged_env = ged_env | |||
| self.__init_method = 'BRANCH_FAST' | |||
| self.__init_options = '' | |||
| self.__descent_method = 'BRANCH_FAST' | |||
| self.__descent_options = '' | |||
| self.__refine_method = 'IPFP' | |||
| self.__refine_options = '' | |||
| self.__constant_node_costs = constant_node_costs | |||
| self.__labeled_nodes = (ged_env.get_num_node_labels() > 1) | |||
| self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1)) | |||
| self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1)) | |||
| self.__labeled_edges = (ged_env.get_num_edge_labels() > 1) | |||
| self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1)) | |||
| self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1)) | |||
| self.__init_type = 'RANDOM' | |||
| self.__num_random_inits = 10 | |||
| self.__desired_num_random_inits = 10 | |||
| self.__use_real_randomness = True | |||
| self.__seed = 0 | |||
| self.__refine = True | |||
| self.__time_limit_in_sec = 0 | |||
| self.__epsilon = 0.0001 | |||
| self.__max_itrs = 100 | |||
| self.__max_itrs_without_update = 3 | |||
| self.__num_inits_increase_order = 10 | |||
| self.__init_type_increase_order = 'K-MEANS++' | |||
| self.__max_itrs_increase_order = 10 | |||
| self.__print_to_stdout = 2 | |||
| self.__median_id = np.inf # @todo: check | |||
| self.__median_node_id_prefix = '' # @todo: check | |||
| self.__node_maps_from_median = {} | |||
| self.__sum_of_distances = 0 | |||
| self.__best_init_sum_of_distances = np.inf | |||
| self.__converged_sum_of_distances = np.inf | |||
| self.__runtime = None | |||
| self.__runtime_initialized = None | |||
| self.__runtime_converged = None | |||
| self.__itrs = [] # @todo: check: {} ? | |||
| self.__num_decrease_order = 0 | |||
| self.__num_increase_order = 0 | |||
| self.__num_converged_descents = 0 | |||
| self.__state = AlgorithmState.TERMINATED | |||
| if ged_env is None: | |||
| raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.') | |||
| elif not ged_env.is_initialized(): | |||
| raise Exception('The GED environment is uninitialized. Call gedlibpy.GEDEnv.init() before passing it to the constructor of MedianGraphEstimator.') | |||
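| # A minimal construction sketch (hypothetical setup; the environment must be | |||
| # initialized and its edit costs set before being passed in): | |||
| # ged_env = gedlibpy.GEDEnv() | |||
| # ... add graphs, set edit costs, call ged_env.init() ... | |||
| # mge = MedianGraphEstimator(ged_env, constant_node_costs=True) | |||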
| def set_options(self, options): | |||
| """Sets the options of the estimator. | |||
| Parameters | |||
| ---------- | |||
| options : string | |||
| String that specifies with which options to run the estimator. | |||
| """ | |||
| self.__set_default_options() | |||
| options_map = misc.options_string_to_options_map(options) | |||
| for opt_name, opt_val in options_map.items(): | |||
| if opt_name == 'init-type': | |||
| self.__init_type = opt_val | |||
| if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN': | |||
| raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"') | |||
| elif opt_name == 'random-inits': | |||
| try: | |||
| self.__num_random_inits = int(opt_val) | |||
| self.__desired_num_random_inits = self.__num_random_inits | |||
| except: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"') | |||
| if self.__num_random_inits <= 0: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"') | |||
| elif opt_name == 'randomness': | |||
| if opt_val == 'PSEUDO': | |||
| self.__use_real_randomness = False | |||
| elif opt_val == 'REAL': | |||
| self.__use_real_randomness = True | |||
| else: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"') | |||
| elif opt_name == 'stdout': | |||
| if opt_val == '0': | |||
| self.__print_to_stdout = 0 | |||
| elif opt_val == '1': | |||
| self.__print_to_stdout = 1 | |||
| elif opt_val == '2': | |||
| self.__print_to_stdout = 2 | |||
| else: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"') | |||
| elif opt_name == 'refine': | |||
| if opt_val == 'TRUE': | |||
| self.__refine = True | |||
| elif opt_val == 'FALSE': | |||
| self.__refine = False | |||
| else: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"') | |||
| elif opt_name == 'time-limit': | |||
| try: | |||
| self.__time_limit_in_sec = float(opt_val) | |||
| except: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit <convertible to double>] [...]') | |||
| elif opt_name == 'max-itrs': | |||
| try: | |||
| self.__max_itrs = int(opt_val) | |||
| except: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs <convertible to int>] [...]') | |||
| elif opt_name == 'max-itrs-without-update': | |||
| try: | |||
| self.__max_itrs_without_update = int(opt_val) | |||
| except: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update <convertible to int>] [...]') | |||
| elif opt_name == 'seed': | |||
| try: | |||
| self.__seed = int(opt_val) | |||
| except: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed <convertible to int greater equal 0>] [...]') | |||
| elif opt_name == 'epsilon': | |||
| try: | |||
| self.__epsilon = float(opt_val) | |||
| except: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]') | |||
| if self.__epsilon <= 0: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]') | |||
| elif opt_name == 'inits-increase-order': | |||
| try: | |||
| self.__num_inits_increase_order = int(opt_val) | |||
| except: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"') | |||
| if self.__num_inits_increase_order <= 0: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"') | |||
| elif opt_name == 'init-type-increase-order': | |||
| self.__init_type_increase_order = opt_val | |||
| if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++': | |||
| raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"') | |||
| elif opt_name == 'max-itrs-increase-order': | |||
| try: | |||
| self.__max_itrs_increase_order = int(opt_val) | |||
| except: | |||
| raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order <convertible to int>] [...]') | |||
| else: | |||
| valid_options = '[--init-type <arg>] [--random-inits <arg>] [--randomness <arg>] [--seed <arg>] [--stdout <arg>] ' | |||
| valid_options += '[--time-limit <arg>] [--max-itrs <arg>] [--epsilon <arg>] ' | |||
| valid_options += '[--inits-increase-order <arg>] [--init-type-increase-order <arg>] [--max-itrs-increase-order <arg>]' | |||
| raise Exception('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"') | |||
| def set_init_method(self, init_method, init_options=''): | |||
| """Selects method to be used for computing the initial medoid graph. | |||
| Parameters | |||
| ---------- | |||
| init_method : string | |||
| The selected method. Default: ged::Options::GEDMethod::BRANCH_FAST. | |||
| init_options : string | |||
| The options for the selected method. Default: "". | |||
| Notes | |||
| ----- | |||
| Has no effect unless "--init-type MEDOID" is passed to set_options(). | |||
| """ | |||
| self.__init_method = init_method | |||
| self.__init_options = init_options | |||
| def set_descent_method(self, descent_method, descent_options=''): | |||
| """Selects method to be used for block gradient descent. | |||
| Parameters | |||
| ---------- | |||
| descent_method : string | |||
| The selected method. Default: ged::Options::GEDMethod::BRANCH_FAST. | |||
| descent_options : string | |||
| The options for the selected method. Default: "". | |||
| """ | |||
| self.__descent_method = descent_method | |||
| self.__descent_options = descent_options | |||
| def set_refine_method(self, refine_method, refine_options): | |||
| """Selects method to be used for improving the sum of distances and the node maps for the converged median. | |||
| Parameters | |||
| ---------- | |||
| refine_method : string | |||
| The selected method. Default: "IPFP". | |||
| refine_options : string | |||
| The options for the selected method. Default: "". | |||
| Notes | |||
| ----- | |||
| Has no effect if "--refine FALSE" is passed to set_options(). | |||
| """ | |||
| self.__refine_method = refine_method | |||
| self.__refine_options = refine_options | |||
| def run(self, graph_ids, set_median_id, gen_median_id): | |||
| """Computes a generalized median graph. | |||
| Parameters | |||
| ---------- | |||
| graph_ids : list[integer] | |||
| The IDs of the graphs for which the median should be computed. Must have been added to the environment passed to the constructor. | |||
| set_median_id : integer | |||
| The ID of the computed set-median. A dummy graph with this ID must have been added to the environment passed to the constructor. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph(). | |||
| gen_median_id : integer | |||
| The ID of the computed generalized median. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph(). | |||
| """ | |||
| # Sanity checks. | |||
| if len(graph_ids) == 0: | |||
| raise Exception('Empty vector of graph IDs, unable to compute median.') | |||
| all_graphs_empty = True | |||
| for graph_id in graph_ids: | |||
| if self.__ged_env.get_graph_num_nodes(graph_id) > 0: | |||
| self.__median_node_id_prefix = self.__ged_env.get_original_node_ids(graph_id)[0] | |||
| all_graphs_empty = False | |||
| break | |||
| if all_graphs_empty: | |||
| raise Exception('All graphs in the collection are empty.') | |||
| # Start timer and record start time. | |||
| start = time.time() | |||
| timer = Timer(self.__time_limit_in_sec) | |||
| self.__median_id = gen_median_id | |||
| self.__state = AlgorithmState.TERMINATED | |||
| # Get ExchangeGraph representations of the input graphs. | |||
| graphs = {} | |||
| for graph_id in graph_ids: | |||
| # @todo: get_nx_graph() function may need to be modified according to the coming code. | |||
| graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id, True, True, False) | |||
| # print(self.__ged_env.get_graph_internal_id(0)) | |||
| # print(graphs[0].graph) | |||
| # print(graphs[0].nodes(data=True)) | |||
| # print(graphs[0].edges(data=True)) | |||
| # print(nx.adjacency_matrix(graphs[0])) | |||
| # Construct initial medians. | |||
| medians = [] | |||
| self.__construct_initial_medians(graph_ids, timer, medians) | |||
| end_init = time.time() | |||
| self.__runtime_initialized = end_init - start | |||
| # print(medians[0].graph) | |||
| # print(medians[0].nodes(data=True)) | |||
| # print(medians[0].edges(data=True)) | |||
| # print(nx.adjacency_matrix(medians[0])) | |||
| # Reset information about iterations and number of times the median decreases and increases. | |||
| self.__itrs = [0] * len(medians) | |||
| self.__num_decrease_order = 0 | |||
| self.__num_increase_order = 0 | |||
| self.__num_converged_descents = 0 | |||
| # Initialize the best median. | |||
| best_sum_of_distances = np.inf | |||
| self.__best_init_sum_of_distances = np.inf | |||
| node_maps_from_best_median = {} | |||
| # Run block gradient descent from all initial medians. | |||
| self.__ged_env.set_method(self.__descent_method, self.__descent_options) | |||
| for median_pos in range(0, len(medians)): | |||
| # Terminate if the timer has expired and at least one SOD has been computed. | |||
| if timer.expired() and median_pos > 0: | |||
| break | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('\n===========================================================') | |||
| print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') | |||
| print('-----------------------------------------------------------') | |||
| # Get reference to the median. | |||
| median = medians[median_pos] | |||
| # Load initial median into the environment. | |||
| self.__ged_env.load_nx_graph(median, gen_median_id) | |||
| self.__ged_env.init(self.__ged_env.get_init_type()) | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| progress = tqdm(desc='\rComputing initial node maps', total=len(graph_ids), file=sys.stdout) | |||
| # Compute node maps and sum of distances for initial median. | |||
| self.__sum_of_distances = 0 | |||
| self.__node_maps_from_median.clear() # @todo | |||
| for graph_id in graph_ids: | |||
| self.__ged_env.run_method(gen_median_id, graph_id) | |||
| self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) | |||
| # print(self.__node_maps_from_median[graph_id]) | |||
| self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: the C++ implementation of this function in GedLibBind.ipp calls get_node_map() once more, which is not necessary. | |||
| # print(self.__sum_of_distances) | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| progress.update(1) | |||
| self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances) | |||
| self.__ged_env.load_nx_graph(median, set_median_id) | |||
| # print(self.__best_init_sum_of_distances) | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('\n') | |||
| # Run block gradient descent from initial median. | |||
| converged = False | |||
| itrs_without_update = 0 | |||
| while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update): | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('\n===========================================================') | |||
| print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') | |||
| print('-----------------------------------------------------------') | |||
| # Initialize flags that tell us what happened in the iteration. | |||
| median_modified = False | |||
| node_maps_modified = False | |||
| decreased_order = False | |||
| increased_order = False | |||
| # Update the median. | |||
| median_modified = self.__update_median(graphs, median) | |||
| # Update the order of the median. # @todo: decreasing/increasing the order is not implemented yet, so both flags simply stay False. | |||
| if not median_modified or self.__itrs[median_pos] == 0: | |||
| decreased_order = False # @todo: should attempt to decrease the order here. | |||
| if not decreased_order or self.__itrs[median_pos] == 0: | |||
| increased_order = False # @todo: should attempt to increase the order here. | |||
| # Update the number of iterations without update of the median. | |||
| if median_modified or decreased_order or increased_order: | |||
| itrs_without_update = 0 | |||
| else: | |||
| itrs_without_update += 1 | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('Loading median to environment: ... ', end='') | |||
| # Load the median into the environment. | |||
| # @todo: should this function use the original node label? | |||
| self.__ged_env.load_nx_graph(median, gen_median_id) | |||
| self.__ged_env.init(self.__ged_env.get_init_type()) | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('done.') | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('Updating induced costs: ... ', end='') | |||
| # Compute induced costs of the old node maps w.r.t. the updated median. | |||
| for graph_id in graph_ids: | |||
| # print(self.__ged_env.get_induced_cost(gen_median_id, graph_id)) | |||
| # @todo: watch out if compute_induced_cost is correct, this may influence: increase/decrease order, induced_cost() in the following code.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |||
| self.__ged_env.compute_induced_cost(gen_median_id, graph_id) | |||
| # print('---------------------------------------') | |||
| # print(self.__ged_env.get_induced_cost(gen_median_id, graph_id)) | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('done.') | |||
| # Update the node maps. | |||
| node_maps_modified = self.__update_node_maps() # @todo | |||
| # Update the order of the median if no improvement can be found with the current order. # @todo: not implemented yet. | |||
| # Update the sum of distances. | |||
| old_sum_of_distances = self.__sum_of_distances | |||
| self.__sum_of_distances = 0 | |||
| for graph_id in self.__node_maps_from_median: | |||
| self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: see above. | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('Old local SOD: ', old_sum_of_distances) | |||
| print('New local SOD: ', self.__sum_of_distances) | |||
| print('Best converged SOD: ', best_sum_of_distances) | |||
| print('Modified median: ', median_modified) | |||
| print('Modified node maps: ', node_maps_modified) | |||
| print('Decreased order: ', decreased_order) | |||
| print('Increased order: ', increased_order) | |||
| print('===========================================================\n') | |||
| converged = not (median_modified or node_maps_modified or decreased_order or increased_order) | |||
| self.__itrs[median_pos] += 1 | |||
| # Update the best median. | |||
| if self.__sum_of_distances < best_sum_of_distances: # compare with the best converged SOD, not the best initial one. | |||
| best_sum_of_distances = self.__sum_of_distances | |||
| node_maps_from_best_median = self.__node_maps_from_median | |||
| best_median = median | |||
| # Update the number of converged descents. | |||
| if converged: | |||
| self.__num_converged_descents += 1 | |||
| # Store the best encountered median. | |||
| self.__sum_of_distances = best_sum_of_distances | |||
| self.__node_maps_from_median = node_maps_from_best_median | |||
| self.__ged_env.load_nx_graph(best_median, gen_median_id) | |||
| self.__ged_env.init(self.__ged_env.get_init_type()) | |||
| end_descent = time.time() | |||
| self.__runtime_converged = end_descent - start | |||
| # Refine the sum of distances and the node maps for the converged median. | |||
| self.__converged_sum_of_distances = self.__sum_of_distances | |||
| if self.__refine: | |||
| self.__improve_sum_of_distances(timer) # @todo | |||
| # Record end time, set runtime and reset the number of initial medians. | |||
| end = time.time() | |||
| self.__runtime = end - start | |||
| self.__num_random_inits = self.__desired_num_random_inits | |||
| # Print global information. | |||
| if self.__print_to_stdout != 0: | |||
| print('\n===========================================================') | |||
| print('Finished computation of generalized median graph.') | |||
| print('-----------------------------------------------------------') | |||
| print('Best SOD after initialization: ', self.__best_init_sum_of_distances) | |||
| print('Converged SOD: ', self.__converged_sum_of_distances) | |||
| if self.__refine: | |||
| print('Refined SOD: ', self.__sum_of_distances) | |||
| print('Overall runtime: ', self.__runtime) | |||
| print('Runtime of initialization: ', self.__runtime_initialized) | |||
| print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized) | |||
| if self.__refine: | |||
| print('Runtime of refinement: ', self.__runtime - self.__runtime_converged) | |||
| print('Number of initial medians: ', len(medians)) | |||
| total_itr = 0 | |||
| num_started_descents = 0 | |||
| for itr in self.__itrs: | |||
| total_itr += itr | |||
| if itr > 0: | |||
| num_started_descents += 1 | |||
| print('Size of graph collection: ', len(graph_ids)) | |||
| print('Number of started descents: ', num_started_descents) | |||
| print('Number of converged descents: ', self.__num_converged_descents) | |||
| print('Overall number of iterations: ', total_itr) | |||
| print('Overall number of times the order decreased: ', self.__num_decrease_order) | |||
| print('Overall number of times the order increased: ', self.__num_increase_order) | |||
| print('===========================================================\n') | |||
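| # Hypothetical driver (a sketch only; the constructor signature and set_options | |||
| # are assumed from the GEDLIB interface this class mirrors, and ged_env, | |||
| # graph_ids, set_median_id and gen_median_id are placeholders): | |||
| #   mge = MedianGraphEstimator(ged_env, constant_node_costs=False) | |||
| #   mge.set_options('--init-type RANDOM') | |||
| #   mge.run(graph_ids, set_median_id, gen_median_id) | |||
| #   print('SOD:', mge.get_sum_of_distances()) | |||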
| def get_sum_of_distances(self, state=''): | |||
| """Returns the sum of distances. | |||
| Parameters | |||
| ---------- | |||
| state : string | |||
| The state of the estimator. Can be 'initialized' or 'converged'. Default: "" | |||
| Returns | |||
| ------- | |||
| float | |||
| The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned. | |||
| """ | |||
| if not self.__median_available(): | |||
| raise Exception('No median has been computed. Call run() before calling get_sum_of_distances().') | |||
| if state == 'initialized': | |||
| return self.__best_init_sum_of_distances | |||
| if state == 'converged': | |||
| return self.__converged_sum_of_distances | |||
| return self.__sum_of_distances | |||
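| # Example (a sketch; assumes run() has already been called on an estimator `mge`): | |||
| #   mge.get_sum_of_distances('initialized')  # best SOD over the initial medians. | |||
| #   mge.get_sum_of_distances('converged')    # SOD after block gradient descent. | |||
| #   mge.get_sum_of_distances()               # refined SOD if refinement is enabled. | |||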
| def __set_default_options(self): | |||
| self.__init_type = 'RANDOM' | |||
| self.__num_random_inits = 10 | |||
| self.__desired_num_random_inits = 10 | |||
| self.__use_real_randomness = True | |||
| self.__seed = 0 | |||
| self.__refine = True | |||
| self.__time_limit_in_sec = 0 | |||
| self.__epsilon = 0.0001 | |||
| self.__max_itrs = 100 | |||
| self.__max_itrs_without_update = 3 | |||
| self.__num_inits_increase_order = 10 | |||
| self.__init_type_increase_order = 'K-MEANS++' | |||
| self.__max_itrs_increase_order = 10 | |||
| self.__print_to_stdout = 2 | |||
| def __construct_initial_medians(self, graph_ids, timer, initial_medians): | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('\n===========================================================') | |||
| print('Constructing initial median(s).') | |||
| print('-----------------------------------------------------------') | |||
| # Compute or sample the initial median(s). | |||
| initial_medians.clear() | |||
| if self.__init_type == 'MEDOID': | |||
| self.__compute_medoid(graph_ids, timer, initial_medians) | |||
| elif self.__init_type == 'MAX': | |||
| pass # @todo | |||
| # compute_max_order_graph_(graph_ids, initial_medians) | |||
| elif self.__init_type == 'MIN': | |||
| pass # @todo | |||
| # compute_min_order_graph_(graph_ids, initial_medians) | |||
| elif self.__init_type == 'MEAN': | |||
| pass # @todo | |||
| # compute_mean_order_graph_(graph_ids, initial_medians) | |||
| else: | |||
| pass # @todo | |||
| # sample_initial_medians_(graph_ids, initial_medians) | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('===========================================================') | |||
| def __compute_medoid(self, graph_ids, timer, initial_medians): | |||
| # Use method selected for initialization phase. | |||
| self.__ged_env.set_method(self.__init_method, self.__init_options) | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| progress = tqdm(desc='\rComputing medoid', total=len(graph_ids), file=sys.stdout) | |||
| # Compute the medoid. | |||
| medoid_id = graph_ids[0] | |||
| best_sum_of_distances = np.inf | |||
| for g_id in graph_ids: | |||
| if timer.expired(): | |||
| self.__state = AlgorithmState.CALLED | |||
| break | |||
| sum_of_distances = 0 | |||
| for h_id in graph_ids: | |||
| self.__ged_env.run_method(g_id, h_id) | |||
| sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id) | |||
| if sum_of_distances < best_sum_of_distances: | |||
| best_sum_of_distances = sum_of_distances | |||
| medoid_id = g_id | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| progress.update(1) | |||
| initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('\n') | |||
| def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): | |||
| if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): | |||
| if self.__state == AlgorithmState.TERMINATED: | |||
| self.__state = AlgorithmState.INITIALIZED | |||
| return True | |||
| return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) | |||
| def __update_median(self, graphs, median): | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('Updating median: ', end='') | |||
| # Store copy of the old median. | |||
| old_median = median.copy() # @todo: this is just a shallow copy. | |||
| # Update the node labels. | |||
| if self.__labeled_nodes: | |||
| self.__update_node_labels(graphs, median) | |||
| # Update the edges and their labels. | |||
| self.__update_edges(graphs, median) | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('done.') | |||
| return not self.__are_graphs_equal(median, old_median) | |||
| def __update_node_labels(self, graphs, median): | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('nodes ... ', end='') | |||
| # Iterate through all nodes of the median. | |||
| for i in range(0, nx.number_of_nodes(median)): | |||
| # print('i: ', i) | |||
| # Collect the labels of the substituted nodes. | |||
| node_labels = [] | |||
| for graph_id, graph in graphs.items(): | |||
| # print('graph_id: ', graph_id) | |||
| # print(self.__node_maps_from_median[graph_id]) | |||
| k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i) | |||
| # print('k: ', k) | |||
| if k != np.inf: | |||
| node_labels.append(graph.nodes[k]) | |||
| # Compute the median label and update the median. | |||
| if len(node_labels) > 0: | |||
| median_label = self.__ged_env.get_median_node_label(node_labels) | |||
| if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon: | |||
| nx.set_node_attributes(median, {i: median_label}) | |||
| def __update_edges(self, graphs, median): | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('edges ... ', end='') | |||
| # Clear the adjacency lists of the median and reset number of edges to 0. | |||
| median_edges = list(median.edges) | |||
| for (head, tail) in median_edges: | |||
| median.remove_edge(head, tail) | |||
| # @todo: what if edge is not labeled? | |||
| # Iterate through all possible edges (i,j) of the median. | |||
| for i in range(0, nx.number_of_nodes(median)): | |||
| for j in range(i + 1, nx.number_of_nodes(median)): | |||
| # Collect the labels of the edges to which (i,j) is mapped by the node maps. | |||
| edge_labels = [] | |||
| for graph_id, graph in graphs.items(): | |||
| k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i) | |||
| l = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], j) | |||
| if k != np.inf and l != np.inf: | |||
| if graph.has_edge(k, l): | |||
| edge_labels.append(graph.edges[(k, l)]) | |||
| # Compute the median edge label and the overall edge relabeling cost. | |||
| rel_cost = 0 | |||
| median_label = self.__ged_env.get_edge_label(1) | |||
| if median.has_edge(i, j): | |||
| median_label = median.edges[(i, j)] | |||
| if self.__labeled_edges and len(edge_labels) > 0: | |||
| new_median_label = self.__ged_env.median_edge_label(edge_labels) # @todo: check the API name; cf. get_median_node_label above. | |||
| if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon: | |||
| median_label = new_median_label | |||
| for edge_label in edge_labels: | |||
| rel_cost += self.__ged_env.get_edge_rel_cost(median_label, edge_label) | |||
| # Update the median. | |||
| if rel_cost < (self.__edge_ins_cost + self.__edge_del_cost) * len(edge_labels) - self.__edge_del_cost * len(graphs): | |||
| median.add_edge(i, j, **median_label) | |||
| else: | |||
| if median.has_edge(i, j): | |||
| median.remove_edge(i, j) | |||
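| # Rationale for the inequality above, derived from the cost terms: keeping edge | |||
| # (i, j) costs rel_cost plus one deletion per graph in which (i, j) has no image, | |||
| # i.e. rel_cost + self.__edge_del_cost * (len(graphs) - len(edge_labels)); dropping | |||
| # it instead costs one insertion per mapped edge, i.e. | |||
| # self.__edge_ins_cost * len(edge_labels). Rearranging gives the threshold used above. | |||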
| def __update_node_maps(self): | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| progress = tqdm(desc='\rUpdating node maps', total=len(self.__node_maps_from_median), file=sys.stdout) | |||
| # Update the node maps. | |||
| node_maps_were_modified = False | |||
| for graph_id in self.__node_maps_from_median: | |||
| self.__ged_env.run_method(self.__median_id, graph_id) | |||
| if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < self.__ged_env.get_induced_cost(self.__median_id, graph_id) - self.__epsilon: # @todo: see above. | |||
| self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) # @todo: node_map may not be assigned. | |||
| node_maps_were_modified = True | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| progress.update(1) | |||
| # Print information about current iteration. | |||
| if self.__print_to_stdout == 2: | |||
| print('\n') | |||
| # Return true if the node maps were modified. | |||
| return node_maps_were_modified | |||
| def __improve_sum_of_distances(self, timer): | |||
| pass | |||
| def __median_available(self): | |||
| return self.__median_id != np.inf | |||
| def __get_node_image_from_map(self, node_map, node): | |||
| """ | |||
| Return ID of the node mapping of `node` in `node_map`. | |||
| Parameters | |||
| ---------- | |||
| node_map : list[tuple(int, int)] | |||
| List of node maps where the mapping node is found. | |||
| node : int | |||
| The node whose image (mapping) is returned. | |||
| Raises | |||
| ------ | |||
| Exception | |||
| If the node with ID `node` is not contained in the source nodes of the node map. | |||
| Returns | |||
| ------- | |||
| int | |||
| ID of the mapping of `node`. | |||
| Notes | |||
| ----- | |||
| This function is not implemented in the `ged::MedianGraphEstimator` class of the `GEDLIB` library. Instead it is a Python implementation of the `ged::NodeMap::image` function. | |||
| """ | |||
| if node < len(node_map): | |||
| return node_map[node][1] if node_map[node][1] < len(node_map) else np.inf | |||
| else: | |||
| raise Exception('The node with ID ' + str(node) + ' is not contained in the source nodes of the node map.') | |||
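| # Illustration of the assumed node-map layout: a node map is a list of | |||
| # (source_node, image_node) pairs, e.g. node_map = [(0, 2), (1, 0), (2, 1)], | |||
| # for which __get_node_image_from_map(node_map, 1) returns 0; images >= | |||
| # len(node_map) encode dummy nodes and are reported as np.inf. | |||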
| def __are_graphs_equal(self, g1, g2): | |||
| """ | |||
| Check if the two graphs are equal. | |||
| Parameters | |||
| ---------- | |||
| g1 : NetworkX graph object | |||
| Graph 1 to be compared. | |||
| g2 : NetworkX graph object | |||
| Graph 2 to be compared. | |||
| Returns | |||
| ------- | |||
| bool | |||
| True if the two graphs are equal. | |||
| Notes | |||
| ----- | |||
| This is not an isomorphism check. Here the two graphs are considered equal if and only if their original_node_ids, nodes, all node labels, edges and all edge labels are equal. This function is specifically designed for class `MedianGraphEstimator` and should not be used elsewhere. | |||
| """ | |||
| # check original node ids. | |||
| if g1.graph['original_node_ids'] != g2.graph['original_node_ids']: | |||
| return False | |||
| # check nodes. | |||
| nlist1 = list(g1.nodes(data=True)) | |||
| nlist2 = list(g2.nodes(data=True)) | |||
| if nlist1 != nlist2: | |||
| return False | |||
| # check edges. | |||
| elist1 = list(g1.edges(data=True)) | |||
| elist2 = list(g2.edges(data=True)) | |||
| if elist1 != elist2: | |||
| return False | |||
| return True | |||
| def compute_my_cost(g, h, node_map): | |||
| cost = 0.0 | |||
| for node in g.nodes: | |||
| cost += 0 # @todo: placeholder; the per-node cost is not implemented yet. | |||
| @@ -1,215 +0,0 @@ | |||
| import sys | |||
| import pathlib | |||
| import numpy as np | |||
| import networkx as nx | |||
| from gedlibpy import librariesImport, gedlibpy | |||
| sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/") | |||
| import gklearn | |||
| def replace_graph_in_env(script, graph, old_id, label='median'): | |||
| """ | |||
| Replace a graph in the environment held by `script`. | |||
| If old_id is -1, add a new graph to the environment instead. | |||
| """ | |||
| if(old_id > -1): | |||
| script.PyClearGraph(old_id) | |||
| new_id = script.PyAddGraph(label) | |||
| for i in graph.nodes(): | |||
| script.PyAddNode(new_id,str(i),graph.node[i]) # !! strings are required by gedlib | |||
| for e in graph.edges: | |||
| script.PyAddEdge(new_id, str(e[0]),str(e[1]), {}) | |||
| script.PyInitEnv() | |||
| script.PySetMethod("IPFP", "") | |||
| script.PyInitMethod() | |||
| return new_id | |||
| # Draw the current median. | |||
| def draw_Letter_graph(graph): | |||
| import numpy as np | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| plt.figure() | |||
| pos = {} | |||
| for n in graph.nodes: | |||
| pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])]) | |||
| nx.draw_networkx(graph,pos) | |||
| plt.show() | |||
| #compute new mappings | |||
| def update_mappings(script,median_id,listID): | |||
| med_distances = {} | |||
| med_mappings = {} | |||
| sod = 0 | |||
| for i in range(0,len(listID)): | |||
| script.PyRunMethod(median_id,listID[i]) | |||
| med_distances[i] = script.PyGetUpperBound(median_id,listID[i]) | |||
| med_mappings[i] = script.PyGetForwardMap(median_id,listID[i]) | |||
| sod += med_distances[i] | |||
| return med_distances, med_mappings, sod | |||
| def calcul_Sij(all_mappings, all_graphs,i,j): | |||
| s_ij = 0 | |||
| for k in range(0,len(all_mappings)): | |||
| cur_graph = all_graphs[k] | |||
| cur_mapping = all_mappings[k] | |||
| size_graph = cur_graph.order() | |||
| if ((cur_mapping[i] < size_graph) and | |||
| (cur_mapping[j] < size_graph) and | |||
| cur_graph.has_edge(cur_mapping[i], cur_mapping[j])): | |||
| s_ij += 1 | |||
| return s_ij | |||
| # def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings): | |||
| # from scipy.stats.mstats import gmean | |||
| # for i in median.nodes(): | |||
| # for k in listIdSet: | |||
| # vectors = [] #np.zeros((len(listIdSet),2)) | |||
| # if(k != median_id): | |||
| # phi_i = mappings[k][i] | |||
| # if(phi_i < dataset[k].order()): | |||
| # vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])]) | |||
| # new_labels = gmean(vectors) | |||
| # median.node[i]['x'] = str(new_labels[0]) | |||
| # median.node[i]['y'] = str(new_labels[1]) | |||
| # return median | |||
| def update_median_nodes(median,dataset,mappings): | |||
| #update node attributes | |||
| for i in median.nodes(): | |||
| nb_sub=0 | |||
| mean_label = {'x' : 0, 'y' : 0} | |||
| for k in range(0,len(mappings)): | |||
| phi_i = mappings[k][i] | |||
| if ( phi_i < dataset[k].order() ): | |||
| nb_sub += 1 | |||
| mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x']) | |||
| mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y']) | |||
| median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub)) | |||
| median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub)) | |||
| return median | |||
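| # Note: the 0.75 factor above cancels out, since (1 / 0.75) * (0.75 * sum / nb_sub) | |||
| # is just the arithmetic mean of the mapped coordinates; it is kept to mirror the | |||
| # alpha = 0.75 convention used for Letter-HIGH (see update_median_edges below). | |||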
| def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425): | |||
| #for letter high, ceir = 1.7, alpha = 0.75 | |||
| size_dataset = len(dataset) | |||
| ratio_cei_cer = cer/(cei + cer) | |||
| threshold = size_dataset*ratio_cei_cer | |||
| order_graph_median = median.order() | |||
| for i in range(0,order_graph_median): | |||
| for j in range(i+1,order_graph_median): | |||
| s_ij = calcul_Sij(mappings,dataset,i,j) | |||
| if(s_ij > threshold): | |||
| median.add_edge(i,j) | |||
| else: | |||
| if(median.has_edge(i,j)): | |||
| median.remove_edge(i,j) | |||
| return median | |||
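| # Worked example of the rule above: with the defaults cei = cer = 0.425, | |||
| # ratio_cei_cer = 0.425 / 0.85 = 0.5, so threshold = 0.5 * len(dataset); the | |||
| # edge (i, j) is kept exactly when it is mapped to an existing edge in more | |||
| # than half of the graphs. | |||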
| def compute_median(script, listID, dataset,verbose=False): | |||
| """Compute a graph median of a dataset according to an environment | |||
| Parameters | |||
| script : an initialized gedlib environment | |||
| listID (list): a list of ID in script: encodes the dataset | |||
| dataset (list): corresponding graphs in networkX format. We assume that graph | |||
| listID[i] corresponds to dataset[i] | |||
| Returns: | |||
| The median as a networkX graph, its SOD, the list of SODs per iteration, and the set median. | |||
| """ | |||
| print(len(listID)) | |||
| median_set_index, median_set_sod = compute_median_set(script, listID) | |||
| print(median_set_index) | |||
| print(median_set_sod) | |||
| sods = [] | |||
| # Add the median to the environment. | |||
| set_median = dataset[median_set_index].copy() | |||
| median = dataset[median_set_index].copy() | |||
| cur_med_id = replace_graph_in_env(script,median,-1) | |||
| med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) | |||
| sods.append(cur_sod) | |||
| if(verbose): | |||
| print(cur_sod) | |||
| ite_max = 50 | |||
| old_sod = cur_sod * 2 | |||
| ite = 0 | |||
| epsilon = 0.001 | |||
| while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )): | |||
| median = update_median_nodes(median,dataset, med_mappings) | |||
| median = update_median_edges(dataset,med_mappings,median) | |||
| cur_med_id = replace_graph_in_env(script,median,cur_med_id) | |||
| med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) | |||
| sods.append(cur_sod) | |||
| if(verbose): | |||
| print(cur_sod) | |||
| ite += 1 | |||
| return median, cur_sod, sods, set_median | |||
| def compute_median_set(script,listID): | |||
| 'Return the index (in listID order) of the set median of the dataset, together with its SOD.' | |||
| # Compute the set median. | |||
| N=len(listID) | |||
| map_id_to_index = {} | |||
| map_index_to_id = {} | |||
| for i in range(0,len(listID)): | |||
| map_id_to_index[listID[i]] = i | |||
| map_index_to_id[i] = listID[i] | |||
| distances = np.zeros((N,N)) | |||
| for i in listID: | |||
| for j in listID: | |||
| script.PyRunMethod(i,j) | |||
| distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j) | |||
| median_set_index = np.argmin(np.sum(distances,0)) | |||
| sod = np.min(np.sum(distances,0)) | |||
| return median_set_index, sod | |||
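| # Hypothetical usage (a sketch; assumes a gedlibpy environment initialized as in | |||
| # the __main__ block below): | |||
| #   idx, sod = compute_median_set(gedlibpy, gedlibpy.PyGetAllGraphIds()) | |||
| #   print('set median is graph', idx, 'with SOD', sod) | |||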
| def _convertGraph(G): | |||
| """Convert a graph to the proper NetworkX format that can be | |||
| recognized by library gedlibpy. | |||
| """ | |||
| G_new = nx.Graph() | |||
| for nd, attrs in G.nodes(data=True): | |||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||
| # G_new.add_node(str(nd), x=str(attrs['attributes'][0]), | |||
| # y=str(attrs['attributes'][1])) | |||
| for nd1, nd2, attrs in G.edges(data=True): | |||
| G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| # G_new.add_edge(str(nd1), str(nd2)) | |||
| return G_new | |||
| if __name__ == "__main__": | |||
| # Load the dataset. | |||
| gedlibpy.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml') | |||
| gedlibpy.PySetEditCost("LETTER") | |||
| gedlibpy.PyInitEnv() | |||
| gedlibpy.PySetMethod("IPFP", "") | |||
| gedlibpy.PyInitMethod() | |||
| dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl") | |||
| listID = gedlibpy.PyGetAllGraphIds() | |||
| median, sod, sods, set_median = compute_median(gedlibpy, listID, dataset, verbose=True) | |||
| print(sod) | |||
| draw_Letter_graph(median) | |||
| @@ -1,15 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Mar 26 18:27:22 2020 | |||
| @author: ljia | |||
| """ | |||
| from gklearn.preimage.preimage_generator import PreimageGenerator | |||
| # from gklearn.utils.dataset import Dataset | |||
| class MedianPreimageGenerator(PreimageGenerator): | |||
| def __init__(self, mge, dataset): | |||
| self.__mge = mge | |||
| self.__dataset = dataset | |||
| @@ -1,108 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Mar 19 18:13:56 2020 | |||
| @author: ljia | |||
| """ | |||
| def options_string_to_options_map(options_string): | |||
| """Transforms an options string into an options map. | |||
| Parameters | |||
| ---------- | |||
| options_string : string | |||
| Options string of the form "[--<option> <arg>] [...]". | |||
| Returns | |||
| ------- | |||
| options_map : dict{string : string} | |||
| Map with one key-value pair (<option>, <arg>) for each option contained in the string. | |||
| """ | |||
| if options_string == '': | |||
| return {} | |||
| options_map = {} | |||
| words = [] | |||
| tokenize(options_string, ' ', words) | |||
| expect_option_name = True | |||
| for word in words: | |||
| if expect_option_name: | |||
| is_opt_name, word = is_option_name(word) | |||
| if is_opt_name: | |||
| option_name = word | |||
| if option_name in options_map: | |||
| raise Exception('Multiple specification of option "' + option_name + '".') | |||
| options_map[option_name] = '' | |||
| else: | |||
| raise Exception('Invalid options "' + options_string + '". Usage: options = "[--<option> <arg>] [...]"') | |||
| else: | |||
| is_opt_name, word = is_option_name(word) | |||
| if is_opt_name: | |||
| raise Exception('Invalid options "' + options_string + '". Usage: options = "[--<option> <arg>] [...]"') | |||
| else: | |||
| options_map[option_name] = word | |||
| expect_option_name = not expect_option_name | |||
| return options_map | |||
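| # Example: | |||
| #   options_string_to_options_map('--init-type RANDOM --seed 42') | |||
| #   # -> {'init-type': 'RANDOM', 'seed': '42'} | |||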
| def tokenize(sentence, sep, words): | |||
| """Separates a sentence into words separated by sep (unless contained in single quotes). | |||
| Parameters | |||
| ---------- | |||
| sentence : string | |||
| The sentence that should be tokenized. | |||
| sep : string | |||
| The separator. Must be different from "'". | |||
| words : list[string] | |||
| The obtained words. | |||
| """ | |||
| outside_quotes = True | |||
| word_length = 0 | |||
| pos_word_start = 0 | |||
| for pos in range(0, len(sentence)): | |||
| if sentence[pos] == '\'': | |||
| if not outside_quotes and pos < len(sentence) - 1: | |||
| if sentence[pos + 1] != sep: | |||
| raise Exception('Sentence contains closing single quote which is followed by a char different from ' + sep + '.') | |||
| word_length += 1 | |||
| outside_quotes = not outside_quotes | |||
| elif outside_quotes and sentence[pos] == sep: | |||
| if word_length > 0: | |||
| words.append(sentence[pos_word_start:pos_word_start + word_length]) | |||
| pos_word_start = pos + 1 | |||
| word_length = 0 | |||
| else: | |||
| word_length += 1 | |||
| if not outside_quotes: | |||
| raise Exception('Sentence contains unbalanced single quotes.') | |||
| if word_length > 0: | |||
| words.append(sentence[pos_word_start:pos_word_start + word_length]) | |||
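| # Example: | |||
| #   words = [] | |||
| #   tokenize('--option value', ' ', words) | |||
| #   # words == ['--option', 'value'] | |||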
| def is_option_name(word): | |||
| """Checks whether a word is an option name and, if so, removes the leading dashes. | |||
| Parameters | |||
| ---------- | |||
| word : string | |||
| Word. | |||
| Returns | |||
| ------- | |||
| is_option : bool | |||
| True if `word` is of the form "--<option>". | |||
| word : string | |||
| The word without the leading dashes. | |||
| """ | |||
| if word[0] == '\'': | |||
| word = word[1:len(word) - 1] # strip the enclosing single quotes. | |||
| return False, word | |||
| if len(word) < 3: | |||
| return False, word | |||
| if word[0] == '-' and word[1] == '-' and word[2] != '-': | |||
| word = word[2:] | |||
| return True, word | |||
| return False, word | |||
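| # Examples: | |||
| #   is_option_name('--seed')  # -> (True, 'seed') | |||
| #   is_option_name('42')      # -> (False, '42') | |||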
| @@ -1,201 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Wed Mar 20 10:12:15 2019 | |||
| Inferring a graph from path frequency. | |||
| @author: ljia | |||
| """ | |||
| #import numpy as np | |||
| import networkx as nx | |||
| from scipy.spatial.distance import hamming | |||
| import itertools | |||
| def SISF(K, v): | |||
| # @todo: not implemented; should return the inferred structure, or 'no solution' if none exists. | |||
| raise NotImplementedError('SISF is not implemented yet.') | |||
| def SISF_M(K, v): | |||
| # @todo: not implemented; `output` was never computed. | |||
| raise NotImplementedError('SISF_M is not implemented yet.') | |||
| def GIPF_tree(v_obj, K=1, alphabet=[0, 1]): | |||
| if K == 1: | |||
| n_graph = v_obj[0] + v_obj[1] | |||
| D_T, father_idx = getDynamicTable(n_graph, alphabet) | |||
| # get the vector the closest to v_obj. | |||
| if v_obj not in D_T: | |||
| print('no exact solution') | |||
| dis_lim = 1 / len(v_obj) # the possible shortest distance. | |||
| dis_min = 1.0 # minimum proportional distance | |||
| v_min = v_obj | |||
| for vc in D_T: | |||
| if vc[0] + vc[1] == n_graph: | |||
| # print(vc) | |||
| dis = hamming(vc, v_obj) | |||
| if dis < dis_min: | |||
| dis_min = dis | |||
| v_min = vc | |||
| if dis_min <= dis_lim: | |||
| break | |||
| v_obj = v_min | |||
| # obtain required graph by traceback procedure. | |||
| return getObjectGraph(v_obj, D_T, father_idx, alphabet), v_obj | |||
| def GIPF_M(K, v): | |||
| # @todo: not implemented; `G` was never computed. | |||
| raise NotImplementedError('GIPF_M is not implemented yet.') | |||
| def getDynamicTable(n_graph, alphabet=[0, 1]): | |||
| # init. When only one node exists. | |||
| D_T = [(1, 0, 0, 0, 0, 0), (0, 1, 0, 0, 0, 0)] | |||
| father_idx = [-1, -1] # index of each vector's father | |||
| # add possible vectors. | |||
| for idx, v in enumerate(D_T): | |||
| if v[0] + v[1] < n_graph: | |||
| D_T.append((v[0] + 1, v[1], v[2] + 2, v[3], v[4], v[5])) | |||
| D_T.append((v[0] + 1, v[1], v[2], v[3] + 1, v[4] + 1, v[5])) | |||
| D_T.append((v[0], v[1] + 1, v[2], v[3] + 1, v[4] + 1, v[5])) | |||
| D_T.append((v[0], v[1] + 1, v[2], v[3], v[4], v[5] + 2)) | |||
| father_idx += [idx, idx, idx, idx] | |||
| # D_T = itertools.chain([(1, 0, 0, 0, 0, 0)], [(0, 1, 0, 0, 0, 0)]) | |||
| # father_idx = itertools.chain([-1], [-1]) # index of each vector's father | |||
| # # add possible vectors. | |||
| # for idx, v in enumerate(D_T): | |||
| # if v[0] + v[1] < n_graph: | |||
| # D_T = itertools.chain(D_T, [(v[0] + 1, v[1], v[2] + 2, v[3], v[4], v[5])]) | |||
| # D_T = itertools.chain(D_T, [(v[0] + 1, v[1], v[2], v[3] + 1, v[4] + 1, v[5])]) | |||
| # D_T = itertools.chain(D_T, [(v[0], v[1] + 1, v[2], v[3] + 1, v[4] + 1, v[5])]) | |||
| # D_T = itertools.chain(D_T, [(v[0], v[1] + 1, v[2], v[3], v[4], v[5] + 2)]) | |||
| # father_idx = itertools.chain(father_idx, [idx, idx, idx, idx]) | |||
| return D_T, father_idx | |||
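| # Reading of the 6-dimensional vectors (inferred from the four transfers above, | |||
| # so treat this as an assumption): v = (#nodes labeled alphabet[0], #nodes labeled | |||
| # alphabet[1], #ordered length-1 paths (0,0), #(0,1), #(1,0), #(1,1)). E.g. | |||
| # attaching a new 0-labeled node to a 0-labeled node adds (1, 0, 2, 0, 0, 0), | |||
| # the new edge being counted once in each direction. | |||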
| def getObjectGraph(v_obj, D_T, father_idx, alphabet=[0, 1]): | |||
| g_obj = nx.Graph() | |||
| # do vector traceback. | |||
| v_tb = [list(v_obj)] # traceback vectors. | |||
| v_tb_idx = [D_T.index(v_obj)] # indices of traceback vectors. | |||
| while v_tb_idx[-1] > 1: | |||
| idx_pre = father_idx[v_tb_idx[-1]] | |||
| v_tb_idx.append(idx_pre) | |||
| v_tb.append(list(D_T[idx_pre])) | |||
| v_tb = v_tb[::-1] # reverse | |||
| # v_tb_idx = v_tb_idx[::-1] | |||
| # construct tree. | |||
| v_c = v_tb[0] # current vector. | |||
| if v_c[0] == 1: | |||
| g_obj.add_node(0, node_label=alphabet[0]) | |||
| else: | |||
| g_obj.add_node(0, node_label=alphabet[1]) | |||
| for vct in v_tb[1:]: | |||
| if vct[0] - v_c[0] == 1: | |||
| if vct[2] - v_c[2] == 2: # transfer 1 | |||
| label1 = alphabet[0] | |||
| label2 = alphabet[0] | |||
| else: # transfer 2 | |||
| label1 = alphabet[1] | |||
| label2 = alphabet[0] | |||
| else: | |||
| if vct[3] - v_c[3] == 1: # transfer 3 | |||
| label1 = alphabet[0] | |||
| label2 = alphabet[1] | |||
| else: # transfer 4 | |||
| label1 = alphabet[1] | |||
| label2 = alphabet[1] | |||
| for nd, attr in g_obj.nodes(data=True): | |||
| if attr['node_label'] == label1: | |||
| nb_node = nx.number_of_nodes(g_obj) | |||
| g_obj.add_node(nb_node, node_label=label2) | |||
| g_obj.add_edge(nd, nb_node) | |||
| break | |||
| v_c = vct | |||
| return g_obj | |||
| import random | |||
| def hierarchy_pos(G, root=None, width=1., vert_gap = 0.2, vert_loc = 0, xcenter = 0.5): | |||
| ''' | |||
| From Joel's answer at https://stackoverflow.com/a/29597209/2966723. | |||
| Licensed under Creative Commons Attribution-Share Alike | |||
| If the graph is a tree this will return the positions to plot this in a | |||
| hierarchical layout. | |||
| G: the graph (must be a tree) | |||
| root: the root node of current branch | |||
| - if the tree is directed and this is not given, | |||
| the root will be found and used | |||
| - if the tree is directed and this is given, then | |||
| the positions will be just for the descendants of this node. | |||
| - if the tree is undirected and not given, | |||
| then a random choice will be used. | |||
| width: horizontal space allocated for this branch - avoids overlap with other branches | |||
| vert_gap: gap between levels of hierarchy | |||
| vert_loc: vertical location of root | |||
| xcenter: horizontal location of root | |||
| ''' | |||
| if not nx.is_tree(G): | |||
| raise TypeError('cannot use hierarchy_pos on a graph that is not a tree') | |||
| if root is None: | |||
| if isinstance(G, nx.DiGraph): | |||
| root = next(iter(nx.topological_sort(G))) #allows back compatibility with nx version 1.11 | |||
| else: | |||
| root = random.choice(list(G.nodes)) | |||
| def _hierarchy_pos(G, root, width=1., vert_gap = 0.2, vert_loc = 0, xcenter = 0.5, pos = None, parent = None): | |||
| ''' | |||
| see hierarchy_pos docstring for most arguments | |||
| pos: a dict saying where all nodes go if they have been assigned | |||
| parent: parent of this branch. - only affects it if non-directed | |||
| ''' | |||
| if pos is None: | |||
| pos = {root:(xcenter,vert_loc)} | |||
| else: | |||
| pos[root] = (xcenter, vert_loc) | |||
| children = list(G.neighbors(root)) | |||
| if not isinstance(G, nx.DiGraph) and parent is not None: | |||
| children.remove(parent) | |||
| if len(children)!=0: | |||
| dx = width/len(children) | |||
| nextx = xcenter - width/2 - dx/2 | |||
| for child in children: | |||
| nextx += dx | |||
| pos = _hierarchy_pos(G,child, width = dx, vert_gap = vert_gap, | |||
| vert_loc = vert_loc-vert_gap, xcenter=nextx, | |||
| pos=pos, parent = root) | |||
| return pos | |||
| return _hierarchy_pos(G, root, width, vert_gap, vert_loc, xcenter) | |||
| if __name__ == '__main__': | |||
| v_obj = (6, 4, 10, 3, 3, 2) | |||
| # v_obj = (6, 5, 10, 3, 3, 2) | |||
| tree_obj, v_obj = GIPF_tree(v_obj) | |||
| print('One closest vector is', v_obj) | |||
| # plot | |||
| pos = hierarchy_pos(tree_obj, 0) | |||
| node_labels = nx.get_node_attributes(tree_obj, 'node_label') | |||
| nx.draw(tree_obj, pos=pos, labels=node_labels, with_labels=True) | |||
| @@ -1,12 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Mar 26 18:26:36 2020 | |||
| @author: ljia | |||
| """ | |||
| class PreimageGenerator(object): | |||
| def __init__(self): | |||
| pass | |||
| @@ -1,705 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Tue Apr 30 17:07:43 2019 | |||
| A graph pre-image method combining the iterative pre-image method of reference [1] | |||
| and the iterative alternate minimization (IAM) of reference [2]. | |||
| @author: ljia | |||
| @references: | |||
| [1] Gökhan H. Bakir, Alexander Zien, and Koji Tsuda. Learning to find graph | |||
| pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004. | |||
| [2] Generalized median graph via iterative alternate minimization. | |||
| """ | |||
| import sys | |||
| import numpy as np | |||
| from tqdm import tqdm | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| import random | |||
| from iam import iam_upgraded | |||
| from utils import dis_gstar, compute_kernel | |||
| def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, | |||
| gkernel, epsilon=0.001, InitIAMWithAllDk=False, | |||
| params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1, | |||
| 'ite_max': 50, 'epsilon': 0.001, | |||
| 'removeNodes': True, 'connected': False}, | |||
| params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', | |||
| 'edit_cost_constant': [], 'stabilizer': 'min', | |||
| 'repeat': 50}): | |||
| """This function constructs graph pre-image by the iterative pre-image | |||
| framework in reference [1], algorithm 1, where the step of generating new | |||
| graphs randomly is replaced by the IAM algorithm in reference [2]. | |||
| notes | |||
| ----- | |||
| Every time a set of n better graphs is acquired, their distances in kernel space are | |||
| compared with the k nearest ones, and the k nearest distances from the k+n | |||
| distances will be used as the new ones. | |||
| """ | |||
| # compute k nearest neighbors of phi in DN. | |||
| dis_all = [] # distance between g_star and each graph. | |||
| term3 = 0 | |||
| for i1, a1 in enumerate(alpha): | |||
| for i2, a2 in enumerate(alpha): | |||
| term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] | |||
| for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout): | |||
| dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3) | |||
| dis_all.append(dtemp) | |||
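| # dis_gstar (imported from utils) is assumed to compute the kernel-space distance | |||
| #   d(g, g*)^2 = k(g, g) - 2 * sum_i alpha_i * k(g, g_i) + term3, | |||
| # where term3 = sum_{i,j} alpha_i * alpha_j * k(g_i, g_j) is the constant part | |||
| # precomputed above. | |||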
| # sort | |||
| sort_idx = np.argsort(dis_all) | |||
| dis_k = [dis_all[idis] for idis in sort_idx[0:k]] # the k shortest distances | |||
| nb_best = len(np.argwhere(dis_k == dis_k[0]).flatten().tolist()) | |||
| ghat_list = [Gn_init[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN | |||
| if dis_k[0] == 0: # the exact pre-image. | |||
| print('The exact pre-image is found from the input dataset.') | |||
| return 0, ghat_list, 0, 0, 0 # match the arity of the final return below. | |||
| dhat = dis_k[0] # the nearest distance | |||
| # for g in ghat_list: | |||
| # draw_Letter_graph(g) | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors | |||
| # for gi in Gk: | |||
| # nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True) | |||
| ## nx.draw_networkx(gi) | |||
| # plt.show() | |||
| ## draw_Letter_graph(g) | |||
| # print(gi.nodes(data=True)) | |||
| # print(gi.edges(data=True)) | |||
| # i = 1 | |||
| r = 0 | |||
| itr_total = 0 | |||
| dis_of_each_itr = [dhat] | |||
| found = False | |||
| nb_updated = 0 | |||
| nb_updated_k = 0 | |||
| while r < r_max:# and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon: | |||
| print('\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-') | |||
| print('Current preimage iteration =', r) | |||
| print('Total preimage iteration =', itr_total, '\n') | |||
| found = False | |||
| Gn_nearest_median = [g.copy() for g in Gk] | |||
| if InitIAMWithAllDk: # each graph in D_k is used to initialize IAM. | |||
| ghat_new_list = [] | |||
| for g_tmp in Gk: | |||
| Gn_nearest_init = [g_tmp.copy()] | |||
| ghat_new_list_tmp, _, _ = iam_upgraded(Gn_nearest_median, | |||
| Gn_nearest_init, params_ged=params_ged, **params_iam) | |||
| ghat_new_list += ghat_new_list_tmp | |||
| else: # only the best graph in D_k is used to initialize IAM. | |||
| Gn_nearest_init = [g.copy() for g in Gk] | |||
| ghat_new_list, _, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init, | |||
| params_ged=params_ged, **params_iam) | |||
| # for g in g_tmp_list: | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # draw_Letter_graph(g) | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # compute distance between \psi and the new generated graphs. | |||
| knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(ghat_new_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), | |||
| len(ghat_new_list) + len(Gn_median)), # indices of Gn_median inside knew. | |||
| alpha, knew, withterm3=False)) | |||
| for idx_g, ghat_new in enumerate(ghat_new_list): | |||
| dhat_new = dhat_new_list[idx_g] | |||
| # if the new distance is smaller than the max of D_k. | |||
| if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon: | |||
| # check if the new distance is the same as one in D_k. | |||
| is_duplicate = False | |||
| for dis_tmp in dis_k[1:-1]: | |||
| if np.abs(dhat_new - dis_tmp) < epsilon: | |||
| is_duplicate = True | |||
| print('IAM: duplicate k nearest graph generated.') | |||
| break | |||
| if not is_duplicate: | |||
| if np.abs(dhat_new - dhat) < epsilon: | |||
| print('IAM: I am equal!') | |||
| # dhat = dhat_new | |||
| # ghat_list = [ghat_new.copy()] | |||
| else: | |||
| print('IAM: we got better k nearest neighbors!') | |||
| nb_updated_k += 1 | |||
| print('the k nearest neighbors are updated', | |||
| nb_updated_k, 'times.') | |||
| dis_k = [dhat_new] + dis_k[0:k-1] # add the new nearest distance. | |||
| Gk = [ghat_new.copy()] + Gk[0:k-1] # add the corresponding graph. | |||
| sort_idx = np.argsort(dis_k) | |||
| dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances. | |||
| Gk = [Gk[idx] for idx in sort_idx[0:k]] | |||
| if dhat_new < dhat: | |||
| print('IAM: I have smaller distance!') | |||
| print(str(dhat) + '->' + str(dhat_new)) | |||
| dhat = dhat_new | |||
| ghat_list = [Gk[0].copy()] | |||
| r = 0 | |||
| nb_updated += 1 | |||
| print('the graph is updated', nb_updated, 'times.') | |||
| nx.draw(Gk[0], labels=nx.get_node_attributes(Gk[0], 'atom'), | |||
| with_labels=True) | |||
| ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") | |||
| plt.show() | |||
| found = True | |||
| if not found: | |||
| r += 1 | |||
| dis_of_each_itr.append(dhat) | |||
| itr_total += 1 | |||
| print('\nthe k shortest distances are', dis_k) | |||
| print('the shortest distances for previous iterations are', dis_of_each_itr) | |||
| print('\n\nthe graph is updated', nb_updated, 'times.') | |||
| print('\nthe k nearest neighbors are updated', nb_updated_k, 'times.') | |||
| print('distances in kernel space:', dis_of_each_itr, '\n') | |||
| return dhat, ghat_list, dis_of_each_itr[-1], nb_updated, nb_updated_k | |||
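| # Hypothetical call (a sketch only; the kernel matrix, graph lists and indices are | |||
| # placeholders, and 'marginalizedkernel' is just one possible gkernel name): | |||
| #   dhat, ghat_list, dis_last, nb_up, nb_up_k = preimage_iam( | |||
| #       Gn_init, Gn_median, alpha=[0.5, 0.5], idx_gi=[0, 1], Kmatrix=K, | |||
| #       k=5, r_max=10, gkernel='marginalizedkernel') | |||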
| def preimage_iam_random_mix(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, | |||
| l_max, gkernel, epsilon=0.001, | |||
| InitIAMWithAllDk=False, InitRandomWithAllDk=True, | |||
| params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1, | |||
| 'ite_max': 50, 'epsilon': 0.001, | |||
| 'removeNodes': True, 'connected': False}, | |||
| params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', | |||
| 'method': 'IPFP', 'edit_cost_constant': [], | |||
| 'stabilizer': 'min', 'repeat': 50}): | |||
| """This function constructs graph pre-image by the iterative pre-image | |||
| framework in reference [1], algorithm 1, where new graphs are generated | |||
| randomly and by the IAM algorithm in reference [2]. | |||
| notes | |||
| ----- | |||
| Every time a set of n better graphs is acquired, their distances in kernel space are | |||
| compared with the k nearest ones, and the k nearest distances from the k+n | |||
| distances will be used as the new ones. | |||
| """ | |||
| Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init] | |||
| # compute k nearest neighbors of phi in DN. | |||
| dis_all = [] # distance between g_star and each graph. | |||
| term3 = 0 | |||
| for i1, a1 in enumerate(alpha): | |||
| for i2, a2 in enumerate(alpha): | |||
| term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] | |||
| for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout): | |||
| dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3) | |||
| dis_all.append(dtemp) | |||
| # sort | |||
| sort_idx = np.argsort(dis_all) | |||
| dis_k = [dis_all[idis] for idis in sort_idx[0:k]] # the k shortest distances | |||
| nb_best = len(np.argwhere(dis_k == dis_k[0]).flatten().tolist()) | |||
| ghat_list = [Gn_init[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of psi in DN | |||
| if dis_k[0] == 0: # the exact pre-image. | |||
| print('The exact pre-image is found from the input dataset.') | |||
| return 0, ghat_list, 0, 0, 0, 0, 0 # match the arity of the final return below. | |||
| dhat = dis_k[0] # the nearest distance | |||
| # for g in ghat_list: | |||
| # draw_Letter_graph(g) | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors | |||
| # for gi in Gk: | |||
| # nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True) | |||
| ## nx.draw_networkx(gi) | |||
| # plt.show() | |||
| ## draw_Letter_graph(g) | |||
| # print(gi.nodes(data=True)) | |||
| # print(gi.edges(data=True)) | |||
| r = 0 | |||
| itr_total = 0 | |||
| dis_of_each_itr = [dhat] | |||
| nb_updated_iam = 0 | |||
| nb_updated_k_iam = 0 | |||
| nb_updated_random = 0 | |||
| nb_updated_k_random = 0 | |||
| # is_iam_duplicate = False | |||
| while r < r_max: # and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon: | |||
| print('\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-') | |||
| print('Current preimage iteration =', r) | |||
| print('Total preimage iteration =', itr_total, '\n') | |||
| found_iam = False | |||
| Gn_nearest_median = [g.copy() for g in Gk] | |||
| if InitIAMWithAllDk: # each graph in D_k is used to initialize IAM. | |||
| ghat_new_list = [] | |||
| for g_tmp in Gk: | |||
| Gn_nearest_init = [g_tmp.copy()] | |||
| ghat_new_list_tmp, _ = iam_upgraded(Gn_nearest_median, | |||
| Gn_nearest_init, params_ged=params_ged, **params_iam) | |||
| ghat_new_list += ghat_new_list_tmp | |||
| else: # only the best graph in D_k is used to initialize IAM. | |||
| Gn_nearest_init = [g.copy() for g in Gk] | |||
| ghat_new_list, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init, | |||
| params_ged=params_ged, **params_iam) | |||
| # for g in g_tmp_list: | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # draw_Letter_graph(g) | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # compute distance between \psi and the new generated graphs. | |||
| knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(ghat_new_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), | |||
| len(ghat_new_list) + len(Gn_median)), # indices of Gn_median inside knew. | |||
| alpha, knew, withterm3=False)) | |||
| # find the new k nearest graphs. | |||
| for idx_g, ghat_new in enumerate(ghat_new_list): | |||
| dhat_new = dhat_new_list[idx_g] | |||
| # if the new distance is smaller than the max of D_k. | |||
| if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon: | |||
| # check if the new distance is the same as one in D_k. | |||
| is_duplicate = False | |||
| for dis_tmp in dis_k[1:-1]: | |||
| if np.abs(dhat_new - dis_tmp) < epsilon: | |||
| is_duplicate = True | |||
| print('IAM: duplicate k nearest graph generated.') | |||
| break | |||
| if not is_duplicate: | |||
| if np.abs(dhat_new - dhat) < epsilon: | |||
| print('IAM: I am equal!') | |||
| # dhat = dhat_new | |||
| # ghat_list = [ghat_new.copy()] | |||
| else: | |||
| print('IAM: we got better k nearest neighbors!') | |||
| nb_updated_k_iam += 1 | |||
| print('the k nearest neighbors are updated', | |||
| nb_updated_k_iam, 'times.') | |||
| dis_k = [dhat_new] + dis_k[0:k-1] # add the new nearest distance. | |||
| Gk = [ghat_new.copy()] + Gk[0:k-1] # add the corresponding graph. | |||
| sort_idx = np.argsort(dis_k) | |||
| dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances. | |||
| Gk = [Gk[idx] for idx in sort_idx[0:k]] | |||
| if dhat_new < dhat: | |||
| print('IAM: I have smaller distance!') | |||
| print(str(dhat) + '->' + str(dhat_new)) | |||
| dhat = dhat_new | |||
| ghat_list = [Gk[0].copy()] | |||
| r = 0 | |||
| nb_updated_iam += 1 | |||
| print('the graph is updated by IAM', nb_updated_iam, | |||
| 'times.') | |||
| nx.draw(Gk[0], labels=nx.get_node_attributes(Gk[0], 'atom'), | |||
| with_labels=True) | |||
| ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") | |||
| plt.show() | |||
| found_iam = True | |||
| # when new distance is not smaller than the max of D_k, use random generation. | |||
| if not found_iam: | |||
| print('Distance not better, switching to random generation now.') | |||
| print(str(dhat) + '->' + str(dhat_new)) | |||
| if InitRandomWithAllDk: # use all k nearest graphs as the initials. | |||
| init_list = [g_init.copy() for g_init in Gk] | |||
| else: # use just the nearest graph as the initial. | |||
| init_list = [Gk[0].copy()] | |||
| # number of edges to be changed. | |||
| if len(init_list) == 1: | |||
| # @todo what if the log is negative? how to choose alpha (scalar)? seems fdgs is always 1. | |||
| # fdgs = dhat_new | |||
| fdgs = nb_updated_random + 1 | |||
| if fdgs < 1: | |||
| fdgs = 1 | |||
| fdgs = int(np.ceil(np.log(fdgs))) | |||
| if fdgs < 1: | |||
| fdgs += 1 | |||
| # fdgs = nb_updated_random + 1 # @todo: | |||
| fdgs_list = [fdgs] | |||
| else: | |||
| # @todo what if the log is negative? how to choose alpha (scalar)? | |||
| fdgs_list = np.array(dis_k[:]) | |||
| if np.min(fdgs_list) < 1: | |||
| fdgs_list /= dis_k[0] | |||
| fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))] | |||
| if np.min(fdgs_list) < 1: | |||
| fdgs_list = np.array(fdgs_list) + 1 | |||
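| # E.g. in the single-initial case, after 6 random updates fdgs = ceil(log(6 + 1)) = 2, | |||
| # so 2 edges are toggled per candidate graph. | |||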
| l = 0 | |||
| found_random = False | |||
| while l < l_max and not found_random: | |||
| for idx_g, g_tmp in enumerate(init_list): | |||
| # add and delete edges. | |||
| ghat_new = nx.convert_node_labels_to_integers(g_tmp.copy()) | |||
| # @todo: should we use just half of the adjacency matrix for undirected graphs? | |||
| nb_vpairs = nx.number_of_nodes(ghat_new) * (nx.number_of_nodes(ghat_new) - 1) | |||
| np.random.seed() | |||
| # which edges to change. | |||
| # @todo: what if fdgs is bigger than nb_vpairs? | |||
| idx_change = random.sample(range(nb_vpairs), fdgs_list[idx_g] if | |||
| fdgs_list[idx_g] < nb_vpairs else nb_vpairs) | |||
| # idx_change = np.random.randint(0, nx.number_of_nodes(gs) * | |||
| # (nx.number_of_nodes(gs) - 1), fdgs) | |||
| for item in idx_change: | |||
| node1 = int(item / (nx.number_of_nodes(ghat_new) - 1)) | |||
| node2 = (item - node1 * (nx.number_of_nodes(ghat_new) - 1)) | |||
| if node2 >= node1: # skip the self pair. | |||
| node2 += 1 | |||
| # @todo: is the randomness correct? | |||
| if not ghat_new.has_edge(node1, node2): | |||
| ghat_new.add_edge(node1, node2) | |||
| # nx.draw_networkx(gs) | |||
| # plt.show() | |||
| # nx.draw_networkx(ghat_new) | |||
| # plt.show() | |||
| else: | |||
| ghat_new.remove_edge(node1, node2) | |||
| # nx.draw_networkx(gs) | |||
| # plt.show() | |||
| # nx.draw_networkx(ghat_new) | |||
| # plt.show() | |||
| # nx.draw_networkx(ghat_new) | |||
| # plt.show() | |||
| # compute distance between \psi and the new generated graph. | |||
| knew = compute_kernel([ghat_new] + Gn_median, gkernel, verbose=False) | |||
| dhat_new = dis_gstar(0, range(1, len(Gn_median) + 1), | |||
| alpha, knew, withterm3=False) | |||
| # @todo: the new distance is smaller or also equal? | |||
| if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon: | |||
| # check if the new distance is the same as one in D_k. | |||
| is_duplicate = False | |||
| for dis_tmp in dis_k[1:-1]: | |||
| if np.abs(dhat_new - dis_tmp) < epsilon: | |||
| is_duplicate = True | |||
| print('Random: duplicate k nearest graph generated.') | |||
| break | |||
| if not is_duplicate: | |||
| if np.abs(dhat_new - dhat) < epsilon: | |||
| print('Random: I am equal!') | |||
| # dhat = dhat_new | |||
| # ghat_list = [ghat_new.copy()] | |||
| else: | |||
| print('Random: we got better k nearest neighbors!') | |||
| print('l =', str(l)) | |||
| nb_updated_k_random += 1 | |||
| print('the k nearest neighbors are updated by random generation', | |||
| nb_updated_k_random, 'times.') | |||
| dis_k = [dhat_new] + dis_k # add the new nearest distances. | |||
| Gk = [ghat_new.copy()] + Gk # add the corresponding graphs. | |||
| sort_idx = np.argsort(dis_k) | |||
| dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances. | |||
| Gk = [Gk[idx] for idx in sort_idx[0:k]] | |||
| if dhat_new < dhat: | |||
| print('\nRandom: I am smaller!') | |||
| print('l =', str(l)) | |||
| print(dhat, '->', dhat_new) | |||
| dhat = dhat_new | |||
| ghat_list = [ghat_new.copy()] | |||
| r = 0 | |||
| nb_updated_random += 1 | |||
| print('the graph is updated by random generation', | |||
| nb_updated_random, 'times.') | |||
| nx.draw(ghat_new, labels=nx.get_node_attributes(ghat_new, 'atom'), | |||
| with_labels=True) | |||
| ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") | |||
| plt.show() | |||
| found_random = True | |||
| break | |||
| l += 1 | |||
| if not found_random: # l == l_max: | |||
| r += 1 | |||
| dis_of_each_itr.append(dhat) | |||
| itr_total += 1 | |||
| print('\nthe k shortest distances are', dis_k) | |||
| print('the shortest distances for previous iterations are', dis_of_each_itr) | |||
| print('\n\nthe graph is updated by IAM', nb_updated_iam, 'times, and by random generation', | |||
| nb_updated_random, 'times.') | |||
| print('\nthe k nearest neighbors are updated by IAM', nb_updated_k_iam, | |||
| 'times, and by random generation', nb_updated_k_random, 'times.') | |||
| print('distances in kernel space:', dis_of_each_itr, '\n') | |||
| return dhat, ghat_list, dis_of_each_itr[-1], \ | |||
| nb_updated_iam, nb_updated_random, nb_updated_k_iam, nb_updated_k_random | |||
| ############################################################################### | |||
| # Old implementations. | |||
| #def gk_iam(Gn, alpha): | |||
| # """This function constructs graph pre-image by the iterative pre-image | |||
| # framework in reference [1], algorithm 1, where the step of generating new | |||
| # graphs randomly is replaced by the IAM algorithm in reference [2]. | |||
| # | |||
| # notes | |||
| # ----- | |||
| # Every time a better graph is acquired, the older one is replaced by it. | |||
| # """ | |||
| # pass | |||
| # # compute k nearest neighbors of phi in DN. | |||
| # dis_list = [] # distance between g_star and each graph. | |||
| # for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout): | |||
| # dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * | |||
| # k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha * | |||
| # (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * | |||
| # k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) | |||
| # dis_list.append(dtemp) | |||
| # | |||
| # # sort | |||
| # sort_idx = np.argsort(dis_list) | |||
| # dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] | |||
| # g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN | |||
| # if dis_gs[0] == 0: # the exact pre-image. | |||
| # print('The exact pre-image is found from the input dataset.') | |||
| # return 0, g0hat | |||
| # dhat = dis_gs[0] # the nearest distance | |||
| # Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors | |||
| # gihat_list = [] | |||
| # | |||
| ## i = 1 | |||
| # r = 1 | |||
| # while r < r_max: | |||
| # print('r =', r) | |||
| ## found = False | |||
| # Gs_nearest = Gk + gihat_list | |||
| # g_tmp = iam(Gs_nearest) | |||
| # | |||
| # # compute distance between \psi and the new generated graph. | |||
| # knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None, | |||
| # p_quit=lmbda, n_iteration=20, remove_totters=False, | |||
| # n_jobs=multiprocessing.cpu_count(), verbose=False) | |||
| # dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) * | |||
| # knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha * | |||
| # (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * | |||
| # k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) | |||
| # if dnew <= dhat: # the new distance is smaller | |||
| # print('I am smaller!') | |||
| # dhat = dnew | |||
| # g_new = g_tmp.copy() # found better graph. | |||
| # gihat_list = [g_new] | |||
| # dis_gs.append(dhat) | |||
| # r = 0 | |||
| # else: | |||
| # r += 1 | |||
| # | |||
| # ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list) | |||
| # | |||
| # return dhat, ghat | |||
| #def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max): | |||
| # """This function constructs graph pre-image by the iterative pre-image | |||
| # framework in reference [1], algorithm 1, where the step of generating new | |||
| # graphs randomly is replaced by the IAM algorithm in reference [2]. | |||
| # | |||
| # notes | |||
| # ----- | |||
| # Every time a better graph is acquired, its distance in kernel space is | |||
| # compared with the k nearest ones, and the k nearest distances from the k+1 | |||
| # distances will be used as the new ones. | |||
| # """ | |||
| # # compute k nearest neighbors of phi in DN. | |||
| # dis_list = [] # distance between g_star and each graph. | |||
| # for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout): | |||
| # dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix) | |||
| ## dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * | |||
| ## k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha * | |||
| ## (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha * | |||
| ## k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6]) | |||
| # dis_list.append(dtemp) | |||
| # | |||
| # # sort | |||
| # sort_idx = np.argsort(dis_list) | |||
| # dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances | |||
| # g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN | |||
| # if dis_gs[0] == 0: # the exact pre-image. | |||
| # print('The exact pre-image is found from the input dataset.') | |||
| # return 0, g0hat | |||
| # dhat = dis_gs[0] # the nearest distance | |||
| # ghat = g0hat.copy() | |||
| # Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors | |||
| # for gi in Gk: | |||
| # nx.draw_networkx(gi) | |||
| # plt.show() | |||
| # print(gi.nodes(data=True)) | |||
| # print(gi.edges(data=True)) | |||
| # Gs_nearest = Gk.copy() | |||
| ## gihat_list = [] | |||
| # | |||
| ## i = 1 | |||
| # r = 1 | |||
| # while r < r_max: | |||
| # print('r =', r) | |||
| ## found = False | |||
| ## Gs_nearest = Gk + gihat_list | |||
| ## g_tmp = iam(Gs_nearest) | |||
| # g_tmp = test_iam_with_more_graphs_as_init(Gs_nearest, Gs_nearest, c_ei=1, c_er=1, c_es=1) | |||
| # nx.draw_networkx(g_tmp) | |||
| # plt.show() | |||
| # print(g_tmp.nodes(data=True)) | |||
| # print(g_tmp.edges(data=True)) | |||
| # | |||
| # # compute distance between \psi and the new generated graph. | |||
| # gi_list = [Gn[i] for i in idx_gi] | |||
| # knew = compute_kernel([g_tmp] + gi_list, 'untilhpathkernel', False) | |||
| # dnew = dis_gstar(0, range(1, len(gi_list) + 1), alpha, knew) | |||
| # | |||
| ## dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] * | |||
| ## knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] * | |||
| ## alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] * | |||
| ## k_g1_list[1] + alpha[1] * alpha[1] * k_list[1]) | |||
| # if dnew <= dhat and g_tmp != ghat: # the new distance is smaller | |||
| # print('I am smaller!') | |||
| # print(str(dhat) + '->' + str(dnew)) | |||
| ## nx.draw_networkx(ghat) | |||
| ## plt.show() | |||
| ## print('->') | |||
| ## nx.draw_networkx(g_tmp) | |||
| ## plt.show() | |||
| # | |||
| # dhat = dnew | |||
| # g_new = g_tmp.copy() # found better graph. | |||
| # ghat = g_tmp.copy() | |||
| # dis_gs.append(dhat) # add the new nearest distance. | |||
| # Gs_nearest.append(g_new) # add the corresponding graph. | |||
| # sort_idx = np.argsort(dis_gs) | |||
| # dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances. | |||
| # Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]] | |||
| # r = 0 | |||
| # else: | |||
| # r += 1 | |||
| # | |||
| # return dhat, ghat | |||
| #def gk_iam_nearest_multi(Gn, alpha, idx_gi, Kmatrix, k, r_max): | |||
| # """This function constructs graph pre-image by the iterative pre-image | |||
| # framework in reference [1], algorithm 1, where the step of generating new | |||
| # graphs randomly is replaced by the IAM algorithm in reference [2]. | |||
| # | |||
| # notes | |||
| # ----- | |||
| # Every time a set of n better graphs is acquired, their distances in kernel space are | |||
| # compared with the k nearest ones, and the k nearest distances from the k+n | |||
| # distances will be used as the new ones. | |||
| # """ | |||
| # Gn_median = [Gn[idx].copy() for idx in idx_gi] | |||
| # # compute k nearest neighbors of phi in DN. | |||
| # dis_list = [] # distance between g_star and each graph. | |||
| # for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout): | |||
| # dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix) | |||
| ## dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * | |||
| ## k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha * | |||
| ## (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha * | |||
| ## k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6]) | |||
| # dis_list.append(dtemp) | |||
| # | |||
| # # sort | |||
| # sort_idx = np.argsort(dis_list) | |||
| # dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances | |||
| # nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist()) | |||
| # g0hat_list = [Gn[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN | |||
| # if dis_gs[0] == 0: # the exact pre-image. | |||
| # print('The exact pre-image is found from the input dataset.') | |||
| # return 0, g0hat_list | |||
| # dhat = dis_gs[0] # the nearest distance | |||
| # ghat_list = [g.copy() for g in g0hat_list] | |||
| # for g in ghat_list: | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors | |||
| # for gi in Gk: | |||
| # nx.draw_networkx(gi) | |||
| # plt.show() | |||
| # print(gi.nodes(data=True)) | |||
| # print(gi.edges(data=True)) | |||
| # Gs_nearest = Gk.copy() | |||
| ## gihat_list = [] | |||
| # | |||
| ## i = 1 | |||
| # r = 1 | |||
| # while r < r_max: | |||
| # print('r =', r) | |||
| ## found = False | |||
| ## Gs_nearest = Gk + gihat_list | |||
| ## g_tmp = iam(Gs_nearest) | |||
| # g_tmp_list = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations( | |||
| # Gn_median, Gs_nearest, c_ei=1, c_er=1, c_es=1) | |||
| # for g in g_tmp_list: | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # | |||
| # # compute distance between \psi and the new generated graphs. | |||
| # gi_list = [Gn[i] for i in idx_gi] | |||
| # knew = compute_kernel(g_tmp_list + gi_list, 'marginalizedkernel', False) | |||
| # dnew_list = [] | |||
| # for idx, g_tmp in enumerate(g_tmp_list): | |||
| # dnew_list.append(dis_gstar(idx, range(len(g_tmp_list), | |||
| # len(g_tmp_list) + len(gi_list) + 1), alpha, knew)) | |||
| # | |||
| ## dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] * | |||
| ## knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] * | |||
| ## alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] * | |||
| ## k_g1_list[1] + alpha[1] * alpha[1] * k_list[1]) | |||
| # | |||
| # # find the new k nearest graphs. | |||
| # dis_gs = dnew_list + dis_gs # add the new nearest distances. | |||
| # Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs. | |||
| # sort_idx = np.argsort(dis_gs) | |||
| # if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0: | |||
| # print('We got better k nearest neighbors! Hurray!') | |||
| # dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances. | |||
| # print(dis_gs[-1]) | |||
| # Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]] | |||
| # nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist()) | |||
| # if len([i for i in sort_idx[0:nb_best] if i < len(dnew_list)]) > 0: | |||
| # print('I have smaller or equal distance!') | |||
| # dhat = dis_gs[0] | |||
| # print(str(dhat) + '->' + str(dhat)) | |||
| # idx_best_list = np.argwhere(dnew_list == dhat).flatten().tolist() | |||
| # ghat_list = [g_tmp_list[idx].copy() for idx in idx_best_list] | |||
| # for g in ghat_list: | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # r = 0 | |||
| # else: | |||
| # r += 1 | |||
| # | |||
| # return dhat, ghat_list | |||
| @@ -1,309 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Wed Mar 6 16:03:11 2019 | |||
| pre-image | |||
| @author: ljia | |||
| """ | |||
| import sys | |||
| import numpy as np | |||
| import random | |||
| from tqdm import tqdm | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| from gklearn.preimage.utils import compute_kernel, dis_gstar | |||
| def preimage_random(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, l, gkernel): | |||
| Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init] | |||
| # compute k nearest neighbors of phi in DN. | |||
| dis_list = [] # distance between g_star and each graph. | |||
| term3 = 0 | |||
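# term3 = sum_{i,j} alpha_i * alpha_j * K(g_i, g_j) is the constant
# <g*, g*> part of the squared kernel distance to the weighted mean g*.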
| for i1, a1 in enumerate(alpha): | |||
| for i2, a2 in enumerate(alpha): | |||
| term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] | |||
| for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout): | |||
| dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3) | |||
| dis_list.append(dtemp) | |||
| # print(np.max(dis_list)) | |||
| # print(np.min(dis_list)) | |||
| # print(np.min([item for item in dis_list if item != 0])) | |||
| # print(np.mean(dis_list)) | |||
| # sort | |||
| sort_idx = np.argsort(dis_list) | |||
| dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances | |||
| nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist()) | |||
| g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN | |||
| if dis_gs[0] == 0: # the exact pre-image. | |||
| print('The exact pre-image is found from the input dataset.') | |||
| return 0, g0hat_list[0], 0 | |||
| dhat = dis_gs[0] # the nearest distance | |||
| # ghat_list = [g.copy() for g in g0hat_list] | |||
| # for g in ghat_list: | |||
| # draw_Letter_graph(g) | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors | |||
| # for gi in Gk: | |||
| ## nx.draw_networkx(gi) | |||
| ## plt.show() | |||
| # draw_Letter_graph(g) | |||
| # print(gi.nodes(data=True)) | |||
| # print(gi.edges(data=True)) | |||
| Gs_nearest = [g.copy() for g in Gk] | |||
| gihat_list = [] | |||
| dihat_list = [] | |||
| # i = 1 | |||
| r = 0 | |||
| # sod_list = [dhat] | |||
| # found = False | |||
| dis_of_each_itr = [dhat] | |||
| nb_updated = 0 | |||
| g_best = [] | |||
| while r < r_max: | |||
| print('\nr =', r) | |||
| print('itr for gk =', nb_updated, '\n') | |||
| found = False | |||
| dis_bests = dis_gs + dihat_list | |||
# @todo what if the log is negative? how to choose alpha (scalar)?
| fdgs_list = np.array(dis_bests) | |||
| if np.min(fdgs_list) < 1: | |||
| fdgs_list /= np.min(dis_bests) | |||
| fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))] | |||
| if np.min(fdgs_list) < 1: | |||
| fdgs_list = np.array(fdgs_list) + 1 | |||
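# fdgs_list[i] ~ ceil(log(d_i)) is the number of vertex pairs to perturb for
# neighbor i; the two guards above rescale distances below 1 and bump zero
# counts so that at least one edge is always toggled.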
| for ig, gs in enumerate(Gs_nearest + gihat_list): | |||
| # nx.draw_networkx(gs) | |||
| # plt.show() | |||
for trial in range(0, l):
# for trial in tqdm(range(0, l), desc='l loops', file=sys.stdout):
| # add and delete edges. | |||
| gtemp = gs.copy() | |||
| np.random.seed() | |||
| # which edges to change. | |||
| # @todo: should we use just half of the adjacency matrix for undirected graphs? | |||
| nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1) | |||
| # @todo: what if fdgs is bigger than nb_vpairs? | |||
| idx_change = random.sample(range(nb_vpairs), fdgs_list[ig] if | |||
| fdgs_list[ig] < nb_vpairs else nb_vpairs) | |||
| # idx_change = np.random.randint(0, nx.number_of_nodes(gs) * | |||
| # (nx.number_of_nodes(gs) - 1), fdgs) | |||
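# Each flat index in [0, n * (n - 1)) encodes an ordered node pair
# (node1, node2) with node1 != node2: node1 = index // (n - 1), and node2
# is shifted by one past the diagonal to skip the self pair.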
| for item in idx_change: | |||
| node1 = int(item / (nx.number_of_nodes(gs) - 1)) | |||
| node2 = (item - node1 * (nx.number_of_nodes(gs) - 1)) | |||
| if node2 >= node1: # skip the self pair. | |||
| node2 += 1 | |||
| # @todo: is the randomness correct? | |||
| if not gtemp.has_edge(node1, node2): | |||
| gtemp.add_edge(node1, node2) | |||
| # nx.draw_networkx(gs) | |||
| # plt.show() | |||
| # nx.draw_networkx(gtemp) | |||
| # plt.show() | |||
| else: | |||
| gtemp.remove_edge(node1, node2) | |||
| # nx.draw_networkx(gs) | |||
| # plt.show() | |||
| # nx.draw_networkx(gtemp) | |||
| # plt.show() | |||
| # nx.draw_networkx(gtemp) | |||
| # plt.show() | |||
| # compute distance between \psi and the new generated graph. | |||
| # knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None, | |||
| # p_quit=lmbda, n_iteration=20, remove_totters=False, | |||
| # n_jobs=multiprocessing.cpu_count(), verbose=False) | |||
| knew = compute_kernel([gtemp] + Gn_median, gkernel, verbose=False) | |||
| dnew = dis_gstar(0, range(1, len(Gn_median) + 1), alpha, knew, | |||
| withterm3=False) | |||
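# Note: the constant term3 (the <g*, g*> part) is omitted here, so dnew is
# comparable across candidates but is not on exactly the same scale as
# distances computed with the full expansion.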
| if dnew <= dhat: # @todo: the new distance is smaller or also equal? | |||
| if dnew < dhat: | |||
| print('\nI am smaller!') | |||
print('ig =', str(ig), ', l =', str(trial))
| print(dhat, '->', dnew) | |||
| nb_updated += 1 | |||
| elif dnew == dhat: | |||
| print('I am equal!') | |||
| # nx.draw_networkx(gtemp) | |||
| # plt.show() | |||
| # print(gtemp.nodes(data=True)) | |||
| # print(gtemp.edges(data=True)) | |||
| dhat = dnew | |||
| gnew = gtemp.copy() | |||
| found = True # found better graph. | |||
| if found: | |||
| r = 0 | |||
| gihat_list = [gnew] | |||
| dihat_list = [dhat] | |||
| else: | |||
| r += 1 | |||
| dis_of_each_itr.append(dhat) | |||
| print('the shortest distances for previous iterations are', dis_of_each_itr) | |||
| # dis_best.append(dhat) | |||
| g_best = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0]) | |||
| print('distances in kernel space:', dis_of_each_itr, '\n') | |||
| return dhat, g_best, nb_updated | |||
| # return 0, 0, 0 | |||
| if __name__ == '__main__': | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| # 'extra_params': {}} # node/edge symb | |||
| ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||
| 'extra_params': {}} # node nsymb | |||
| # ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds', | |||
| # 'extra_params': {}} | |||
| # ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| # 'extra_params': {}} # node symb | |||
| DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| #DN = DN[0:10] | |||
lmbda = 0.03 # termination probability
| r_max = 3 # 10 # iteration limit. | |||
| l = 500 | |||
| alpha_range = np.linspace(0.5, 0.5, 1) | |||
| #alpha_range = np.linspace(0.1, 0.9, 9) | |||
| k = 10 # 5 # k nearest neighbors | |||
| # randomly select two molecules | |||
| #np.random.seed(1) | |||
| #idx1, idx2 = np.random.randint(0, len(DN), 2) | |||
| #g1 = DN[idx1] | |||
| #g2 = DN[idx2] | |||
| idx1 = 0 | |||
| idx2 = 6 | |||
| g1 = DN[idx1] | |||
| g2 = DN[idx2] | |||
| # compute | |||
| k_list = [] # kernel between each graph and itself. | |||
| k_g1_list = [] # kernel between each graph and g1 | |||
| k_g2_list = [] # kernel between each graph and g2 | |||
| for ig, g in tqdm(enumerate(DN), desc='computing self kernels', file=sys.stdout): | |||
| # ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None, | |||
| # p_quit=lmbda, n_iteration=20, remove_totters=False, | |||
| # n_jobs=multiprocessing.cpu_count(), verbose=False) | |||
| ktemp = compute_kernel([g, g1, g2], 'untilhpathkernel', verbose=False) | |||
| k_list.append(ktemp[0, 0]) | |||
| k_g1_list.append(ktemp[0, 1]) | |||
| k_g2_list.append(ktemp[0, 2]) | |||
| g_best = [] | |||
| dis_best = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('alpha =', alpha) | |||
| # compute k nearest neighbors of phi in DN. | |||
| dis_list = [] # distance between g_star and each graph. | |||
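# Squared kernel distance to g* = alpha * g1 + (1 - alpha) * g2, expanded as
# d^2(g, g*) = k(g, g) - 2 * [alpha * k(g, g1) + (1 - alpha) * k(g, g2)]
#            + [alpha^2 * k(g1, g1) + 2 * alpha * (1 - alpha) * k(g1, g2)
#               + (1 - alpha)^2 * k(g2, g2)].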
| for ig, g in tqdm(enumerate(DN), desc='computing distances', file=sys.stdout): | |||
| dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * | |||
| k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha * | |||
| (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * | |||
| k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) | |||
| dis_list.append(np.sqrt(dtemp)) | |||
| # sort | |||
| sort_idx = np.argsort(dis_list) | |||
| dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] | |||
| g0hat = DN[sort_idx[0]] # the nearest neighbor of phi in DN | |||
| if dis_gs[0] == 0: # the exact pre-image. | |||
| print('The exact pre-image is found from the input dataset.') | |||
| g_pimg = g0hat | |||
| break | |||
| dhat = dis_gs[0] # the nearest distance | |||
| Dk = [DN[ig] for ig in sort_idx[0:k]] # the k nearest neighbors | |||
| gihat_list = [] | |||
| i = 1 | |||
| r = 1 | |||
| while r < r_max: | |||
| print('r =', r) | |||
| found = False | |||
| for ig, gs in enumerate(Dk + gihat_list): | |||
| # nx.draw_networkx(gs) | |||
| # plt.show() | |||
# @todo what if the log is negative?
| fdgs = int(np.abs(np.ceil(np.log(alpha * dis_gs[ig])))) | |||
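# The number of edge toggles grows with the log of the (alpha-scaled)
# distance; abs() guards against a negative log when alpha * dis_gs[ig] < 1.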
for trial in tqdm(range(0, l), desc='l loop', file=sys.stdout):
| # add and delete edges. | |||
| gtemp = gs.copy() | |||
| np.random.seed() | |||
| # which edges to change. | |||
| # @todo: should we use just half of the adjacency matrix for undirected graphs? | |||
| nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1) | |||
| # @todo: what if fdgs is bigger than nb_vpairs? | |||
| idx_change = random.sample(range(nb_vpairs), fdgs if fdgs < nb_vpairs else nb_vpairs) | |||
| # idx_change = np.random.randint(0, nx.number_of_nodes(gs) * | |||
| # (nx.number_of_nodes(gs) - 1), fdgs) | |||
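# Same ordered-pair decoding as in preimage_random above: flat index ->
# (node1, node2) with the self pair skipped.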
| for item in idx_change: | |||
| node1 = int(item / (nx.number_of_nodes(gs) - 1)) | |||
| node2 = (item - node1 * (nx.number_of_nodes(gs) - 1)) | |||
| if node2 >= node1: # skip the self pair. | |||
| node2 += 1 | |||
| # @todo: is the randomness correct? | |||
| if not gtemp.has_edge(node1, node2): | |||
| # @todo: how to update the bond_type? 0 or 1? | |||
| gtemp.add_edges_from([(node1, node2, {'bond_type': 1})]) | |||
| # nx.draw_networkx(gs) | |||
| # plt.show() | |||
| # nx.draw_networkx(gtemp) | |||
| # plt.show() | |||
| else: | |||
| gtemp.remove_edge(node1, node2) | |||
| # nx.draw_networkx(gs) | |||
| # plt.show() | |||
| # nx.draw_networkx(gtemp) | |||
| # plt.show() | |||
| # nx.draw_networkx(gtemp) | |||
| # plt.show() | |||
| # compute distance between phi and the new generated graph. | |||
| # knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None, | |||
| # p_quit=lmbda, n_iteration=20, remove_totters=False, | |||
| # n_jobs=multiprocessing.cpu_count(), verbose=False) | |||
| knew = compute_kernel([gtemp, g1, g2], 'untilhpathkernel', verbose=False) | |||
| dnew = np.sqrt(knew[0, 0] - 2 * (alpha * knew[0, 1] + (1 - alpha) * | |||
| knew[0, 2]) + (alpha * alpha * k_list[idx1] + alpha * | |||
| (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * | |||
| k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])) | |||
| if dnew < dhat: # @todo: the new distance is smaller or also equal? | |||
| print('I am smaller!') | |||
| print(dhat, '->', dnew) | |||
| nx.draw_networkx(gtemp) | |||
| plt.show() | |||
| print(gtemp.nodes(data=True)) | |||
| print(gtemp.edges(data=True)) | |||
| dhat = dnew | |||
| gnew = gtemp.copy() | |||
| found = True # found better graph. | |||
| r = 0 | |||
| elif dnew == dhat: | |||
| print('I am equal!') | |||
| if found: | |||
| gihat_list = [gnew] | |||
| dis_gs.append(dhat) | |||
| else: | |||
| r += 1 | |||
| dis_best.append(dhat) | |||
| g_best += ([g0hat] if len(gihat_list) == 0 else gihat_list) | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', dis_best[idx]) | |||
| print('the corresponding pre-image is') | |||
| nx.draw_networkx(g_best[idx]) | |||
| plt.show() | |||
| @@ -1,122 +0,0 @@ | |||
elif opt_name == 'random-inits':
	try:
		num_random_inits_ = int(opt_val)
		desired_num_random_inits_ = num_random_inits_
	except:
		raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')
	if num_random_inits_ <= 0:
		raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')
elif opt_name == 'randomness':
	if opt_val == 'PSEUDO':
		use_real_randomness_ = False
	elif opt_val == 'REAL':
		use_real_randomness_ = True
	else:
		raise Exception('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"')
elif opt_name == 'stdout':
	if opt_val == '0':
		print_to_stdout_ = 0
	elif opt_val == '1':
		print_to_stdout_ = 1
	elif opt_val == '2':
		print_to_stdout_ = 2
	else:
		raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"')
elif opt_name == 'refine':
	if opt_val == 'TRUE':
		refine_ = True
	elif opt_val == 'FALSE':
		refine_ = False
	else:
		raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"')
elif opt_name == 'time-limit':
	try:
		time_limit_in_sec_ = float(opt_val)
	except:
		raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit <convertible to double>] [...]"')
elif opt_name == 'max-itrs':
	try:
		max_itrs_ = int(opt_val)
	except:
		raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs <convertible to int>] [...]"')
elif opt_name == 'max-itrs-without-update':
	try:
		max_itrs_without_update_ = int(opt_val)
	except:
		raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update <convertible to int>] [...]"')
elif opt_name == 'seed':
	try:
		seed_ = int(opt_val)
	except:
		raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed <convertible to int greater equal 0>] [...]"')
elif opt_name == 'epsilon':
	try:
		epsilon_ = float(opt_val)
	except:
		raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]"')
	if epsilon_ <= 0:
		raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]"')
elif opt_name == 'inits-increase-order':
	try:
		num_inits_increase_order_ = int(opt_val)
	except:
		raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')
	if num_inits_increase_order_ <= 0:
		raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')
elif opt_name == 'init-type-increase-order':
	init_type_increase_order_ = opt_val
	if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++':
		raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"')
elif opt_name == 'max-itrs-increase-order':
	try:
		max_itrs_increase_order_ = int(opt_val)
	except:
		raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order <convertible to int>] [...]"')
else:
	valid_options = '[--init-type <arg>] [--random-inits <arg>] [--randomness <arg>] [--seed <arg>] [--stdout <arg>] '
	valid_options += '[--time-limit <arg>] [--max-itrs <arg>] [--epsilon <arg>] '
	valid_options += '[--inits-increase-order <arg>] [--init-type-increase-order <arg>] [--max-itrs-increase-order <arg>]'
	raise Exception('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"')
| @@ -1,83 +0,0 @@ | |||
| #export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/gedlibpy/lib/fann/:/export/home/lambertn/Documents/gedlibpy/lib/libsvm.3.22:/export/home/lambertn/Documents/gedlibpy/lib/nomad | |||
| #Pour que "import script" trouve les librairies qu'a besoin GedLib | |||
| #Equivalent à définir la variable d'environnement LD_LIBRARY_PATH sur un bash | |||
| import gedlibpy.librariesImport | |||
| from gedlibpy import gedlibpy | |||
| import networkx as nx | |||
def init():
| print("List of Edit Cost Options : ") | |||
| for i in gedlibpy.list_of_edit_cost_options : | |||
| print (i) | |||
| print("") | |||
| print("List of Method Options : ") | |||
| for j in gedlibpy.list_of_method_options : | |||
| print (j) | |||
| print("") | |||
| print("List of Init Options : ") | |||
| for k in gedlibpy.list_of_init_options : | |||
| print (k) | |||
| print("") | |||
| def test(): | |||
| gedlibpy.load_GXL_graphs('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml') | |||
| listID = gedlibpy.get_all_graph_ids() | |||
| gedlibpy.set_edit_cost("CHEM_1") | |||
| gedlibpy.init() | |||
| gedlibpy.set_method("IPFP", "") | |||
| gedlibpy.init_method() | |||
| g = listID[0] | |||
| h = listID[1] | |||
| gedlibpy.run_method(g, h) | |||
| print("Node Map : ", gedlibpy.get_node_map(g,h)) | |||
| print("Forward map : " , gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h)) | |||
| print("Assignment Matrix : ") | |||
| print(gedlibpy.get_assignment_matrix(g, h)) | |||
| print ("Upper Bound = " + str(gedlibpy.get_upper_bound(g,h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h))) | |||
| def convertGraph(G): | |||
| G_new = nx.Graph() | |||
| for nd, attrs in G.nodes(data=True): | |||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||
| for nd1, nd2, attrs in G.edges(data=True): | |||
| G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| return G_new | |||
def testNxGraph():
| from gklearn.utils.graphfiles import loadDataset | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| gedlibpy.restart_env() | |||
| for graph in Gn: | |||
| g_new = convertGraph(graph) | |||
| gedlibpy.add_nx_graph(g_new, "") | |||
| listID = gedlibpy.get_all_graph_ids() | |||
| gedlibpy.set_edit_cost("CHEM_1") | |||
| gedlibpy.init() | |||
| gedlibpy.set_method("IPFP", "") | |||
| gedlibpy.init_method() | |||
| print(listID) | |||
| g = listID[0] | |||
| h = listID[1] | |||
| gedlibpy.run_method(g, h) | |||
| print("Node Map : ", gedlibpy.get_node_map(g, h)) | |||
| print("Forward map : " , gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h)) | |||
| print ("Upper Bound = " + str(gedlibpy.get_upper_bound(g, h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h))) | |||
| #test() | |||
| init() | |||
#testNxGraph()
| @@ -1,648 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Oct 24 11:50:56 2019 | |||
| @author: ljia | |||
| """ | |||
import os
import sys
from matplotlib import pyplot as plt
| import numpy as np | |||
| from tqdm import tqdm | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| from gklearn.preimage.utils import remove_edges | |||
| from gklearn.preimage.fitDistance import fit_GED_to_kernel_distance | |||
| from gklearn.preimage.utils import normalize_distance_matrix | |||
| def test_update_costs(): | |||
| from preimage.fitDistance import update_costs | |||
| import cvxpy as cp | |||
| ds = np.load('results/xp_fit_method/fit_data_debug4.gm.npz') | |||
| nb_cost_mat = ds['nb_cost_mat'] | |||
| dis_k_vec = ds['dis_k_vec'] | |||
| n_edit_operations = ds['n_edit_operations'] | |||
| ged_vec_init = ds['ged_vec_init'] | |||
| ged_mat = ds['ged_mat'] | |||
| nb_cost_mat_new = nb_cost_mat[:,[2,3,4]] | |||
| x = cp.Variable(nb_cost_mat_new.shape[1]) | |||
| cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||
| # constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])], | |||
| # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||
| # constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])], | |||
| # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0, | |||
| # np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] | |||
| constraints = [x >= [0.00 for i in range(nb_cost_mat_new.shape[1])], | |||
| np.array([0.0, 1.0, -1.0]).T@x == 0.0] | |||
| # constraints = [x >= [0.00000 for i in range(nb_cost_mat_new.shape[1])]] | |||
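# Active constraints: all fitted costs are non-negative, and
# [0, 1, -1] @ x == 0 ties the last two selected costs to be equal
# (presumably c_ei == c_er, assuming the usual column order
# c_vi, c_vr, c_vs, c_ei, c_er, c_es in nb_cost_mat).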
| prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||
| prob.solve() | |||
| print(x.value) | |||
| edit_costs_new = np.concatenate((x.value, np.array([0.0]))) | |||
| residual = np.sqrt(prob.value) | |||
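# residual is the fitting error ||nb_cost_mat_new @ x - dis_k_vec||_2 at the
# optimum; edit_costs_new appends a zero for the remaining cost.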
| def median_paper_clcpc_python_best(): | |||
| """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with | |||
| python invoking the c++ code by bash command (with updated library). | |||
| """ | |||
| # ds = {'name': 'monoterpenoides', | |||
| # 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| # _, y_all = loadDataset(ds['dataset']) | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| itr_max = 6 | |||
| algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', | |||
| 'algo_options': algo_options, 'stabilizer': None} | |||
| y_all = ['3', '1', '4', '6', '7', '8', '9', '2'] | |||
| repeats = 50 | |||
| collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/' | |||
| graph_dir = collection_path + 'gxl/' | |||
| fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt' | |||
| for y in y_all: | |||
| for repeat in range(repeats): | |||
| edit_costs_output_file = open(fn_edit_costs_output, 'a') | |||
| collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml' | |||
| Gn, _ = loadDataset(collection_file, extra_params=graph_dir) | |||
| edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ | |||
| nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label, | |||
| gkernel, itr_max, params_ged=params_ged, | |||
| parallel=True) | |||
| total_time = np.sum(time_list) | |||
| # print('\nedit_costs:', edit_costs) | |||
| # print('\nresidual_list:', residual_list) | |||
| # print('\nedit_cost_list:', edit_cost_list) | |||
| # print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| # print('\nged matrix:', ged_mat) | |||
| # print('\ntotal time:', total_time) | |||
| # print('\nnb_cost_mat:', nb_cost_mat_list[-1]) | |||
| np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y' | |||
| + y + '.repeat' + str(repeat) + '.k10..gm', | |||
| edit_costs=edit_costs, | |||
| residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
| total_time=total_time, nb_cost_mat_list=nb_cost_mat_list) | |||
| for ec in edit_costs: | |||
| edit_costs_output_file.write(str(ec) + ' ') | |||
| edit_costs_output_file.write('\n') | |||
| edit_costs_output_file.close() | |||
| # # normalized distance matrices. | |||
| # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz') | |||
| # edit_costs = gmfile['edit_costs'] | |||
| # residual_list = gmfile['residual_list'] | |||
| # edit_cost_list = gmfile['edit_cost_list'] | |||
| # dis_k_mat = gmfile['dis_k_mat'] | |||
| # ged_mat = gmfile['ged_mat'] | |||
| # total_time = gmfile['total_time'] | |||
| # nb_cost_mat_list = gmfile['nb_cost_mat_list'] | |||
| nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) | |||
| print(nb_consistent, nb_inconsistent, ratio_consistent) | |||
| # norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) | |||
| # plt.imshow(norm_dis_k_mat) | |||
| # plt.colorbar() | |||
| # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.png', format='png') | |||
| # # plt.show() | |||
| # plt.clf() | |||
| # | |||
| # norm_ged_mat = normalize_distance_matrix(ged_mat) | |||
| # plt.imshow(norm_ged_mat) | |||
| # plt.colorbar() | |||
| # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.png', format='png') | |||
| # # plt.show() | |||
| # plt.clf() | |||
| # | |||
| # norm_diff = norm_ged_mat - norm_dis_k_mat | |||
| # plt.imshow(norm_diff) | |||
| # plt.colorbar() | |||
| # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.png', format='png') | |||
| # # plt.show() | |||
| # plt.clf() | |||
| # # draw_count_bar(norm_diff) | |||
| def median_paper_clcpc_python_bash_cpp(): | |||
| """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with | |||
| python invoking the c++ code by bash command (with updated library). | |||
| """ | |||
| # ds = {'name': 'monoterpenoides', | |||
| # 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| # _, y_all = loadDataset(ds['dataset']) | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| itr_max = 20 | |||
| algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5' | |||
| params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP', | |||
| 'algo_options': algo_options} | |||
| y_all = ['3', '1', '4', '6', '7', '8', '9', '2'] | |||
| repeats = 50 | |||
| collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/' | |||
| graph_dir = collection_path + 'gxl/' | |||
| fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt' | |||
| for y in y_all: | |||
| for repeat in range(repeats): | |||
| edit_costs_output_file = open(fn_edit_costs_output, 'a') | |||
| collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml' | |||
| Gn, _ = loadDataset(collection_file, extra_params=graph_dir) | |||
| edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ | |||
| nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, | |||
| gkernel, itr_max, params_ged=params_ged, | |||
| parallel=False) | |||
| total_time = np.sum(time_list) | |||
| # print('\nedit_costs:', edit_costs) | |||
| # print('\nresidual_list:', residual_list) | |||
| # print('\nedit_cost_list:', edit_cost_list) | |||
| # print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| # print('\nged matrix:', ged_mat) | |||
| # print('\ntotal time:', total_time) | |||
| # print('\nnb_cost_mat:', nb_cost_mat_list[-1]) | |||
| np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| + y + '.repeat' + str(repeat) + '.gm', | |||
| edit_costs=edit_costs, | |||
| residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
| total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, | |||
| coef_dk=coef_dk) | |||
| for ec in edit_costs: | |||
| edit_costs_output_file.write(str(ec) + ' ') | |||
| edit_costs_output_file.write('\n') | |||
| edit_costs_output_file.close() | |||
| # # normalized distance matrices. | |||
| # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz') | |||
| # edit_costs = gmfile['edit_costs'] | |||
| # residual_list = gmfile['residual_list'] | |||
| # edit_cost_list = gmfile['edit_cost_list'] | |||
| # dis_k_mat = gmfile['dis_k_mat'] | |||
| # ged_mat = gmfile['ged_mat'] | |||
| # total_time = gmfile['total_time'] | |||
| # nb_cost_mat_list = gmfile['nb_cost_mat_list'] | |||
| # coef_dk = gmfile['coef_dk'] | |||
| nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) | |||
| print(nb_consistent, nb_inconsistent, ratio_consistent) | |||
| # norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) | |||
| # plt.imshow(norm_dis_k_mat) | |||
| # plt.colorbar() | |||
| # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.png', format='png') | |||
| # # plt.show() | |||
| # plt.clf() | |||
| # | |||
| # norm_ged_mat = normalize_distance_matrix(ged_mat) | |||
| # plt.imshow(norm_ged_mat) | |||
| # plt.colorbar() | |||
| # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.png', format='png') | |||
| # # plt.show() | |||
| # plt.clf() | |||
| # | |||
| # norm_diff = norm_ged_mat - norm_dis_k_mat | |||
| # plt.imshow(norm_diff) | |||
| # plt.colorbar() | |||
| # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.png', format='png') | |||
| # # plt.show() | |||
| # plt.clf() | |||
| # # draw_count_bar(norm_diff) | |||
| def test_cs_leq_ci_plus_cr_python_bash_cpp(): | |||
| """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with | |||
| python invoking the c++ code by bash command (with updated library). | |||
| """ | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:10] | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| itr_max = 10 | |||
| algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5' | |||
| params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP', | |||
| 'algo_options': algo_options} | |||
| edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ | |||
| nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, | |||
| gkernel, itr_max, params_ged=params_ged, | |||
| parallel=False) | |||
| total_time = np.sum(time_list) | |||
| print('\nedit_costs:', edit_costs) | |||
| print('\nresidual_list:', residual_list) | |||
| print('\nedit_cost_list:', edit_cost_list) | |||
| print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| print('\nged matrix:', ged_mat) | |||
| print('\ntotal time:', total_time) | |||
| print('\nnb_cost_mat:', nb_cost_mat_list[-1]) | |||
| np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm', | |||
| edit_costs=edit_costs, | |||
| residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
| total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, | |||
| coef_dk=coef_dk) | |||
| # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| # 'extra_params': {}} # node/edge symb | |||
| # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| ## Gn = Gn[0:10] | |||
| ## remove_edges(Gn) | |||
| # gkernel = 'untilhpathkernel' | |||
| # node_label = 'atom' | |||
| # edge_label = 'bond_type' | |||
| # itr_max = 10 | |||
| # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ | |||
| # nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, | |||
| # gkernel, itr_max) | |||
| # total_time = np.sum(time_list) | |||
| # print('\nedit_costs:', edit_costs) | |||
| # print('\nresidual_list:', residual_list) | |||
| # print('\nedit_cost_list:', edit_cost_list) | |||
| # print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| # print('\nged matrix:', ged_mat) | |||
| # print('\ntotal time:', total_time) | |||
| # print('\nnb_cost_mat:', nb_cost_mat_list[-1]) | |||
| # np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm', | |||
| # edit_costs=edit_costs, | |||
| # residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk=coef_dk)
| # # normalized distance matrices. | |||
| # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz') | |||
| # edit_costs = gmfile['edit_costs'] | |||
| # residual_list = gmfile['residual_list'] | |||
| # edit_cost_list = gmfile['edit_cost_list'] | |||
| # dis_k_mat = gmfile['dis_k_mat'] | |||
| # ged_mat = gmfile['ged_mat'] | |||
| # total_time = gmfile['total_time'] | |||
| # nb_cost_mat_list = gmfile['nb_cost_mat_list'] | |||
| # coef_dk = gmfile['coef_dk'] | |||
| nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) | |||
| print(nb_consistent, nb_inconsistent, ratio_consistent) | |||
| # dis_k_sub = pairwise_substitution(dis_k_mat) | |||
| # ged_sub = pairwise_substitution(ged_mat) | |||
| # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm', | |||
| # dis_k_sub=dis_k_sub, ged_sub=ged_sub) | |||
| norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) | |||
| plt.imshow(norm_dis_k_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' | |||
| + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' | |||
| + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_ged_mat = normalize_distance_matrix(ged_mat) | |||
| plt.imshow(norm_ged_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' | |||
| + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' | |||
| + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_diff = norm_ged_mat - norm_dis_k_mat | |||
| plt.imshow(norm_diff) | |||
| plt.colorbar() | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' | |||
| + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' | |||
| + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| # draw_count_bar(norm_diff) | |||
| def test_anycosts(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:10] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| itr_max = 10 | |||
| edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ | |||
| nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max) | |||
| total_time = np.sum(time_list) | |||
| print('\nedit_costs:', edit_costs) | |||
| print('\nresidual_list:', residual_list) | |||
| print('\nedit_cost_list:', edit_cost_list) | |||
| print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| print('\nged matrix:', ged_mat) | |||
| print('\ntotal time:', total_time) | |||
| print('\nnb_cost_mat:', nb_cost_mat_list[-1]) | |||
| np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs, | |||
| residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
| total_time=total_time, nb_cost_mat_list=nb_cost_mat_list) | |||
| # # normalized distance matrices. | |||
| # gmfile = np.load('results/fit_distance.any_costs.gm.npz') | |||
| # edit_costs = gmfile['edit_costs'] | |||
| # residual_list = gmfile['residual_list'] | |||
| # edit_cost_list = gmfile['edit_cost_list'] | |||
| # dis_k_mat = gmfile['dis_k_mat'] | |||
| # ged_mat = gmfile['ged_mat'] | |||
| # total_time = gmfile['total_time'] | |||
| ## nb_cost_mat_list = gmfile['nb_cost_mat_list'] | |||
| norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) | |||
| plt.imshow(norm_dis_k_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_ged_mat = normalize_distance_matrix(ged_mat) | |||
| plt.imshow(norm_ged_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_diff = norm_ged_mat - norm_dis_k_mat | |||
| plt.imshow(norm_diff) | |||
| plt.colorbar() | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| # draw_count_bar(norm_diff) | |||
| def test_cs_leq_ci_plus_cr(): | |||
| """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er | |||
| """ | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:10] | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| itr_max = 10 | |||
| edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ | |||
| nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, | |||
| gkernel, itr_max, | |||
| fitkernel='gaussian') | |||
| total_time = np.sum(time_list) | |||
| print('\nedit_costs:', edit_costs) | |||
| print('\nresidual_list:', residual_list) | |||
| print('\nedit_cost_list:', edit_cost_list) | |||
| print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| print('\nged matrix:', ged_mat) | |||
| print('\ntotal time:', total_time) | |||
| print('\nnb_cost_mat:', nb_cost_mat_list[-1]) | |||
| np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm', | |||
| edit_costs=edit_costs, | |||
| residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
| total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, | |||
| coef_dk=coef_dk) | |||
| # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| # 'extra_params': {}} # node/edge symb | |||
| # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| ## Gn = Gn[0:10] | |||
| ## remove_edges(Gn) | |||
| # gkernel = 'untilhpathkernel' | |||
| # node_label = 'atom' | |||
| # edge_label = 'bond_type' | |||
| # itr_max = 10 | |||
| # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ | |||
| # nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, | |||
| # gkernel, itr_max) | |||
| # total_time = np.sum(time_list) | |||
| # print('\nedit_costs:', edit_costs) | |||
| # print('\nresidual_list:', residual_list) | |||
| # print('\nedit_cost_list:', edit_cost_list) | |||
| # print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| # print('\nged matrix:', ged_mat) | |||
| # print('\ntotal time:', total_time) | |||
| # print('\nnb_cost_mat:', nb_cost_mat_list[-1]) | |||
| # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm', | |||
| # edit_costs=edit_costs, | |||
| # residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk=coef_dk)
| # # normalized distance matrices. | |||
| # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz') | |||
| # edit_costs = gmfile['edit_costs'] | |||
| # residual_list = gmfile['residual_list'] | |||
| # edit_cost_list = gmfile['edit_cost_list'] | |||
| # dis_k_mat = gmfile['dis_k_mat'] | |||
| # ged_mat = gmfile['ged_mat'] | |||
| # total_time = gmfile['total_time'] | |||
| # nb_cost_mat_list = gmfile['nb_cost_mat_list'] | |||
| # coef_dk = gmfile['coef_dk'] | |||
| nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) | |||
| print(nb_consistent, nb_inconsistent, ratio_consistent) | |||
| # dis_k_sub = pairwise_substitution(dis_k_mat) | |||
| # ged_sub = pairwise_substitution(ged_mat) | |||
| # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm', | |||
| # dis_k_sub=dis_k_sub, ged_sub=ged_sub) | |||
| norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) | |||
| plt.imshow(norm_dis_k_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_ged_mat = normalize_distance_matrix(ged_mat) | |||
| plt.imshow(norm_ged_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_diff = norm_ged_mat - norm_dis_k_mat | |||
| plt.imshow(norm_diff) | |||
| plt.colorbar() | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| # draw_count_bar(norm_diff) | |||
| def test_unfitted(): | |||
| """unfitted. | |||
| """ | |||
| from fitDistance import compute_geds | |||
| from utils import kernel_distance_matrix | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:10] | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| # 'extra_params': {}} # node/edge symb | |||
| # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| ## Gn = Gn[0:10] | |||
| ## remove_edges(Gn) | |||
| # gkernel = 'marginalizedkernel' | |||
| dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel) | |||
| ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1], | |||
| [0, 1, 2, 3, 4, 5], parallel=True) | |||
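# [3, 3, 1, 3, 3, 1] are the unfitted baseline edit costs (presumably in the
# order c_vi, c_vr, c_vs, c_ei, c_er, c_es); no fitting is performed here.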
| print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| print('\nged matrix:', ged_mat) | |||
| # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs, | |||
| # residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
| # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list) | |||
| # normalized distance matrices. | |||
| # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz') | |||
| # edit_costs = gmfile['edit_costs'] | |||
| # residual_list = gmfile['residual_list'] | |||
| # edit_cost_list = gmfile['edit_cost_list'] | |||
| # dis_k_mat = gmfile['dis_k_mat'] | |||
| # ged_mat = gmfile['ged_mat'] | |||
| # total_time = gmfile['total_time'] | |||
| # nb_cost_mat_list = gmfile['nb_cost_mat_list'] | |||
| nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) | |||
| print(nb_consistent, nb_inconsistent, ratio_consistent) | |||
| norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) | |||
| plt.imshow(norm_dis_k_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_ged_mat = normalize_distance_matrix(ged_mat) | |||
| plt.imshow(norm_ged_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_diff = norm_ged_mat - norm_dis_k_mat | |||
| plt.imshow(norm_diff) | |||
| plt.colorbar() | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| draw_count_bar(norm_diff) | |||
def pairwise_substitution_consistence(mat1, mat2):
"""For every pair of upper-triangular entries, check whether the sign of
their difference is the same in both distance matrices. Returns
(nb_consistent, nb_inconsistent, ratio_consistent).
"""
| nb_consistent = 0 | |||
| nb_inconsistent = 0 | |||
# both matrices are assumed symmetric; take the upper triangle of both so
# that their entries are paired up in the same order.
upper_tri1 = mat1[np.triu_indices_from(mat1)]
upper_tri2 = mat2[np.triu_indices_from(mat2)]
| for i in tqdm(range(len(upper_tri1)), desc='computing consistence', file=sys.stdout): | |||
| for j in range(i, len(upper_tri1)): | |||
| if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]): | |||
| nb_consistent += 1 | |||
| else: | |||
| nb_inconsistent += 1 | |||
| return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent) | |||
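# A minimal usage sketch (the toy matrices and the _demo_* name are made up
# for illustration; numpy/tqdm/sys are assumed imported at the top of this file):
def _demo_pairwise_substitution_consistence():
    d1 = np.array([[0.0, 1.0, 2.0],
                   [1.0, 0.0, 3.0],
                   [2.0, 3.0, 0.0]])
    # d2 is a monotone rescaling of d1, so every pairwise order is preserved.
    d2 = d1 * 10.0
    nb_c, nb_i, ratio = pairwise_substitution_consistence(d1, d2)
    print(nb_c, nb_i, ratio)  # 21 0 1.0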
| def pairwise_substitution(mat): | |||
| # the matrix is considered symmetric. | |||
| upper_tri = mat[np.triu_indices_from(mat)] | |||
| sub_list = [] | |||
| for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout): | |||
| for j in range(i, len(upper_tri)): | |||
| sub_list.append(upper_tri[i] - upper_tri[j]) | |||
| return sub_list | |||
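# For illustration: a 2x2 symmetric matrix has upper triangle [a, b, c]
# (entries (0,0), (0,1), (1,1)), and the returned list is then
# [a-a, a-b, a-c, b-b, b-c, c-c], i.e. all differences with j >= i.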
| def draw_count_bar(norm_diff): | |||
| import pandas | |||
| from collections import Counter, OrderedDict | |||
| norm_diff_cnt = norm_diff.flatten() | |||
| norm_diff_cnt = norm_diff_cnt * 10 | |||
| norm_diff_cnt = np.floor(norm_diff_cnt) | |||
| norm_diff_cnt = Counter(norm_diff_cnt) | |||
| norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items())) | |||
| df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index') | |||
| df.plot(kind='bar') | |||
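# Note on the binning above (example values made up): a normalized difference
# of 0.37 is scaled to 3.7 and floored into bin 3.0, i.e. the [0.3, 0.4) bin;
# -0.05 becomes -0.5 and lands in bin -1.0, i.e. [-0.1, 0.0).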
| if __name__ == '__main__': | |||
| # test_anycosts() | |||
| # test_cs_leq_ci_plus_cr() | |||
| # test_unfitted() | |||
| # test_cs_leq_ci_plus_cr_python_bash_cpp() | |||
| # median_paper_clcpc_python_bash_cpp() | |||
| # median_paper_clcpc_python_best() | |||
| # x = np.array([[1,2,3],[4,5,6],[7,8,9]]) | |||
| # xx = pairwise_substitution(x) | |||
| test_update_costs() | |||
| @@ -1,520 +0,0 @@ | |||
| #export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/gedlibpy/lib/fann/:/export/home/lambertn/Documents/gedlibpy/lib/libsvm.3.22:/export/home/lambertn/Documents/gedlibpy/lib/nomad | |||
| #Pour que "import script" trouve les librairies qu'a besoin GedLib | |||
| #Equivalent à définir la variable d'environnement LD_LIBRARY_PATH sur un bash | |||
| #import gedlibpy_linlin.librariesImport | |||
| #from gedlibpy_linlin import gedlibpy | |||
| from libs import * | |||
| import networkx as nx | |||
| import numpy as np | |||
| from tqdm import tqdm | |||
import sys
import os  # os.path is used below and is not guaranteed to come from 'from libs import *'
def test_NON_SYMBOLIC_cost():
"""Test edit cost NON_SYMBOLIC.
"""
from gklearn.preimage.ged import GED, get_nb_edit_operations_nonsymbolic
| from gklearn.preimage.test_k_closest_graphs import reform_attributes | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| dataset = '../../datasets/Letter-high/Letter-high_A.txt' | |||
| Gn, y_all = loadDataset(dataset) | |||
| g1 = Gn[200] | |||
| g2 = Gn[1780] | |||
| reform_attributes(g1) | |||
| reform_attributes(g2) | |||
| c_vi = 0.675 | |||
| c_vr = 0.675 | |||
| c_vs = 0.75 | |||
| c_ei = 0.425 | |||
| c_er = 0.425 | |||
| c_es = 0 | |||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy', | |||
| cost='NON_SYMBOLIC', method='IPFP', edit_cost_constant=edit_cost_constant, | |||
| algo_options='', stabilizer=None) | |||
| n_vi, n_vr, sod_vs, n_ei, n_er, sod_es = get_nb_edit_operations_nonsymbolic(g1, g2, | |||
| pi_forward, pi_backward) | |||
| print('# of operations:', n_vi, n_vr, sod_vs, n_ei, n_er, sod_es) | |||
print('c_vi, c_vr, c_vs, c_ei, c_er, c_es:', c_vi, c_vr, c_vs, c_ei, c_er, c_es)
| cost_computed = c_vi * n_vi + c_vr * n_vr + c_vs * sod_vs \ | |||
| + c_ei * n_ei + c_er * n_er + c_es * sod_es | |||
| print('dis (cost computed by GED):', dis) | |||
| print('cost computed by # of operations and edit cost constants:', cost_computed) | |||
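# The two printed values should agree up to floating-point error. A hedged
# sanity check one could add (commented out to keep the test's behavior unchanged):
# assert np.isclose(dis, cost_computed), 'GED cost != cost recomputed from operation counts'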
| def test_LETTER2_cost(): | |||
| """Test edit cost LETTER2. | |||
| """ | |||
| from gklearn.preimage.ged import GED, get_nb_edit_operations_letter | |||
| from gklearn.preimage.test_k_closest_graphs import reform_attributes | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| ds = {'dataset': 'cpp_ext/data/collections/Letter.xml', | |||
| 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) | |||
| g1 = Gn[200] | |||
| g2 = Gn[1780] | |||
| reform_attributes(g1) | |||
| reform_attributes(g2) | |||
| c_vi = 0.675 | |||
| c_vr = 0.675 | |||
| c_vs = 0.75 | |||
| c_ei = 0.425 | |||
| c_er = 0.425 | |||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er] | |||
| dis, pi_forward, pi_backward = GED(g1, g2, dataset='letter', lib='gedlibpy', | |||
| cost='LETTER2', method='IPFP', edit_cost_constant=edit_cost_constant, | |||
| algo_options='', stabilizer=None) | |||
| n_vi, n_vr, n_vs, sod_vs, n_ei, n_er = get_nb_edit_operations_letter(g1, g2, | |||
| pi_forward, pi_backward) | |||
| print('# of operations:', n_vi, n_vr, n_vs, sod_vs, n_ei, n_er) | |||
| print('c_vi, c_vr, c_vs, c_ei, c_er:', c_vi, c_vr, c_vs, c_ei, c_er) | |||
| cost_computed = c_vi * n_vi + c_vr * n_vr + c_vs * sod_vs \ | |||
| + c_ei * n_ei + c_er * n_er | |||
| print('dis (cost computed by GED):', dis) | |||
| print('cost computed by # of operations and edit cost constants:', cost_computed) | |||
| def test_get_nb_edit_operations_letter(): | |||
| """Test whether function preimage.ged.get_nb_edit_operations_letter returns | |||
| correct numbers of edit operations. The distance/cost computed by GED | |||
| should be the same as the cost computed by number of operations and edit | |||
| cost constants. | |||
| """ | |||
| from gklearn.preimage.ged import GED, get_nb_edit_operations_letter | |||
| from gklearn.preimage.test_k_closest_graphs import reform_attributes | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| ds = {'dataset': 'cpp_ext/data/collections/Letter.xml', | |||
| 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) | |||
| g1 = Gn[200] | |||
| g2 = Gn[1780] | |||
| reform_attributes(g1) | |||
| reform_attributes(g2) | |||
| c_vir = 0.9 | |||
| c_eir = 1.7 | |||
| alpha = 0.75 | |||
| edit_cost_constant = [c_vir, c_eir, alpha] | |||
| dis, pi_forward, pi_backward = GED(g1, g2, dataset='letter', lib='gedlibpy', | |||
| cost='LETTER', method='IPFP', edit_cost_constant=edit_cost_constant, | |||
| algo_options='', stabilizer=None) | |||
| n_vi, n_vr, n_vs, c_vs, n_ei, n_er = get_nb_edit_operations_letter(g1, g2, | |||
| pi_forward, pi_backward) | |||
| print('# of operations and costs:', n_vi, n_vr, n_vs, c_vs, n_ei, n_er) | |||
| print('c_vir, c_eir, alpha:', c_vir, c_eir, alpha) | |||
| cost_computed = alpha * c_vir * (n_vi + n_vr) \ | |||
| + alpha * c_vs \ | |||
| + (1 - alpha) * c_eir * (n_ei + n_er) | |||
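# With the constants above this is 0.75 * 0.9 * (n_vi + n_vr) + 0.75 * c_vs
# + 0.25 * 1.7 * (n_ei + n_er); c_vs is the node-substitution term returned
# by get_nb_edit_operations_letter (analogous to sod_vs in the LETTER2 test above).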
| print('dis (cost computed by GED):', dis) | |||
| print('cost computed by # of operations and edit cost constants:', cost_computed) | |||
| def test_get_nb_edit_operations(): | |||
| """Test whether function preimage.ged.get_nb_edit_operations returns correct | |||
| numbers of edit operations. The distance/cost computed by GED should be the | |||
| same as the cost computed by number of operations and edit cost constants. | |||
| """ | |||
| from gklearn.preimage.ged import GED, get_nb_edit_operations | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| import os | |||
| ds = {'dataset': '../../datasets/monoterpenoides/dataset_10+.ds', | |||
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'} # node/edge symb
| Gn, y_all = loadDataset(ds['dataset']) | |||
| g1 = Gn[20] | |||
| g2 = Gn[108] | |||
| c_vi = 3 | |||
| c_vr = 3 | |||
| c_vs = 1 | |||
| c_ei = 3 | |||
| c_er = 3 | |||
| c_es = 1 | |||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| dis, pi_forward, pi_backward = GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', | |||
| cost='CONSTANT', method='IPFP', edit_cost_constant=edit_cost_constant, | |||
| algo_options='', stabilizer=None) | |||
| n_vi, n_vr, n_vs, n_ei, n_er, n_es = get_nb_edit_operations(g1, g2, | |||
| pi_forward, pi_backward) | |||
| print('# of operations and costs:', n_vi, n_vr, n_vs, n_ei, n_er, n_es) | |||
| print('edit costs:', c_vi, c_vr, c_vs, c_ei, c_er, c_es) | |||
| cost_computed = n_vi * c_vi + n_vr * c_vr + n_vs * c_vs \ | |||
| + n_ei * c_ei + n_er * c_er + n_es * c_es | |||
| print('dis (cost computed by GED):', dis) | |||
| print('cost computed by # of operations and edit cost constants:', cost_computed) | |||
def test_ged_python_bash_cpp():
"""Test GED computation with Python invoking the C++ code via a bash command (with the updated library).
"""
| from gklearn.utils.graphfiles import loadDataset | |||
| from gklearn.preimage.ged import GED | |||
| data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/' | |||
| # collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml' | |||
| collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/monoterpenoides_3_20.xml' | |||
| graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/' | |||
| Gn, y = loadDataset(collection_file, extra_params=graph_dir) | |||
| algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
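# (Hedged reading of the option names: use 8 threads and 40 initial solutions,
# with ratio 1 meaning all initial solutions are run; see the GEDLIB IPFP
# documentation for the authoritative meanings.)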
| for repeat in range(0, 3): | |||
| # Generate the result file. | |||
| ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_bash_' + str(repeat) + '_init40.3_20.txt' | |||
| # runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_min_' + str(repeat) + '.txt' | |||
| ged_file = open(ged_filename, 'a') | |||
| # runtime_file = open(runtime_filename, 'a') | |||
| ged_mat = np.empty((len(Gn), len(Gn))) | |||
| # runtime_mat = np.empty((len(Gn), len(Gn))) | |||
| for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): | |||
| for j in range(len(Gn)): | |||
| print(i, j) | |||
| g1 = Gn[i] | |||
| g2 = Gn[j] | |||
| upper_bound, _, _ = GED(g1, g2, lib='gedlib-bash', cost='CONSTANT', | |||
| method='IPFP', | |||
| edit_cost_constant=[3.0, 3.0, 1.0, 3.0, 3.0, 1.0], | |||
| algo_options=algo_options) | |||
| # runtime = gedlibpy.get_runtime(g1, g2) | |||
| ged_mat[i][j] = upper_bound | |||
| # runtime_mat[i][j] = runtime | |||
| # Write to files. | |||
| ged_file.write(str(int(upper_bound)) + ' ') | |||
| # runtime_file.write(str(runtime) + ' ') | |||
| ged_file.write('\n') | |||
| # runtime_file.write('\n') | |||
| ged_file.close() | |||
| # runtime_file.close() | |||
| print('ged_mat') | |||
| print(ged_mat) | |||
| # print('runtime_mat:') | |||
| # print(runtime_mat) | |||
| return | |||
def test_ged_best_settings_updated():
"""Test GED computation with the best settings, matching those in the C++ code (with the updated library).
"""
| data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/' | |||
| collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml' | |||
| # collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/monoterpenoides_3_20.xml' | |||
| graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/' | |||
| algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| for repeat in range(0, 3): | |||
| # Generate the result file. | |||
| ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_updated_' + str(repeat) + '_init40.txt' | |||
| runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_updated_' + str(repeat) + '_init40.txt' | |||
| gedlibpy.restart_env() | |||
| gedlibpy.load_GXL_graphs(graph_dir, collection_file) | |||
| listID = gedlibpy.get_all_graph_ids() | |||
| gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0]) | |||
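# The six CONSTANT costs follow the [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
# ordering used elsewhere in this file: node insertion, removal and
# substitution, then edge insertion, removal and substitution.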
| gedlibpy.init() | |||
| gedlibpy.set_method("IPFP", algo_options) | |||
| gedlibpy.init_method() | |||
| ged_mat = np.empty((len(listID), len(listID))) | |||
| runtime_mat = np.empty((len(listID), len(listID))) | |||
| for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout): | |||
| ged_file = open(ged_filename, 'a') | |||
| runtime_file = open(runtime_filename, 'a') | |||
| for j in range(len(listID)): | |||
| g1 = listID[i] | |||
| g2 = listID[j] | |||
| gedlibpy.run_method(g1, g2) | |||
| upper_bound = gedlibpy.get_upper_bound(g1, g2) | |||
| runtime = gedlibpy.get_runtime(g1, g2) | |||
| ged_mat[i][j] = upper_bound | |||
| runtime_mat[i][j] = runtime | |||
| # Write to files. | |||
| ged_file.write(str(int(upper_bound)) + ' ') | |||
| runtime_file.write(str(runtime) + ' ') | |||
| ged_file.write('\n') | |||
| runtime_file.write('\n') | |||
| ged_file.close() | |||
| runtime_file.close() | |||
| print('ged_mat') | |||
| print(ged_mat) | |||
| print('runtime_mat:') | |||
| print(runtime_mat) | |||
| return | |||
def test_ged_best_settings():
"""Test GED computation with the best settings, matching those in the C++ code.
"""
| data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/' | |||
| collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml' | |||
| graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/' | |||
| algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5' | |||
| for repeat in range(0, 3): | |||
| # Generate the result file. | |||
| ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_best_settings_' + str(repeat) + '.txt' | |||
| runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_best_settings_' + str(repeat) + '.txt' | |||
| ged_file = open(ged_filename, 'a') | |||
| runtime_file = open(runtime_filename, 'a') | |||
| gedlibpy.restart_env() | |||
| gedlibpy.load_GXL_graphs(graph_dir, collection_file) | |||
| listID = gedlibpy.get_all_graph_ids() | |||
| gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0]) | |||
| gedlibpy.init() | |||
| gedlibpy.set_method("IPFP", algo_options) | |||
| gedlibpy.init_method() | |||
| ged_mat = np.empty((len(listID), len(listID))) | |||
| runtime_mat = np.empty((len(listID), len(listID))) | |||
| for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout): | |||
| for j in range(len(listID)): | |||
| g1 = listID[i] | |||
| g2 = listID[j] | |||
| gedlibpy.run_method(g1, g2) | |||
| upper_bound = gedlibpy.get_upper_bound(g1, g2) | |||
| runtime = gedlibpy.get_runtime(g1, g2) | |||
| ged_mat[i][j] = upper_bound | |||
| runtime_mat[i][j] = runtime | |||
| # Write to files. | |||
| ged_file.write(str(int(upper_bound)) + ' ') | |||
| runtime_file.write(str(runtime) + ' ') | |||
| ged_file.write('\n') | |||
| runtime_file.write('\n') | |||
| ged_file.close() | |||
| runtime_file.close() | |||
| print('ged_mat') | |||
| print(ged_mat) | |||
| print('runtime_mat:') | |||
| print(runtime_mat) | |||
| return | |||
def test_ged_default():
"""Test GED computation with default settings.
"""
| data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/' | |||
| collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml' | |||
| graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/' | |||
| for repeat in range(3): | |||
| # Generate the result file. | |||
| ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_default_' + str(repeat) + '.txt' | |||
| runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_default_' + str(repeat) + '.txt' | |||
| ged_file = open(ged_filename, 'a') | |||
| runtime_file = open(runtime_filename, 'a') | |||
| gedlibpy.restart_env() | |||
| gedlibpy.load_GXL_graphs(graph_dir, collection_file) | |||
| listID = gedlibpy.get_all_graph_ids() | |||
| gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0]) | |||
| gedlibpy.init() | |||
| gedlibpy.set_method("IPFP", "") | |||
| gedlibpy.init_method() | |||
| ged_mat = np.empty((len(listID), len(listID))) | |||
| runtime_mat = np.empty((len(listID), len(listID))) | |||
| for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout): | |||
| for j in range(len(listID)): | |||
| g1 = listID[i] | |||
| g2 = listID[j] | |||
| gedlibpy.run_method(g1, g2) | |||
| upper_bound = gedlibpy.get_upper_bound(g1, g2) | |||
| runtime = gedlibpy.get_runtime(g1, g2) | |||
| ged_mat[i][j] = upper_bound | |||
| runtime_mat[i][j] = runtime | |||
| # Write to files. | |||
| ged_file.write(str(int(upper_bound)) + ' ') | |||
| runtime_file.write(str(runtime) + ' ') | |||
| ged_file.write('\n') | |||
| runtime_file.write('\n') | |||
| ged_file.close() | |||
| runtime_file.close() | |||
| print('ged_mat') | |||
| print(ged_mat) | |||
| print('runtime_mat:') | |||
| print(runtime_mat) | |||
| return | |||
def test_ged_min():
"""Test GED computation with the "min" stabilizer.
"""
| from gklearn.utils.graphfiles import loadDataset | |||
| from gklearn.preimage.ged import GED | |||
| data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/' | |||
| collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml' | |||
| graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/' | |||
| Gn, y = loadDataset(collection_file, extra_params=graph_dir) | |||
| # algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5' | |||
| for repeat in range(0, 3): | |||
| # Generate the result file. | |||
| ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_min_' + str(repeat) + '.txt' | |||
| # runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_min_' + str(repeat) + '.txt' | |||
| ged_file = open(ged_filename, 'a') | |||
| # runtime_file = open(runtime_filename, 'a') | |||
| ged_mat = np.empty((len(Gn), len(Gn))) | |||
| # runtime_mat = np.empty((len(Gn), len(Gn))) | |||
| for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): | |||
| for j in range(len(Gn)): | |||
| g1 = Gn[i] | |||
| g2 = Gn[j] | |||
| upper_bound, _, _ = GED(g1, g2, lib='gedlibpy', cost='CONSTANT', | |||
| method='IPFP', | |||
| edit_cost_constant=[3.0, 3.0, 1.0, 3.0, 3.0, 1.0], | |||
| stabilizer='min', repeat=10) | |||
| # runtime = gedlibpy.get_runtime(g1, g2) | |||
| ged_mat[i][j] = upper_bound | |||
| # runtime_mat[i][j] = runtime | |||
| # Write to files. | |||
| ged_file.write(str(int(upper_bound)) + ' ') | |||
| # runtime_file.write(str(runtime) + ' ') | |||
| ged_file.write('\n') | |||
| # runtime_file.write('\n') | |||
| ged_file.close() | |||
| # runtime_file.close() | |||
| print('ged_mat') | |||
| print(ged_mat) | |||
| # print('runtime_mat:') | |||
| # print(runtime_mat) | |||
| return | |||
def init():
print("List of Edit Cost Options : ")
for i in gedlibpy.list_of_edit_cost_options:
print(i)
print()
print("List of Method Options : ")
for j in gedlibpy.list_of_method_options:
print(j)
print()
print("List of Init Options : ")
for k in gedlibpy.list_of_init_options:
print(k)
print()
def convertGraph(G):
"""Convert a graph so that gedlibpy's CHEM cost functions can read it:
node label 'atom' becomes 'chem' and edge label 'bond_type' becomes 'valence'.
"""
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
return G_new
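# A minimal usage sketch of convertGraph (toy graph and _demo_* name made up
# for illustration):
def _demo_convertGraph():
    g = nx.Graph()
    g.add_node(0, atom='C')
    g.add_node(1, atom='N')
    g.add_edge(0, 1, bond_type='1')
    g_new = convertGraph(g)
    print(g_new.nodes(data=True))  # roughly: [('0', {'chem': 'C'}), ('1', {'chem': 'N'})]
    print(g_new.edges(data=True))  # roughly: [('0', '1', {'valence': '1'})]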
| def testNxGrapĥ(): | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| gedlibpy.restart_env() | |||
| for graph in Gn: | |||
| g_new = convertGraph(graph) | |||
| gedlibpy.add_nx_graph(g_new, "") | |||
| listID = gedlibpy.get_all_graph_ids() | |||
| gedlibpy.set_edit_cost("CHEM_1") | |||
| gedlibpy.init() | |||
| gedlibpy.set_method("IPFP", "") | |||
| gedlibpy.init_method() | |||
| print(listID) | |||
| g = listID[0] | |||
| h = listID[1] | |||
| gedlibpy.run_method(g, h) | |||
| print("Node Map : ", gedlibpy.get_node_map(g, h)) | |||
| print("Forward map : " , gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h)) | |||
| print ("Upper Bound = " + str(gedlibpy.get_upper_bound(g, h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h))) | |||
| if __name__ == '__main__': | |||
| # test_ged_default() | |||
| # test_ged_min() | |||
| # test_ged_best_settings() | |||
| # test_ged_best_settings_updated() | |||
| # test_ged_python_bash_cpp() | |||
| # test_get_nb_edit_operations() | |||
| # test_get_nb_edit_operations_letter() | |||
| # test_LETTER2_cost() | |||
| test_NON_SYMBOLIC_cost() | |||
| #init() | |||
| #testNxGrapĥ() | |||
| @@ -1,964 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Sep 5 15:59:00 2019 | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| import time | |||
import random
import os  # os.path is used below to locate the generated datasets
| #from tqdm import tqdm | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| #from gklearn.utils.logger2file import * | |||
| from gklearn.preimage.iam import iam_upgraded | |||
| from gklearn.preimage.utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar | |||
| #from gklearn.preimage.ged import ged_median | |||
| def test_iam_monoterpenoides_with_init40(): | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| # unfitted edit costs. | |||
| c_vi = 3 | |||
| c_vr = 3 | |||
| c_vs = 1 | |||
| c_ei = 3 | |||
| c_er = 3 | |||
| c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.0001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
| # parameters for IAM function | |||
ged_cost = 'CONSTANT'
| ged_method = 'IPFP' | |||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| ged_stabilizer = None | |||
| # ged_repeat = 50 | |||
| algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'algo_options': algo_options, | |||
| 'stabilizer': ged_stabilizer} | |||
| collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/' | |||
| graph_dir = collection_path + 'gxl/' | |||
| y_all = ['3', '1', '4', '6', '7', '8', '9', '2'] | |||
| repeats = 50 | |||
# collect per-class results; the median sets are loaded from per-class files below.
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| dis_ks_set_median_list = [] | |||
| sod_gs_list = [] | |||
| g_best = [] | |||
| sod_set_median_list = [] | |||
| sod_list_list = [] | |||
| for y in y_all: | |||
| print('\n-------------------------------------------------------') | |||
| print('class of y:', y) | |||
| time_list.append([]) | |||
| dis_ks_min_list.append([]) | |||
| dis_ks_set_median_list.append([]) | |||
| sod_gs_list.append([]) | |||
| g_best.append([]) | |||
| sod_set_median_list.append([]) | |||
| for repeat in range(repeats): | |||
| # load median set. | |||
| collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml' | |||
| Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir) | |||
| Gn_candidate = [g.copy() for g in Gn_median] | |||
| time0 = time.time() | |||
| G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ | |||
| = iam_upgraded(Gn_median, | |||
| Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||
| epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label, | |||
| connected=connected_iam, removeNodes=removeNodes, | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list[-1].append(time_total) | |||
| g_best[-1].append(G_gen_median_list[0]) | |||
| sod_set_median_list[-1].append(sod_set_median) | |||
| print('\nsmallest sod of the set median:', sod_set_median) | |||
| sod_gs_list[-1].append(sod_gen_median) | |||
| print('\nsmallest sod in graph space:', sod_gen_median) | |||
| sod_list_list.append(sod_list) | |||
| # # show the best graph and save it to file. | |||
| # print('one of the possible corresponding pre-images is') | |||
| # nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), | |||
| # with_labels=True) | |||
| ## plt.show() | |||
| # # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||
| ## plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) + | |||
| ## '_repeat' + str(repeat) + '_' + str(time.time()) + | |||
| ## '.png', format="PNG") | |||
| # plt.clf() | |||
| # # print(G_gen_median_list[0].nodes(data=True)) | |||
| # # print(G_gen_median_list[0].edges(data=True)) | |||
| print('\nsods of the set median for this class:', sod_set_median_list[-1]) | |||
| print('\nsods in graph space for this class:', sod_gs_list[-1]) | |||
| # print('\ndistance in kernel space of set median for this class:', | |||
| # dis_ks_set_median_list[-1]) | |||
| # print('\nsmallest distances in kernel space for this class:', | |||
| # dis_ks_min_list[-1]) | |||
| print('\ntimes for this class:', time_list[-1]) | |||
| sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) | |||
| sod_gs_list[-1] = np.mean(sod_gs_list[-1]) | |||
| # dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) | |||
| # dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) | |||
| time_list[-1] = np.mean(time_list[-1]) | |||
| print() | |||
| print('\nmean sods of the set median for each class:', sod_set_median_list) | |||
| print('\nmean sods in graph space for each class:', sod_gs_list) | |||
| # print('\ndistances in kernel space of set median for each class:', | |||
| # dis_ks_set_median_list) | |||
| # print('\nmean smallest distances in kernel space for each class:', | |||
| # dis_ks_min_list) | |||
| print('\nmean times for each class:', time_list) | |||
| print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) | |||
| print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) | |||
| # print('\nmean distances in kernel space of set median of all:', | |||
| # np.mean(dis_ks_set_median_list)) | |||
| # print('\nmean smallest distances in kernel space of all:', | |||
| # np.mean(dis_ks_min_list)) | |||
| print('\nmean times of all:', np.mean(time_list)) | |||
| def test_iam_monoterpenoides(): | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| # parameters for GED function from the IAM paper. | |||
| # fitted edit costs (Gaussian). | |||
| c_vi = 0.03620133402089074 | |||
| c_vr = 0.0417574590207099 | |||
| c_vs = 0.009992282328587499 | |||
| c_ei = 0.08293120042342755 | |||
| c_er = 0.09512220476358019 | |||
| c_es = 0.09222529696841467 | |||
| # # fitted edit costs (linear combinations). | |||
| # c_vi = 0.1749684054238749 | |||
| # c_vr = 0.0734054228711457 | |||
| # c_vs = 0.05017781726016715 | |||
| # c_ei = 0.1869431164806936 | |||
| # c_er = 0.32055856948274 | |||
| # c_es = 0.2569469379247611 | |||
| # # unfitted edit costs. | |||
| # c_vi = 3 | |||
| # c_vr = 3 | |||
| # c_vs = 1 | |||
| # c_ei = 3 | |||
| # c_er = 3 | |||
| # c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
| # parameters for IAM function | |||
ged_cost = 'CONSTANT'
| ged_method = 'IPFP' | |||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| # edit_cost_constant = [] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
# classify graphs according to class labels.
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| dis_ks_set_median_list = [] | |||
| sod_gs_list = [] | |||
| g_best = [] | |||
| sod_set_median_list = [] | |||
| sod_list_list = [] | |||
| idx_dict = get_same_item_indices(y_all) | |||
| for y_class in idx_dict: | |||
| print('\n-------------------------------------------------------') | |||
| print('class of y:', y_class) | |||
| Gn_class = [Gn[i].copy() for i in idx_dict[y_class]] | |||
| time_list.append([]) | |||
| dis_ks_min_list.append([]) | |||
| dis_ks_set_median_list.append([]) | |||
| sod_gs_list.append([]) | |||
| g_best.append([]) | |||
| sod_set_median_list.append([]) | |||
| for repeat in range(50): | |||
| idx_rdm = random.sample(range(len(Gn_class)), 10) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn_class[idx].copy() for idx in idx_rdm] | |||
| Gn_candidate = [g.copy() for g in Gn_median] | |||
| alpha_range = [1 / len(Gn_median)] * len(Gn_median) | |||
| time0 = time.time() | |||
| G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ | |||
| = iam_upgraded(Gn_median, | |||
| Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list[-1].append(time_total) | |||
| g_best[-1].append(G_gen_median_list[0]) | |||
| sod_set_median_list[-1].append(sod_set_median) | |||
| print('\nsmallest sod of the set median:', sod_set_median) | |||
| sod_gs_list[-1].append(sod_gen_median) | |||
| print('\nsmallest sod in graph space:', sod_gen_median) | |||
| sod_list_list.append(sod_list) | |||
| # show the best graph and save it to file. | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), | |||
| with_labels=True) | |||
| # plt.show() | |||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||
| # plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) + | |||
| # '_repeat' + str(repeat) + '_' + str(time.time()) + | |||
| # '.png', format="PNG") | |||
| plt.clf() | |||
| # print(G_gen_median_list[0].nodes(data=True)) | |||
| # print(G_gen_median_list[0].edges(data=True)) | |||
| # compute distance between \psi and the set median graph. | |||
| knew_set_median = compute_kernel(G_set_median_list + Gn_median, | |||
| gkernel, node_label, edge_label, False) | |||
| dhat_new_set_median_list = [] | |||
| for idx, g_tmp in enumerate(G_set_median_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list), | |||
| len(G_set_median_list) + len(Gn_median) + 1), | |||
| alpha_range, knew_set_median, withterm3=False)) | |||
| print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0]) | |||
| dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0]) | |||
| # compute distance between \psi and the new generated graphs. | |||
| knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, | |||
| edge_label, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(G_gen_median_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), | |||
| len(G_gen_median_list) + len(Gn_median) + 1), | |||
| alpha_range, knew, withterm3=False)) | |||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||
| dis_ks_min_list[-1].append(dhat_new_list[0]) | |||
| print('\nsods of the set median for this class:', sod_set_median_list[-1]) | |||
| print('\nsods in graph space for this class:', sod_gs_list[-1]) | |||
| print('\ndistance in kernel space of set median for this class:', | |||
| dis_ks_set_median_list[-1]) | |||
| print('\nsmallest distances in kernel space for this class:', | |||
| dis_ks_min_list[-1]) | |||
| print('\ntimes for this class:', time_list[-1]) | |||
| sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) | |||
| sod_gs_list[-1] = np.mean(sod_gs_list[-1]) | |||
| dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) | |||
| dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) | |||
| time_list[-1] = np.mean(time_list[-1]) | |||
| print() | |||
| print('\nmean sods of the set median for each class:', sod_set_median_list) | |||
| print('\nmean sods in graph space for each class:', sod_gs_list) | |||
| print('\ndistances in kernel space of set median for each class:', | |||
| dis_ks_set_median_list) | |||
| print('\nmean smallest distances in kernel space for each class:', | |||
| dis_ks_min_list) | |||
| print('\nmean times for each class:', time_list) | |||
| print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) | |||
| print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) | |||
| print('\nmean distances in kernel space of set median of all:', | |||
| np.mean(dis_ks_set_median_list)) | |||
| print('\nmean smallest distances in kernel space of all:', | |||
| np.mean(dis_ks_min_list)) | |||
| print('\nmean times of all:', np.mean(time_list)) | |||
| nb_better_sods = 0 | |||
| nb_worse_sods = 0 | |||
| nb_same_sods = 0 | |||
| for sods in sod_list_list: | |||
| if sods[0] > sods[-1]: | |||
| nb_better_sods += 1 | |||
| elif sods[0] < sods[-1]: | |||
| nb_worse_sods += 1 | |||
| else: | |||
| nb_same_sods += 1 | |||
print('\nIn', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
'got better,', str(nb_worse_sods), 'got worse, and',
str(nb_same_sods), 'did not change;', str(nb_better_sods / len(sod_list_list)),
'is the fraction of improved sods.')
| def test_iam_mutag(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| # parameters for GED function from the IAM paper. | |||
| # fitted edit costs. | |||
| c_vi = 0.03523843108436513 | |||
| c_vr = 0.03347339739350128 | |||
| c_vs = 0.06871290673612238 | |||
| c_ei = 0.08591999846720685 | |||
| c_er = 0.07962086440894103 | |||
| c_es = 0.08596855855478233 | |||
| # unfitted edit costs. | |||
| # c_vi = 3 | |||
| # c_vr = 3 | |||
| # c_vs = 1 | |||
| # c_ei = 3 | |||
| # c_er = 3 | |||
| # c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
| # parameters for IAM function | |||
ged_cost = 'CONSTANT'
| ged_method = 'IPFP' | |||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| # edit_cost_constant = [] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
# classify graphs according to class labels.
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| dis_ks_set_median_list = [] | |||
| sod_gs_list = [] | |||
| g_best = [] | |||
| sod_set_median_list = [] | |||
| sod_list_list = [] | |||
| idx_dict = get_same_item_indices(y_all) | |||
| for y_class in idx_dict: | |||
| print('\n-------------------------------------------------------') | |||
| print('class of y:', y_class) | |||
| Gn_class = [Gn[i].copy() for i in idx_dict[y_class]] | |||
| time_list.append([]) | |||
| dis_ks_min_list.append([]) | |||
| dis_ks_set_median_list.append([]) | |||
| sod_gs_list.append([]) | |||
| g_best.append([]) | |||
| sod_set_median_list.append([]) | |||
| for repeat in range(50): | |||
| idx_rdm = random.sample(range(len(Gn_class)), 10) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn_class[idx].copy() for idx in idx_rdm] | |||
| Gn_candidate = [g.copy() for g in Gn_median] | |||
| alpha_range = [1 / len(Gn_median)] * len(Gn_median) | |||
| time0 = time.time() | |||
| G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ | |||
| = iam_upgraded(Gn_median, | |||
| Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list[-1].append(time_total) | |||
| g_best[-1].append(G_gen_median_list[0]) | |||
| sod_set_median_list[-1].append(sod_set_median) | |||
| print('\nsmallest sod of the set median:', sod_set_median) | |||
| sod_gs_list[-1].append(sod_gen_median) | |||
| print('\nsmallest sod in graph space:', sod_gen_median) | |||
| sod_list_list.append(sod_list) | |||
| # show the best graph and save it to file. | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), | |||
| with_labels=True) | |||
| # plt.show() | |||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||
| # plt.savefig('results/iam/paper_compare/mutag_y' + str(y_class) + | |||
| # '_repeat' + str(repeat) + '_' + str(time.time()) + | |||
| # '.png', format="PNG") | |||
| plt.clf() | |||
| # print(G_gen_median_list[0].nodes(data=True)) | |||
| # print(G_gen_median_list[0].edges(data=True)) | |||
| # compute distance between \psi and the set median graph. | |||
| knew_set_median = compute_kernel(G_set_median_list + Gn_median, | |||
| gkernel, node_label, edge_label, False) | |||
| dhat_new_set_median_list = [] | |||
| for idx, g_tmp in enumerate(G_set_median_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list), | |||
| len(G_set_median_list) + len(Gn_median) + 1), | |||
| alpha_range, knew_set_median, withterm3=False)) | |||
| print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0]) | |||
| dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0]) | |||
| # compute distance between \psi and the new generated graphs. | |||
| knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, | |||
| edge_label, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(G_gen_median_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), | |||
| len(G_gen_median_list) + len(Gn_median) + 1), | |||
| alpha_range, knew, withterm3=False)) | |||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||
| dis_ks_min_list[-1].append(dhat_new_list[0]) | |||
| print('\nsods of the set median for this class:', sod_set_median_list[-1]) | |||
| print('\nsods in graph space for this class:', sod_gs_list[-1]) | |||
| print('\ndistance in kernel space of set median for this class:', | |||
| dis_ks_set_median_list[-1]) | |||
| print('\nsmallest distances in kernel space for this class:', | |||
| dis_ks_min_list[-1]) | |||
| print('\ntimes for this class:', time_list[-1]) | |||
| sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) | |||
| sod_gs_list[-1] = np.mean(sod_gs_list[-1]) | |||
| dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) | |||
| dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) | |||
| time_list[-1] = np.mean(time_list[-1]) | |||
| print() | |||
| print('\nmean sods of the set median for each class:', sod_set_median_list) | |||
| print('\nmean sods in graph space for each class:', sod_gs_list) | |||
| print('\ndistances in kernel space of set median for each class:', | |||
| dis_ks_set_median_list) | |||
| print('\nmean smallest distances in kernel space for each class:', | |||
| dis_ks_min_list) | |||
| print('\nmean times for each class:', time_list) | |||
| print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) | |||
| print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) | |||
| print('\nmean distances in kernel space of set median of all:', | |||
| np.mean(dis_ks_set_median_list)) | |||
| print('\nmean smallest distances in kernel space of all:', | |||
| np.mean(dis_ks_min_list)) | |||
| print('\nmean times of all:', np.mean(time_list)) | |||
| nb_better_sods = 0 | |||
| nb_worse_sods = 0 | |||
| nb_same_sods = 0 | |||
| for sods in sod_list_list: | |||
| if sods[0] > sods[-1]: | |||
| nb_better_sods += 1 | |||
| elif sods[0] < sods[-1]: | |||
| nb_worse_sods += 1 | |||
| else: | |||
| nb_same_sods += 1 | |||
print('\nIn', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
'got better,', str(nb_worse_sods), 'got worse, and',
str(nb_same_sods), 'did not change;', str(nb_better_sods / len(sod_list_list)),
'is the fraction of improved sods.')
| ############################################################################### | |||
| # tests on different numbers of median-sets. | |||
| def test_iam_median_nb(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
lmbda = 0.03 # termination probability
| # # parameters for GED function | |||
| # c_vi = 0.037 | |||
| # c_vr = 0.038 | |||
| # c_vs = 0.075 | |||
| # c_ei = 0.001 | |||
| # c_er = 0.001 | |||
| # c_es = 0.0 | |||
| # ite_max_iam = 50 | |||
| # epsilon_iam = 0.001 | |||
| # removeNodes = False | |||
| # connected_iam = False | |||
| # # parameters for IAM function | |||
| # ged_cost = 'CONSTANT' | |||
| # ged_method = 'IPFP' | |||
| # edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| # ged_stabilizer = 'min' | |||
| # ged_repeat = 50 | |||
| # params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| # 'edit_cost_constant': edit_cost_constant, | |||
| # 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
| # parameters for GED function | |||
| c_vi = 4 | |||
| c_vr = 4 | |||
| c_vs = 2 | |||
| c_ei = 1 | |||
| c_er = 1 | |||
| c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
| # parameters for IAM function | |||
| ged_cost = 'CHEM_1' | |||
| ged_method = 'IPFP' | |||
| edit_cost_constant = [] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
# select all the graphs classified into the positive group (label 1).
| idx_dict = get_same_item_indices(y_all) | |||
| Gn = [Gn[i] for i in idx_dict[1]] | |||
# number of graphs; we want to compute the median of these graphs.
| # nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| nb_median_range = [len(Gn)] | |||
| # # compute Gram matrix. | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # # write Gram matrix to file. | |||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| # sod_gs_min_list = [] | |||
| # nb_updated_list = [] | |||
| # nb_updated_k_list = [] | |||
| g_best = [] | |||
| for nb_median in nb_median_range: | |||
| print('\n-------------------------------------------------------') | |||
| print('number of median graphs =', nb_median) | |||
| random.seed(1) | |||
| idx_rdm = random.sample(range(len(Gn)), nb_median) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn[idx].copy() for idx in idx_rdm] | |||
| Gn_candidate = [g.copy() for g in Gn] | |||
| # for g in Gn_median: | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| ## plt.savefig("results/preimage_mix/mutag.png", format="PNG") | |||
| # plt.show() | |||
| # plt.clf() | |||
| ################################################################### | |||
| # gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| # km_tmp = gmfile['gm'] | |||
| # time_km = gmfile['gmtime'] | |||
| # # modify mixed gram matrix. | |||
| # km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| # for i in range(len(Gn)): | |||
| # for j in range(i, len(Gn)): | |||
| # km[i, j] = km_tmp[i, j] | |||
| # km[j, i] = km[i, j] | |||
| # for i in range(len(Gn)): | |||
| # for j, idx in enumerate(idx_rdm): | |||
| # km[i, len(Gn) + j] = km[i, idx] | |||
| # km[len(Gn) + j, i] = km[i, idx] | |||
| # for i, idx1 in enumerate(idx_rdm): | |||
| # for j, idx2 in enumerate(idx_rdm): | |||
| # km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
| ################################################################### | |||
| alpha_range = [1 / nb_median] * nb_median | |||
| time0 = time.time() | |||
# iam_upgraded returns five values at the other call sites in this file;
# unpack accordingly (assumption based on those call sites) and keep the two used here.
ghat_new_list, sod_min, _, _, _ = iam_upgraded(Gn_median, Gn_candidate,
c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list.append(time_total) | |||
| # compute distance between \psi and the new generated graphs. | |||
| knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(ghat_new_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), | |||
| len(ghat_new_list) + len(Gn_median) + 1), | |||
| alpha_range, knew, withterm3=False)) | |||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||
| dis_ks_min_list.append(dhat_new_list[0]) | |||
| g_best.append(ghat_new_list[0]) | |||
| # show the best graph and save it to file. | |||
| # print('the shortest distance is', dhat) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(ghat_new_list[0], labels=nx.get_node_attributes(ghat_new_list[0], 'atom'), | |||
| with_labels=True) | |||
| plt.show() | |||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||
| plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) + | |||
| '.png', format="PNG") | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| sod_gs_list.append(sod_min) | |||
| # sod_gs_min_list.append(np.min(sod_min)) | |||
| print('\nsmallest sod in graph space: ', sod_min) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| # print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each set of median graphs: ', | |||
| dis_ks_min_list) | |||
| # print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', | |||
| # nb_updated_list) | |||
| # print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', | |||
| # nb_updated_k_list) | |||
| print('\ntimes:', time_list) | |||
| def test_iam_letter_h(): | |||
| from median import draw_Letter_graph | |||
| ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||
| 'extra_params': {}} # node nsymb | |||
| # ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', | |||
| # 'extra_params': {}} # node nsymb | |||
| # Gn = Gn[0:50] | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| gkernel = 'structuralspkernel' | |||
| # parameters for GED function from the IAM paper. | |||
| c_vi = 3 | |||
| c_vr = 3 | |||
| c_vs = 1 | |||
| c_ei = 3 | |||
| c_er = 3 | |||
| c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
| # parameters for IAM function | |||
| # ged_cost = 'CONSTANT' | |||
| ged_cost = 'LETTER' | |||
| ged_method = 'IPFP' | |||
| # edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| edit_cost_constant = [] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
| # classify graphs according to letters. | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| g_best = [] | |||
| sod_set_median_list = [] | |||
| idx_dict = get_same_item_indices(y_all) | |||
| for letter in idx_dict: | |||
| print('\n-------------------------------------------------------') | |||
| print('letter', letter) | |||
| Gn_let = [Gn[i].copy() for i in idx_dict[letter]] | |||
| time_list.append([]) | |||
| dis_ks_min_list.append([]) | |||
| sod_gs_list.append([]) | |||
| g_best.append([]) | |||
| sod_set_median_list.append([]) | |||
| for repeat in range(50): | |||
| idx_rdm = random.sample(range(len(Gn_let)), 50) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn_let[idx].copy() for idx in idx_rdm] | |||
| Gn_candidate = [g.copy() for g in Gn_median] | |||
| alpha_range = [1 / len(Gn_median)] * len(Gn_median) | |||
| time0 = time.time() | |||
# iam_upgraded returns five values at the other call sites in this file;
# unpack accordingly (assumption based on those call sites).
ghat_new_list, sod_min, _, _, sod_set_median = iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list[-1].append(time_total) | |||
| g_best[-1].append(ghat_new_list[0]) | |||
| sod_set_median_list[-1].append(sod_set_median) | |||
| print('\nsmallest sod of the set median:', sod_set_median) | |||
| sod_gs_list[-1].append(sod_min) | |||
| print('\nsmallest sod in graph space:', sod_min) | |||
| # show the best graph and save it to file. | |||
| print('one of the possible corresponding pre-images is') | |||
| draw_Letter_graph(ghat_new_list[0], savepath='results/iam/paper_compare/') | |||
| # compute distance between \psi and the new generated graphs. | |||
| knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(ghat_new_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), | |||
| len(ghat_new_list) + len(Gn_median) + 1), | |||
| alpha_range, knew, withterm3=False)) | |||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||
| dis_ks_min_list[-1].append(dhat_new_list[0]) | |||
| print('\nsods of the set median for this letter:', sod_set_median_list[-1]) | |||
| print('\nsods in graph space for this letter:', sod_gs_list[-1]) | |||
| print('\nsmallest distances in kernel space for this letter:', | |||
| dis_ks_min_list[-1]) | |||
| print('\ntimes for this letter:', time_list[-1]) | |||
| sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) | |||
| sod_gs_list[-1] = np.mean(sod_gs_list[-1]) | |||
| dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) | |||
| time_list[-1] = np.mean(time_list[-1]) | |||
| print('\nmean sods of the set median for each letter:', sod_set_median_list) | |||
| print('\nmean sods in graph space for each letter:', sod_gs_list) | |||
| print('\nmean smallest distances in kernel space for each letter:', | |||
| dis_ks_min_list) | |||
| print('\nmean times for each letter:', time_list) | |||
| print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) | |||
| print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) | |||
| print('\nmean smallest distances in kernel space of all:', | |||
| np.mean(dis_ks_min_list)) | |||
| print('\nmean times of all:', np.mean(time_list)) | |||
| def test_iam_fitdistance(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| # remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
# lmbda = 0.03 # termination probability
| # # parameters for GED function | |||
| # c_vi = 0.037 | |||
| # c_vr = 0.038 | |||
| # c_vs = 0.075 | |||
| # c_ei = 0.001 | |||
| # c_er = 0.001 | |||
| # c_es = 0.0 | |||
| # ite_max_iam = 50 | |||
| # epsilon_iam = 0.001 | |||
| # removeNodes = False | |||
| # connected_iam = False | |||
| # # parameters for IAM function | |||
| # ged_cost = 'CONSTANT' | |||
| # ged_method = 'IPFP' | |||
| # edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| # ged_stabilizer = 'min' | |||
| # ged_repeat = 50 | |||
| # params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| # 'edit_cost_constant': edit_cost_constant, | |||
| # 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
| # parameters for GED function | |||
| c_vi = 4 | |||
| c_vr = 4 | |||
| c_vs = 2 | |||
| c_ei = 1 | |||
| c_er = 1 | |||
| c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
| # parameters for IAM function | |||
| ged_cost = 'CHEM_1' | |||
| ged_method = 'IPFP' | |||
| edit_cost_constant = [] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
# select all the graphs classified into the positive group (label 1).
| idx_dict = get_same_item_indices(y_all) | |||
| Gn = [Gn[i] for i in idx_dict[1]] | |||
# number of graphs; we want to compute the median of these graphs.
| # nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| nb_median_range = [10] | |||
| # # compute Gram matrix. | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # # write Gram matrix to file. | |||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| dis_ks_gen_median_list = [] | |||
| sod_gs_list = [] | |||
| # sod_gs_min_list = [] | |||
| # nb_updated_list = [] | |||
| # nb_updated_k_list = [] | |||
| g_best = [] | |||
| for nb_median in nb_median_range: | |||
| print('\n-------------------------------------------------------') | |||
| print('number of median graphs =', nb_median) | |||
| random.seed(1) | |||
| idx_rdm = random.sample(range(len(Gn)), nb_median) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn[idx].copy() for idx in idx_rdm] | |||
| Gn_candidate = [g.copy() for g in Gn_median] | |||
| # for g in Gn_median: | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| ## plt.savefig("results/preimage_mix/mutag.png", format="PNG") | |||
| # plt.show() | |||
| # plt.clf() | |||
| ################################################################### | |||
| # gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| # km_tmp = gmfile['gm'] | |||
| # time_km = gmfile['gmtime'] | |||
| # # modify mixed gram matrix. | |||
| # km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| # for i in range(len(Gn)): | |||
| # for j in range(i, len(Gn)): | |||
| # km[i, j] = km_tmp[i, j] | |||
| # km[j, i] = km[i, j] | |||
| # for i in range(len(Gn)): | |||
| # for j, idx in enumerate(idx_rdm): | |||
| # km[i, len(Gn) + j] = km[i, idx] | |||
| # km[len(Gn) + j, i] = km[i, idx] | |||
| # for i, idx1 in enumerate(idx_rdm): | |||
| # for j, idx2 in enumerate(idx_rdm): | |||
| # km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
| ################################################################### | |||
| alpha_range = [1 / nb_median] * nb_median | |||
| time0 = time.time() | |||
| G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ | |||
| = iam_upgraded(Gn_median, Gn_candidate, | |||
| c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list.append(time_total) | |||
| # compute distance between \psi and the new generated graphs. | |||
| knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, | |||
| edge_label, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(G_gen_median_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
# the index range covers exactly the len(Gn_median) median graphs appended
# after G_gen_median_list in knew, matching the length of alpha_range.
dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
len(G_gen_median_list) + len(Gn_median)),
alpha_range, knew, withterm3=False))
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||
| dis_ks_min_list.append(dhat_new_list[0]) | |||
| g_best.append(G_gen_median_list[0]) | |||
| # show the best graph and save it to file. | |||
| # print('the shortest distance is', dhat) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), | |||
| with_labels=True) | |||
| plt.show() | |||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||
| # plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) + | |||
| # '.png', format="PNG") | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| sod_gs_list.append(sod_gen_median) | |||
| # sod_gs_min_list.append(np.min(sod_gen_median)) | |||
| print('\nsmallest sod in graph space: ', sod_gen_median) | |||
| print('\nsmallest sod of set median in graph space: ', sod_set_median) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| # print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each set of median graphs: ', | |||
| dis_ks_min_list) | |||
| # print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', | |||
| # nb_updated_list) | |||
| # print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', | |||
| # nb_updated_k_list) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| if __name__ == '__main__': | |||
| ############################################################################### | |||
| # tests on different numbers of median-sets. | |||
| # test_iam_median_nb() | |||
| # test_iam_letter_h() | |||
| # test_iam_monoterpenoides() | |||
| # test_iam_mutag() | |||
| # test_iam_fitdistance() | |||
| # print("test log") | |||
| test_iam_monoterpenoides_with_init40() | |||
| @@ -1,462 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Mon Dec 16 11:53:54 2019 | |||
| @author: ljia | |||
| """ | |||
import sys
import os
import numpy as np
import math
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| import time | |||
| import random | |||
| from tqdm import tqdm | |||
| from itertools import combinations, islice | |||
| import multiprocessing | |||
| from multiprocessing import Pool | |||
| from functools import partial | |||
| from gklearn.utils.graphfiles import loadDataset, loadGXL | |||
| #from gklearn.utils.logger2file import * | |||
| from gklearn.preimage.iam import iam_upgraded, iam_bash | |||
| from gklearn.preimage.utils import compute_kernel, dis_gstar, kernel_distance_matrix | |||
| from gklearn.preimage.fitDistance import fit_GED_to_kernel_distance | |||
| #from gklearn.preimage.ged import ged_median | |||
| def fit_edit_cost_constants(fit_method, edit_cost_name, | |||
| edit_cost_constants=None, initial_solutions=1, | |||
| Gn_median=None, node_label=None, edge_label=None, | |||
| gkernel=None, dataset=None, init_ecc=None, | |||
| Gn=None, Kmatrix_median=None): | |||
| """fit edit cost constants. | |||
| """ | |||
| if fit_method == 'random': # random | |||
| if edit_cost_name == 'LETTER': | |||
| edit_cost_constants = random.sample(range(1, 10), 3) | |||
| edit_cost_constants = [item * 0.1 for item in edit_cost_constants] | |||
| elif edit_cost_name == 'LETTER2': | |||
| random.seed(time.time()) | |||
| edit_cost_constants = random.sample(range(1, 10), 5) | |||
| # edit_cost_constants = [item * 0.1 for item in edit_cost_constants] | |||
| elif edit_cost_name == 'NON_SYMBOLIC': | |||
| edit_cost_constants = random.sample(range(1, 10), 6) | |||
| if Gn_median[0].graph['node_attrs'] == []: | |||
| edit_cost_constants[2] = 0 | |||
| if Gn_median[0].graph['edge_attrs'] == []: | |||
| edit_cost_constants[5] = 0 | |||
| else: | |||
| edit_cost_constants = random.sample(range(1, 10), 6) | |||
| print('edit cost constants used:', edit_cost_constants) | |||
| elif fit_method == 'expert': # expert | |||
| if init_ecc is None: | |||
| if edit_cost_name == 'LETTER': | |||
| edit_cost_constants = [0.9, 1.7, 0.75] | |||
| elif edit_cost_name == 'LETTER2': | |||
| edit_cost_constants = [0.675, 0.675, 0.75, 0.425, 0.425] | |||
| else: | |||
| edit_cost_constants = [3, 3, 1, 3, 3, 1] | |||
| else: | |||
| edit_cost_constants = init_ecc | |||
| elif fit_method == 'k-graphs': | |||
| itr_max = 6 | |||
| if init_ecc is None: | |||
| if edit_cost_name == 'LETTER': | |||
| init_costs = [0.9, 1.7, 0.75] | |||
| elif edit_cost_name == 'LETTER2': | |||
| init_costs = [0.675, 0.675, 0.75, 0.425, 0.425] | |||
| elif edit_cost_name == 'NON_SYMBOLIC': | |||
| init_costs = [0, 0, 1, 1, 1, 0] | |||
| if Gn_median[0].graph['node_attrs'] == []: | |||
| init_costs[2] = 0 | |||
| if Gn_median[0].graph['edge_attrs'] == []: | |||
| init_costs[5] = 0 | |||
| else: | |||
| init_costs = [3, 3, 1, 3, 3, 1] | |||
| else: | |||
| init_costs = init_ecc | |||
| algo_options = '--threads 1 --initial-solutions ' \ | |||
| + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'lib': 'gedlibpy', 'cost': edit_cost_name, 'method': 'IPFP', | |||
| 'algo_options': algo_options, 'stabilizer': None} | |||
| # fit on k-graph subset | |||
| edit_cost_constants, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn_median, | |||
| node_label, edge_label, gkernel, itr_max, params_ged=params_ged, | |||
| init_costs=init_costs, dataset=dataset, Kmatrix=Kmatrix_median, | |||
| parallel=True) | |||
| elif fit_method == 'whole-dataset': | |||
| itr_max = 6 | |||
| if init_ecc is None: | |||
| if edit_cost_name == 'LETTER': | |||
| init_costs = [0.9, 1.7, 0.75] | |||
| elif edit_cost_name == 'LETTER2': | |||
| init_costs = [0.675, 0.675, 0.75, 0.425, 0.425] | |||
| else: | |||
| init_costs = [3, 3, 1, 3, 3, 1] | |||
| else: | |||
| init_costs = init_ecc | |||
| algo_options = '--threads 1 --initial-solutions ' \ | |||
| + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'lib': 'gedlibpy', 'cost': edit_cost_name, 'method': 'IPFP', | |||
| 'algo_options': algo_options, 'stabilizer': None} | |||
| # fit on all subset | |||
| edit_cost_constants, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn, | |||
| node_label, edge_label, gkernel, itr_max, params_ged=params_ged, | |||
| init_costs=init_costs, dataset=dataset, parallel=True) | |||
| elif fit_method == 'precomputed': | |||
| pass | |||
| return edit_cost_constants | |||
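# Minimal usage sketch (illustrative, not part of the original experiments):
# the 'expert' method needs no graphs, kernel or dataset, so it can be called
# standalone, while 'k-graphs' / 'whole-dataset' additionally need
# Gn_median / Gn, the labels, a kernel name and a dataset name.
# ecc = fit_edit_cost_constants('expert', 'LETTER2')
# print(ecc) # [0.675, 0.675, 0.75, 0.425, 0.425]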
| def compute_distances_to_true_median(Gn_median, fname_sm, fname_gm, | |||
| gkernel, edit_cost_name, | |||
| Kmatrix_median=None): | |||
| # reform graphs. | |||
| set_median = loadGXL(fname_sm) | |||
| gen_median = loadGXL(fname_gm) | |||
| # print(gen_median.nodes(data=True)) | |||
| # print(gen_median.edges(data=True)) | |||
if edit_cost_name in ('LETTER', 'LETTER2', 'NON_SYMBOLIC'):
| # for g in Gn_median: | |||
| # reform_attributes(g) | |||
| reform_attributes(set_median, Gn_median[0].graph['node_attrs'], | |||
| Gn_median[0].graph['edge_attrs']) | |||
| reform_attributes(gen_median, Gn_median[0].graph['node_attrs'], | |||
| Gn_median[0].graph['edge_attrs']) | |||
if edit_cost_name in ('LETTER', 'LETTER2', 'NON_SYMBOLIC'):
| node_label = None | |||
| edge_label = None | |||
| else: | |||
| node_label = 'chem' | |||
| edge_label = 'valence' | |||
| # compute Gram matrix for median set. | |||
| if Kmatrix_median is None: | |||
| Kmatrix_median = compute_kernel(Gn_median, gkernel, node_label, edge_label, False) | |||
| # compute distance in kernel space for set median. | |||
| kernel_sm = [] | |||
| for G_median in Gn_median: | |||
| km_tmp = compute_kernel([set_median, G_median], gkernel, node_label, edge_label, False) | |||
| kernel_sm.append(km_tmp[0, 1]) | |||
| Kmatrix_sm = np.concatenate((np.array([kernel_sm]), np.copy(Kmatrix_median)), axis=0) | |||
| Kmatrix_sm = np.concatenate((np.array([[km_tmp[0, 0]] + kernel_sm]).T, Kmatrix_sm), axis=1) | |||
| # Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel, | |||
| # node_label, edge_label, False) | |||
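# The two concatenations above border the n x n Gram matrix of the median
# set with the set median's kernel values, so index 0 of Kmatrix_sm is the
# set median and indices 1..n are the median graphs (km_tmp[0, 0] is
# k(set_median, set_median)). dis_gstar(0, ...) then evaluates, roughly,
# sqrt(k(g, g) - 2 * sum_i alpha_i * k(g, g_i)), the kernel-space distance
# to the uniformly weighted mean with the constant third term dropped
# (withterm3=False).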
| dis_k_sm = dis_gstar(0, range(1, 1+len(Gn_median)), | |||
| [1 / len(Gn_median)] * len(Gn_median), Kmatrix_sm, withterm3=False) | |||
| # print(gen_median.nodes(data=True)) | |||
| # print(gen_median.edges(data=True)) | |||
| # print(set_median.nodes(data=True)) | |||
| # print(set_median.edges(data=True)) | |||
| # compute distance in kernel space for generalized median. | |||
| kernel_gm = [] | |||
| for G_median in Gn_median: | |||
| km_tmp = compute_kernel([gen_median, G_median], gkernel, node_label, edge_label, False) | |||
| kernel_gm.append(km_tmp[0, 1]) | |||
| Kmatrix_gm = np.concatenate((np.array([kernel_gm]), np.copy(Kmatrix_median)), axis=0) | |||
| Kmatrix_gm = np.concatenate((np.array([[km_tmp[0, 0]] + kernel_gm]).T, Kmatrix_gm), axis=1) | |||
| # Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel, | |||
| # node_label, edge_label, False) | |||
| dis_k_gm = dis_gstar(0, range(1, 1+len(Gn_median)), | |||
| [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False) | |||
| # compute distance in kernel space for each graph in median set. | |||
| dis_k_gi = [] | |||
| for idx in range(len(Gn_median)): | |||
| dis_k_gi.append(dis_gstar(idx+1, range(1, 1+len(Gn_median)), | |||
| [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False)) | |||
| print('dis_k_sm:', dis_k_sm) | |||
| print('dis_k_gm:', dis_k_gm) | |||
| print('dis_k_gi:', dis_k_gi) | |||
| idx_dis_k_gi_min = np.argmin(dis_k_gi) | |||
| dis_k_gi_min = dis_k_gi[idx_dis_k_gi_min] | |||
| print('min dis_k_gi:', dis_k_gi_min) | |||
| return dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min | |||
| def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method, | |||
| graph_dir=None, initial_solutions=1, | |||
| edit_cost_constants=None, group_min=None, | |||
| dataset=None, edit_cost_name=None, init_ecc=None, | |||
| Kmatrix=None, parallel=True): | |||
| # dataset = dataset.lower() | |||
| # # compute distances in kernel space. | |||
| # dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, | |||
| # Kmatrix=None, gkernel=gkernel) | |||
| # # ged. | |||
| # gmfile = np.load('results/test_k_closest_graphs/ged_mat.fit_on_whole_dataset.with_medians.gm.npz') | |||
| # ged_mat = gmfile['ged_mat'] | |||
| # dis_mat = ged_mat[0:len(Gn), 0:len(Gn)] | |||
| # # choose k closest graphs | |||
| # time0 = time.time() | |||
| # sod_ks_min, group_min = get_closest_k_graphs(dis_mat, k, parallel) | |||
| # time_spent = time.time() - time0 | |||
| # print('closest graphs:', sod_ks_min, group_min) | |||
| # print('time spent:', time_spent) | |||
| # group_min = (12, 13, 22, 29) # closest w.r.t path kernel | |||
| # group_min = (77, 85, 160, 171) # closest w.r.t ged | |||
| # group_min = (0,1,2,3,4,5,6,7,8,9,10,11) # closest w.r.t treelet kernel | |||
| Gn_median = [Gn[g].copy() for g in group_min] | |||
| if Kmatrix is not None: | |||
| Kmatrix_median = np.copy(Kmatrix[group_min,:]) | |||
| Kmatrix_median = Kmatrix_median[:,group_min] | |||
| else: | |||
| Kmatrix_median = None | |||
| # 1. fit edit cost constants. | |||
| time0 = time.time() | |||
| edit_cost_constants = fit_edit_cost_constants(fit_method, edit_cost_name, | |||
| edit_cost_constants=edit_cost_constants, initial_solutions=initial_solutions, | |||
| Gn_median=Gn_median, node_label=node_label, edge_label=edge_label, | |||
| gkernel=gkernel, dataset=dataset, init_ecc=init_ecc, | |||
| Gn=Gn, Kmatrix_median=Kmatrix_median) | |||
| time_fitting = time.time() - time0 | |||
| # 2. compute set median and gen median using IAM (C++ through bash). | |||
| print('\nstart computing set median and gen median using IAM (C++ through bash)...\n') | |||
| group_fnames = [Gn[g].graph['filename'] for g in group_min] | |||
| time0 = time.time() | |||
| sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constants, | |||
| cost=edit_cost_name, initial_solutions=initial_solutions, | |||
| graph_dir=graph_dir, dataset=dataset) | |||
| time_generating = time.time() - time0 | |||
| print('\nmedians computed.\n') | |||
| # 3. compute distances to real median. | |||
| print('\nstart computing distances to true median....\n') | |||
| Gn_median = [Gn[g].copy() for g in group_min] | |||
| dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min = \ | |||
| compute_distances_to_true_median(Gn_median, fname_sm, fname_gm, | |||
| gkernel, edit_cost_name, | |||
| Kmatrix_median=Kmatrix_median) | |||
| idx_dis_k_gi_min = group_min[idx_dis_k_gi_min] | |||
| print('index min dis_k_gi:', idx_dis_k_gi_min) | |||
| print('sod_sm:', sod_sm) | |||
| print('sod_gm:', sod_gm) | |||
| # collect return values. | |||
| return (sod_sm, sod_gm), \ | |||
| (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \ | |||
| (time_fitting, time_generating) | |||
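# Hypothetical call sketch (not from the original source): compute medians of
# an explicitly chosen group of graphs with expert LETTER2 costs. group_min
# must index into Gn, and each graph needs a 'filename' graph attribute for
# iam_bash.
# sods, dis_ks, times = median_on_k_closest_graphs(
# Gn, None, None, 'structuralspkernel', 4, 'expert',
# graph_dir='gxl/', group_min=(0, 1, 2, 3),
# dataset='Letter-high', edit_cost_name='LETTER2')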
def reform_attributes(G, na_names=[], ea_names=[]):
"""Stack the named node/edge attribute values into a single 'attributes'
list per node/edge, as expected by the LETTER/NON_SYMBOLIC edit costs.
"""
if na_names:
for node in G.nodes:
G.nodes[node]['attributes'] = [G.nodes[node][a_name] for a_name in na_names]
if ea_names:
for edge in G.edges:
G.edges[edge]['attributes'] = [G.edges[edge][a_name] for a_name in ea_names]
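# Example of the reshaping (hypothetical attributes, for illustration): a node
# {'x': 0.1, 'y': 0.2} becomes {'x': 0.1, 'y': 0.2, 'attributes': [0.1, 0.2]}
# after reform_attributes(G, na_names=['x', 'y']).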
| def get_closest_k_graphs(dis_mat, k, parallel): | |||
k_graph_groups = combinations(range(0, len(dis_mat)), k)
sod_ks_min = np.inf
group_min = None
| if parallel: | |||
| len_combination = get_combination_length(len(dis_mat), k) | |||
| len_itr_max = int(len_combination if len_combination < 1e7 else 1e7) | |||
| # pos_cur = 0 | |||
| graph_groups_slices = split_iterable(k_graph_groups, len_itr_max, len_combination) | |||
| for graph_groups_cur in graph_groups_slices: | |||
| # while True: | |||
| # graph_groups_cur = islice(k_graph_groups, pos_cur, pos_cur + len_itr_max) | |||
| graph_groups_cur_list = list(graph_groups_cur) | |||
| print('current position:', graph_groups_cur_list[0]) | |||
| len_itr_cur = len(graph_groups_cur_list) | |||
| # if len_itr_cur < len_itr_max: | |||
| # break | |||
| itr = zip(graph_groups_cur_list, range(0, len_itr_cur)) | |||
| sod_k_list = np.empty(len_itr_cur) | |||
| graphs_list = [None] * len_itr_cur | |||
n_jobs = multiprocessing.cpu_count()
chunksize = int(len_itr_max / n_jobs + 1)
| def init_worker(dis_mat_toshare): | |||
| global G_dis_mat | |||
| G_dis_mat = dis_mat_toshare | |||
| pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(dis_mat,)) | |||
| # iterator = tqdm(pool.imap_unordered(_get_closest_k_graphs_parallel, | |||
| # itr, chunksize), | |||
| # desc='Choosing k closest graphs', file=sys.stdout) | |||
| iterator = pool.imap_unordered(_get_closest_k_graphs_parallel, itr, chunksize) | |||
| for graphs, i, sod_ks in iterator: | |||
| sod_k_list[i] = sod_ks | |||
| graphs_list[i] = graphs | |||
| pool.close() | |||
| pool.join() | |||
| arg_min = np.argmin(sod_k_list) | |||
| sod_ks_cur = sod_k_list[arg_min] | |||
| group_cur = graphs_list[arg_min] | |||
| if sod_ks_cur < sod_ks_min: | |||
| sod_ks_min = sod_ks_cur | |||
| group_min = group_cur | |||
| print('get closer graphs:', sod_ks_min, group_min) | |||
| else: | |||
| for items in tqdm(k_graph_groups, desc='Choosing k closest graphs', file=sys.stdout): | |||
| # if items[0] != itmp: | |||
| # itmp = items[0] | |||
| # print(items) | |||
| k_graph_pairs = combinations(items, 2) | |||
| sod_ks = 0 | |||
| for i1, i2 in k_graph_pairs: | |||
| sod_ks += dis_mat[i1, i2] | |||
| if sod_ks < sod_ks_min: | |||
| sod_ks_min = sod_ks | |||
| group_min = items | |||
| print('get closer graphs:', sod_ks_min, group_min) | |||
| return sod_ks_min, group_min | |||
| def _get_closest_k_graphs_parallel(itr): | |||
| k_graph_pairs = combinations(itr[0], 2) | |||
| sod_ks = 0 | |||
| for i1, i2 in k_graph_pairs: | |||
| sod_ks += G_dis_mat[i1, i2] | |||
| return itr[0], itr[1], sod_ks | |||
| def split_iterable(iterable, n, len_iter): | |||
| it = iter(iterable) | |||
| for i in range(0, len_iter, n): | |||
| piece = islice(it, n) | |||
| yield piece | |||
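# Sanity-check sketch: split_iterable(range(10), 4, 10) yields islice objects
# covering (0..3), (4..7), (8..9); each piece must be consumed before the next
# one is requested, since all slices share the same underlying iterator.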
| def get_combination_length(n, k): | |||
| len_combination = 1 | |||
| for i in range(n, n - k, -1): | |||
| len_combination *= i | |||
| return int(len_combination / math.factorial(k)) | |||
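# get_combination_length computes the binomial coefficient
# C(n, k) = n * (n - 1) * ... * (n - k + 1) / k!, e.g.
# get_combination_length(5, 2) == 10, the number of groups produced by
# combinations(range(5), 2).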
| ############################################################################### | |||
| def test_k_closest_graphs(): | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| # gkernel = 'untilhpathkernel' | |||
| # gkernel = 'weisfeilerlehmankernel' | |||
| gkernel = 'treeletkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| k = 5 | |||
| edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297] | |||
| # sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \ | |||
| # = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, | |||
| # 'precomputed', edit_costs=edit_costs, | |||
| ## 'k-graphs', | |||
| # parallel=False) | |||
| # | |||
| # sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \ | |||
| # = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, | |||
| # 'expert', parallel=False) | |||
# median_on_k_closest_graphs returns three tuples (sods, kernel distances,
# timings); note that group_min and edit_cost_name must also be supplied
# for this call to run as-is.
(sod_sm, sod_gm), (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), _ \
= median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
'expert', parallel=False)
| return | |||
| def test_k_closest_graphs_with_cv(): | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| k = 4 | |||
| y_all = ['3', '1', '4', '6', '7', '8', '9', '2'] | |||
| repeats = 50 | |||
| collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/' | |||
| graph_dir = collection_path + 'gxl/' | |||
| sod_sm_list = [] | |||
| sod_gm_list = [] | |||
| dis_k_sm_list = [] | |||
| dis_k_gm_list = [] | |||
| dis_k_gi_min_list = [] | |||
| for y in y_all: | |||
| print('\n-------------------------------------------------------') | |||
| print('class of y:', y) | |||
| sod_sm_list.append([]) | |||
| sod_gm_list.append([]) | |||
| dis_k_sm_list.append([]) | |||
| dis_k_gm_list.append([]) | |||
| dis_k_gi_min_list.append([]) | |||
| for repeat in range(repeats): | |||
| print('\nrepeat ', repeat) | |||
| collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml' | |||
| Gn, _ = loadDataset(collection_file, extra_params=graph_dir) | |||
(sod_sm, sod_gm), (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, _), _ \
= median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel,
k, 'whole-dataset', graph_dir=graph_dir,
parallel=False)
| sod_sm_list[-1].append(sod_sm) | |||
| sod_gm_list[-1].append(sod_gm) | |||
| dis_k_sm_list[-1].append(dis_k_sm) | |||
| dis_k_gm_list[-1].append(dis_k_gm) | |||
| dis_k_gi_min_list[-1].append(dis_k_gi_min) | |||
| print('\nsods of the set median for this class:', sod_sm_list[-1]) | |||
| print('\nsods of the gen median for this class:', sod_gm_list[-1]) | |||
| print('\ndistances in kernel space of set median for this class:', | |||
| dis_k_sm_list[-1]) | |||
| print('\ndistances in kernel space of gen median for this class:', | |||
| dis_k_gm_list[-1]) | |||
| print('\ndistances in kernel space of min graph for this class:', | |||
| dis_k_gi_min_list[-1]) | |||
| sod_sm_list[-1] = np.mean(sod_sm_list[-1]) | |||
| sod_gm_list[-1] = np.mean(sod_gm_list[-1]) | |||
| dis_k_sm_list[-1] = np.mean(dis_k_sm_list[-1]) | |||
| dis_k_gm_list[-1] = np.mean(dis_k_gm_list[-1]) | |||
| dis_k_gi_min_list[-1] = np.mean(dis_k_gi_min_list[-1]) | |||
| print() | |||
| print('\nmean sods of the set median for each class:', sod_sm_list) | |||
| print('\nmean sods of the gen median for each class:', sod_gm_list) | |||
| print('\nmean distance in kernel space of set median for each class:', | |||
| dis_k_sm_list) | |||
| print('\nmean distances in kernel space of gen median for each class:', | |||
| dis_k_gm_list) | |||
| print('\nmean distances in kernel space of min graph for each class:', | |||
| dis_k_gi_min_list) | |||
| print('\nmean sods of the set median of all:', np.mean(sod_sm_list)) | |||
| print('\nmean sods of the gen median of all:', np.mean(sod_gm_list)) | |||
| print('\nmean distances in kernel space of set median of all:', | |||
| np.mean(dis_k_sm_list)) | |||
| print('\nmean distances in kernel space of gen median of all:', | |||
| np.mean(dis_k_gm_list)) | |||
| print('\nmean distances in kernel space of min graph of all:', | |||
| np.mean(dis_k_gi_min_list)) | |||
| return | |||
| if __name__ == '__main__': | |||
| test_k_closest_graphs() | |||
| # test_k_closest_graphs_with_cv() | |||
| @@ -1,91 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Mon Mar 16 17:26:40 2020 | |||
| @author: ljia | |||
| """ | |||
| def test_median_graph_estimator(): | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| from gklearn.preimage.median_graph_estimator import MedianGraphEstimator | |||
| from gklearn.gedlib import librariesImport, gedlibpy | |||
| from gklearn.preimage.utils import get_same_item_indices | |||
| from gklearn.preimage.ged import convertGraph | |||
| import multiprocessing | |||
| # estimator parameters. | |||
| init_type = 'MEDOID' | |||
| num_inits = 1 | |||
| threads = multiprocessing.cpu_count() | |||
| time_limit = 60000 | |||
| # algorithm parameters. | |||
| algo = 'IPFP' | |||
| initial_solutions = 40 | |||
| algo_options_suffix = ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1' | |||
| edit_cost_name = 'LETTER2' | |||
| edit_cost_constants = [0.02987291, 0.0178211, 0.01431966, 0.001, 0.001] | |||
ds_name = 'Letter-high' # matches the dataset loaded below and the LETTER2 cost.
| # Load dataset. | |||
| # dataset = '../../datasets/COIL-DEL/COIL-DEL_A.txt' | |||
| dataset = '../../datasets/Letter-high/Letter-high_A.txt' | |||
| Gn, y_all = loadDataset(dataset) | |||
| y_idx = get_same_item_indices(y_all) | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
| Gn_i = [Gn[val] for val in values] | |||
| break | |||
| # Set up the environment. | |||
| ged_env = gedlibpy.GEDEnv() | |||
| # gedlibpy.restart_env() | |||
| ged_env.set_edit_cost(edit_cost_name, edit_cost_constant=edit_cost_constants) | |||
| for G in Gn_i: | |||
| ged_env.add_nx_graph(convertGraph(G, edit_cost_name), '') | |||
| graph_ids = ged_env.get_all_graph_ids() | |||
| set_median_id = ged_env.add_graph('set_median') | |||
| gen_median_id = ged_env.add_graph('gen_median') | |||
| ged_env.init(init_option='EAGER_WITHOUT_SHUFFLED_COPIES') | |||
| # Set up the estimator. | |||
| mge = MedianGraphEstimator(ged_env, constant_node_costs(edit_cost_name)) | |||
| mge.set_refine_method(algo, '--threads ' + str(threads) + ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1') | |||
| mge_options = '--time-limit ' + str(time_limit) + ' --stdout 2 --init-type ' + init_type | |||
| mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --refine FALSE'# @todo: std::to_string(rng()) | |||
| # Select the GED algorithm. | |||
| algo_options = '--threads ' + str(threads) + algo_options_suffix | |||
| mge.set_options(mge_options) | |||
| mge.set_init_method(algo, algo_options) | |||
| mge.set_descent_method(algo, algo_options) | |||
| # Run the estimator. | |||
| mge.run(graph_ids, set_median_id, gen_median_id) | |||
| # Get SODs. | |||
| sod_sm = mge.get_sum_of_distances('initialized') | |||
| sod_gm = mge.get_sum_of_distances('converged') | |||
| print('sod_sm, sod_gm: ', sod_sm, sod_gm) | |||
| # Get median graphs. | |||
| set_median = ged_env.get_nx_graph(set_median_id) | |||
| gen_median = ged_env.get_nx_graph(gen_median_id) | |||
| return set_median, gen_median | |||
| def constant_node_costs(edit_cost_name): | |||
if edit_cost_name in ('NON_SYMBOLIC', 'LETTER2', 'LETTER'):
| return False | |||
| # elif edit_cost_name != '': | |||
| # # throw ged::Error("Invalid dataset " + dataset + ". Usage: ./median_tests <AIDS|Mutagenicity|Letter-high|Letter-med|Letter-low|monoterpenoides|SYNTHETICnew|Fingerprint|COIL-DEL>"); | |||
| # return False | |||
| # return True | |||
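# constant_node_costs tells the median estimator whether node substitution
# costs are constant: for LETTER / LETTER2 / NON_SYMBOLIC the substitution
# cost depends on the continuous attribute distance, hence False. This
# mirrors the corresponding helper in GEDLIB's median tests, as the
# commented-out error message above suggests.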
| if __name__ == '__main__': | |||
| set_median, gen_median = test_median_graph_estimator() | |||
| @@ -1,686 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Jul 4 12:20:16 2019 | |||
| @author: ljia | |||
| """ | |||
import sys
import numpy as np
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| import time | |||
| from tqdm import tqdm | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| from gklearn.preimage.median import draw_Letter_graph | |||
| from gklearn.preimage.ged import GED, ged_median | |||
| from gklearn.preimage.utils import get_same_item_indices, compute_kernel, gram2distances, \ | |||
| dis_gstar, remove_edges | |||
| # --------------------------- These are tests --------------------------------# | |||
| def test_who_is_the_closest_in_kernel_space(Gn): | |||
| idx_gi = [0, 6] | |||
| g1 = Gn[idx_gi[0]] | |||
| g2 = Gn[idx_gi[1]] | |||
| # create the "median" graph. | |||
| gnew = g2.copy() | |||
| gnew.remove_node(0) | |||
| nx.draw_networkx(gnew) | |||
| plt.show() | |||
| print(gnew.nodes(data=True)) | |||
| Gn = [gnew] + Gn | |||
| # compute gram matrix | |||
| Kmatrix = compute_kernel(Gn, 'untilhpathkernel', True) | |||
| # the distance matrix | |||
| dmatrix = gram2distances(Kmatrix) | |||
| print(np.sort(dmatrix[idx_gi[0] + 1])) | |||
| print(np.argsort(dmatrix[idx_gi[0] + 1])) | |||
| print(np.sort(dmatrix[idx_gi[1] + 1])) | |||
| print(np.argsort(dmatrix[idx_gi[1] + 1])) | |||
| # for all g in Gn, compute (d(g1, g) + d(g2, g)) / 2 | |||
| dis_median = [(dmatrix[i, idx_gi[0] + 1] + dmatrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))] | |||
| print(np.sort(dis_median)) | |||
| print(np.argsort(dis_median)) | |||
| return | |||
| def test_who_is_the_closest_in_GED_space(Gn): | |||
| idx_gi = [0, 6] | |||
| g1 = Gn[idx_gi[0]] | |||
| g2 = Gn[idx_gi[1]] | |||
| # create the "median" graph. | |||
| gnew = g2.copy() | |||
| gnew.remove_node(0) | |||
| nx.draw_networkx(gnew) | |||
| plt.show() | |||
| print(gnew.nodes(data=True)) | |||
| Gn = [gnew] + Gn | |||
| # compute GEDs | |||
| ged_matrix = np.zeros((len(Gn), len(Gn))) | |||
| for i1 in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): | |||
| for i2 in range(len(Gn)): | |||
| dis, _, _ = GED(Gn[i1], Gn[i2], lib='gedlib') | |||
| ged_matrix[i1, i2] = dis | |||
| print(np.sort(ged_matrix[idx_gi[0] + 1])) | |||
| print(np.argsort(ged_matrix[idx_gi[0] + 1])) | |||
| print(np.sort(ged_matrix[idx_gi[1] + 1])) | |||
| print(np.argsort(ged_matrix[idx_gi[1] + 1])) | |||
| # for all g in Gn, compute (GED(g1, g) + GED(g2, g)) / 2 | |||
| dis_median = [(ged_matrix[i, idx_gi[0] + 1] + ged_matrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))] | |||
| print(np.sort(dis_median)) | |||
| print(np.argsort(dis_median)) | |||
| return | |||
def test_will_IAM_give_the_median_graph_we_wanted(Gn):
from iam import test_iam_with_more_graphs_as_init # as for the other iam helpers below
idx_gi = [0, 6]
| g1 = Gn[idx_gi[0]].copy() | |||
| g2 = Gn[idx_gi[1]].copy() | |||
| # del Gn[idx_gi[0]] | |||
| # del Gn[idx_gi[1] - 1] | |||
| g_median = test_iam_with_more_graphs_as_init([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1) | |||
| # g_median = test_iam_with_more_graphs_as_init(Gn, Gn, c_ei=1, c_er=1, c_es=1) | |||
| nx.draw_networkx(g_median) | |||
| plt.show() | |||
| print(g_median.nodes(data=True)) | |||
| print(g_median.edges(data=True)) | |||
def test_new_IAM_allGraph_deleteNodes(Gn):
from iam import test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations
idx_gi = [0, 6]
| # g1 = Gn[idx_gi[0]].copy() | |||
| # g2 = Gn[idx_gi[1]].copy() | |||
| # g1 = nx.Graph(name='haha') | |||
| # g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})]) | |||
| # g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})]) | |||
| # g2 = nx.Graph(name='hahaha') | |||
| # g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}), | |||
| # (3, {'atom': 'O'}), (4, {'atom': 'C'})]) | |||
| # g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), | |||
| # (2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})]) | |||
| g1 = nx.Graph(name='haha') | |||
| g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}), | |||
| (3, {'atom': 'S'}), (4, {'atom': 'S'})]) | |||
| g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), | |||
| (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})]) | |||
| g2 = nx.Graph(name='hahaha') | |||
| g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}), | |||
| (3, {'atom': 'O'}), (4, {'atom': 'O'})]) | |||
| g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), | |||
| (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})]) | |||
| # g2 = g1.copy() | |||
| # g2.add_nodes_from([(3, {'atom': 'O'})]) | |||
| # g2.add_nodes_from([(4, {'atom': 'C'})]) | |||
| # g2.add_edges_from([(1, 3, {'bond_type': '1'})]) | |||
| # g2.add_edges_from([(3, 4, {'bond_type': '1'})]) | |||
| # del Gn[idx_gi[0]] | |||
| # del Gn[idx_gi[1] - 1] | |||
| nx.draw_networkx(g1) | |||
| plt.show() | |||
| print(g1.nodes(data=True)) | |||
| print(g1.edges(data=True)) | |||
| nx.draw_networkx(g2) | |||
| plt.show() | |||
| print(g2.nodes(data=True)) | |||
| print(g2.edges(data=True)) | |||
| g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1) | |||
| # g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(Gn, Gn, c_ei=1, c_er=1, c_es=1) | |||
| nx.draw_networkx(g_median) | |||
| plt.show() | |||
| print(g_median.nodes(data=True)) | |||
| print(g_median.edges(data=True)) | |||
| def test_the_simple_two(Gn, gkernel): | |||
| from gk_iam import gk_iam_nearest_multi | |||
lmbda = 0.03 # termination probability
| r_max = 10 # recursions | |||
| l = 500 | |||
| alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 2 # k nearest neighbors | |||
| # randomly select two molecules | |||
| np.random.seed(1) | |||
| idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2) | |||
| g1 = Gn[idx_gi[0]] | |||
| g2 = Gn[idx_gi[1]] | |||
| Gn_mix = [g.copy() for g in Gn] | |||
| Gn_mix.append(g1.copy()) | |||
| Gn_mix.append(g2.copy()) | |||
| # g_tmp = iam([g1, g2]) | |||
| # nx.draw_networkx(g_tmp) | |||
| # plt.show() | |||
| # compute | |||
| # k_list = [] # kernel between each graph and itself. | |||
| # k_g1_list = [] # kernel between each graph and g1 | |||
| # k_g2_list = [] # kernel between each graph and g2 | |||
| # for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout): | |||
| # ktemp = compute_kernel([g, g1, g2], 'marginalizedkernel', False) | |||
| # k_list.append(ktemp[0][0, 0]) | |||
| # k_g1_list.append(ktemp[0][0, 1]) | |||
| # k_g2_list.append(ktemp[0][0, 2]) | |||
| km = compute_kernel(Gn_mix, gkernel, True) | |||
| # k_list = np.diag(km) # kernel between each graph and itself. | |||
| # k_g1_list = km[idx_gi[0]] # kernel between each graph and g1 | |||
| # k_g2_list = km[idx_gi[1]] # kernel between each graph and g2 | |||
| g_best = [] | |||
| dis_best = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('alpha =', alpha) | |||
| dhat, ghat_list = gk_iam_nearest_multi(Gn, [g1, g2], [alpha, 1 - alpha], | |||
| range(len(Gn), len(Gn) + 2), km, | |||
| k, r_max,gkernel) | |||
| dis_best.append(dhat) | |||
| g_best.append(ghat_list) | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', dis_best[idx]) | |||
| print('the corresponding pre-images are') | |||
| for g in g_best[idx]: | |||
| nx.draw_networkx(g) | |||
| plt.show() | |||
| print(g.nodes(data=True)) | |||
| print(g.edges(data=True)) | |||
| def test_remove_bests(Gn, gkernel): | |||
| from gk_iam import gk_iam_nearest_multi | |||
lmbda = 0.03 # termination probability
| r_max = 10 # recursions | |||
| l = 500 | |||
| alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 20 # k nearest neighbors | |||
| # randomly select two molecules | |||
| np.random.seed(1) | |||
| idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2) | |||
| g1 = Gn[idx_gi[0]] | |||
| g2 = Gn[idx_gi[1]] | |||
| # remove the best 2 graphs. | |||
| del Gn[idx_gi[0]] | |||
| del Gn[idx_gi[1] - 1] | |||
| # del Gn[8] | |||
| Gn_mix = [g.copy() for g in Gn] | |||
| Gn_mix.append(g1.copy()) | |||
| Gn_mix.append(g2.copy()) | |||
| # compute | |||
| km = compute_kernel(Gn_mix, gkernel, True) | |||
| g_best = [] | |||
| dis_best = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('alpha =', alpha) | |||
| dhat, ghat_list = gk_iam_nearest_multi(Gn, [g1, g2], [alpha, 1 - alpha], | |||
| range(len(Gn), len(Gn) + 2), km, | |||
| k, r_max, gkernel) | |||
| dis_best.append(dhat) | |||
| g_best.append(ghat_list) | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', dis_best[idx]) | |||
| print('the corresponding pre-images are') | |||
| for g in g_best[idx]: | |||
| draw_Letter_graph(g) | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| print(g.nodes(data=True)) | |||
| print(g.edges(data=True)) | |||
| ############################################################################### | |||
| # Tests on dataset Letter-H. | |||
| def test_gkiam_letter_h(): | |||
| from gk_iam import gk_iam_nearest_multi | |||
| ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||
| 'extra_params': {}} # node nsymb | |||
| # ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', | |||
| # 'extra_params': {}} # node nsymb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| gkernel = 'structuralspkernel' | |||
lmbda = 0.03 # termination probability
| r_max = 3 # recursions | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 10 # k nearest neighbors | |||
| # classify graphs according to letters. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| time_list = [] | |||
| sod_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list = [] | |||
| for letter in idx_dict: | |||
| print('\n-------------------------------------------------------\n') | |||
| Gn_let = [Gn[i].copy() for i in idx_dict[letter]] | |||
| Gn_mix = Gn_let + [g.copy() for g in Gn_let] | |||
| alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1) | |||
| # compute | |||
| time0 = time.time() | |||
| km = compute_kernel(Gn_mix, gkernel, True) | |||
| g_best = [] | |||
| dis_best = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('alpha =', alpha) | |||
| dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let, | |||
| Gn_let, [alpha] * len(Gn_let), range(len(Gn_let), len(Gn_mix)), | |||
| km, k, r_max, gkernel, c_ei=1.7, c_er=1.7, c_es=1.7, | |||
| ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter') | |||
| dis_best.append(dhat) | |||
| g_best.append(ghat_list) | |||
| time_list.append(time.time() - time0) | |||
| # show best graphs and save them to file. | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', dis_best[idx]) | |||
| print('the corresponding pre-images are') | |||
| for g in g_best[idx]: | |||
| draw_Letter_graph(g, savepath='results/gk_iam/') | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| print(g.nodes(data=True)) | |||
| print(g.edges(data=True)) | |||
| # compute the corresponding sod in graph space. (alpha range not considered.) | |||
| sod_tmp, _ = ged_median(g_best[0], Gn_let, ged_cost='LETTER', | |||
| ged_method='IPFP', saveGXL='gedlib-letter') | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| sod_ks_min_list.append(sod_ks) | |||
| nb_updated_list.append(nb_updated) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list) | |||
| print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list) | |||
| print('\nnumber of updates for each letter: ', nb_updated_list) | |||
| print('\ntimes:', time_list) | |||
| #def compute_letter_median_by_average(Gn): | |||
| # return g_median | |||
| def test_iam_letter_h(): | |||
| from iam import test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations | |||
| ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||
| 'extra_params': {}} # node nsymb | |||
| # ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', | |||
| # 'extra_params': {}} # node nsymb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
lmbda = 0.03 # termination probability
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| # classify graphs according to letters. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| time_list = [] | |||
| sod_list = [] | |||
| sod_min_list = [] | |||
| for letter in idx_dict: | |||
| Gn_let = [Gn[i].copy() for i in idx_dict[letter]] | |||
| alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1) | |||
| # compute | |||
| g_best = [] | |||
| dis_best = [] | |||
| time0 = time.time() | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('alpha =', alpha) | |||
| ghat_list, dhat = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations( | |||
| Gn_let, Gn_let, c_ei=1.7, c_er=1.7, c_es=1.7, | |||
| ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter') | |||
| dis_best.append(dhat) | |||
| g_best.append(ghat_list) | |||
| time_list.append(time.time() - time0) | |||
| # show best graphs and save them to file. | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', dis_best[idx]) | |||
| print('the corresponding pre-images are') | |||
| for g in g_best[idx]: | |||
| draw_Letter_graph(g, savepath='results/iam/') | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| print(g.nodes(data=True)) | |||
| print(g.edges(data=True)) | |||
| # compute the corresponding sod in kernel space. (alpha range not considered.) | |||
| gkernel = 'structuralspkernel' | |||
| sod_tmp = [] | |||
| Gn_mix = g_best[0] + Gn_let | |||
| km = compute_kernel(Gn_mix, gkernel, True) | |||
| for ig, g in tqdm(enumerate(g_best[0]), desc='computing kernel sod', file=sys.stdout): | |||
| dtemp = dis_gstar(ig, range(len(g_best[0]), len(Gn_mix)), | |||
| [alpha_range[0]] * len(Gn_let), km, withterm3=False) | |||
| sod_tmp.append(dtemp) | |||
| sod_list.append(sod_tmp) | |||
| sod_min_list.append(np.min(sod_tmp)) | |||
| print('\nsods in kernel space: ', sod_list) | |||
| print('\nsmallest sod in kernel space for each letter: ', sod_min_list) | |||
| print('\ntimes:', time_list) | |||
| def test_random_preimage_letter_h(): | |||
| from preimage_random import preimage_random | |||
| ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||
| 'extra_params': {}} # node nsymb | |||
| # ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', | |||
| # 'extra_params': {}} # node nsymb | |||
| # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| # 'extra_params': {}} # node/edge symb | |||
| # ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds', | |||
| # 'extra_params': {}} | |||
| # ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| # 'extra_params': {}} # node symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| gkernel = 'structuralspkernel' | |||
# lmbda = 0.03 # termination probability
| r_max = 3 # 10 # recursions | |||
| l = 500 | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| #alpha_range = np.linspace(0.1, 0.9, 9) | |||
| k = 10 # 5 # k nearest neighbors | |||
| # classify graphs according to letters. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| time_list = [] | |||
| sod_list = [] | |||
| sod_min_list = [] | |||
| for letter in idx_dict: | |||
| print('\n-------------------------------------------------------\n') | |||
| Gn_let = [Gn[i].copy() for i in idx_dict[letter]] | |||
| Gn_mix = Gn_let + [g.copy() for g in Gn_let] | |||
| alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1) | |||
| # compute | |||
| time0 = time.time() | |||
| km = compute_kernel(Gn_mix, gkernel, True) | |||
| g_best = [] | |||
| dis_best = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('alpha =', alpha) | |||
| dhat, ghat_list = preimage_random(Gn_let, Gn_let, [alpha] * len(Gn_let), | |||
| range(len(Gn_let), len(Gn_mix)), km, | |||
| k, r_max, gkernel, c_ei=1.7, | |||
| c_er=1.7, c_es=1.7) | |||
| dis_best.append(dhat) | |||
| g_best.append(ghat_list) | |||
| time_list.append(time.time() - time0) | |||
| # show best graphs and save them to file. | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', dis_best[idx]) | |||
| print('the corresponding pre-images are') | |||
| for g in g_best[idx]: | |||
| draw_Letter_graph(g, savepath='results/gk_iam/') | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| print(g.nodes(data=True)) | |||
| print(g.edges(data=True)) | |||
| # compute the corresponding sod in graph space. (alpha range not considered.) | |||
| sod_tmp, _ = ged_median(g_best[0], Gn_let) | |||
| sod_list.append(sod_tmp) | |||
| sod_min_list.append(np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_list) | |||
| print('\nsmallest sod in graph space for each letter: ', sod_min_list) | |||
| print('\ntimes:', time_list) | |||
def test_gkiam_mutag():
# note: despite the name, this test currently loads the Letter-high dataset.
| from gk_iam import gk_iam_nearest_multi | |||
| ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||
| 'extra_params': {}} # node nsymb | |||
| # ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', | |||
| # 'extra_params': {}} # node nsymb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| gkernel = 'structuralspkernel' | |||
lmbda = 0.03 # termination probability
| r_max = 3 # recursions | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 20 # k nearest neighbors | |||
| # classify graphs according to letters. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| time_list = [] | |||
| sod_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list = [] | |||
| for letter in idx_dict: | |||
| print('\n-------------------------------------------------------\n') | |||
| Gn_let = [Gn[i].copy() for i in idx_dict[letter]] | |||
| Gn_mix = Gn_let + [g.copy() for g in Gn_let] | |||
| alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1) | |||
| # compute | |||
| time0 = time.time() | |||
| km = compute_kernel(Gn_mix, gkernel, True) | |||
| g_best = [] | |||
| dis_best = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('alpha =', alpha) | |||
| dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let, Gn_let, [alpha] * len(Gn_let), | |||
| range(len(Gn_let), len(Gn_mix)), km, | |||
| k, r_max, gkernel, c_ei=1.7, | |||
| c_er=1.7, c_es=1.7) | |||
| dis_best.append(dhat) | |||
| g_best.append(ghat_list) | |||
| time_list.append(time.time() - time0) | |||
| # show best graphs and save them to file. | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', dis_best[idx]) | |||
| print('the corresponding pre-images are') | |||
| for g in g_best[idx]: | |||
| draw_Letter_graph(g, savepath='results/gk_iam/') | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| print(g.nodes(data=True)) | |||
| print(g.edges(data=True)) | |||
| # compute the corresponding sod in graph space. (alpha range not considered.) | |||
| sod_tmp, _ = ged_median(g_best[0], Gn_let) | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| sod_ks_min_list.append(sod_ks) | |||
| nb_updated_list.append(nb_updated) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list) | |||
| print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list) | |||
| print('\nnumber of updates for each letter: ', nb_updated_list) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| # Re-test. | |||
| def retest_the_simple_two(): | |||
| from gk_iam import gk_iam_nearest_multi | |||
| # The two simple graphs. | |||
| # g1 = nx.Graph(name='haha') | |||
| # g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})]) | |||
| # g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})]) | |||
| # g2 = nx.Graph(name='hahaha') | |||
| # g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}), | |||
| # (3, {'atom': 'O'}), (4, {'atom': 'C'})]) | |||
| # g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), | |||
| # (2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})]) | |||
| g1 = nx.Graph(name='haha') | |||
| g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}), | |||
| (3, {'atom': 'S'}), (4, {'atom': 'S'})]) | |||
| g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), | |||
| (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})]) | |||
| g2 = nx.Graph(name='hahaha') | |||
| g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}), | |||
| (3, {'atom': 'O'}), (4, {'atom': 'O'})]) | |||
| g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), | |||
| (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})]) | |||
| # # randomly select two molecules | |||
| # np.random.seed(1) | |||
| # idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2) | |||
| # g1 = Gn[idx_gi[0]] | |||
| # g2 = Gn[idx_gi[1]] | |||
| # Gn_mix = [g.copy() for g in Gn] | |||
| # Gn_mix.append(g1.copy()) | |||
| # Gn_mix.append(g2.copy()) | |||
| Gn = [g1.copy(), g2.copy()] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
lmbda = 0.03 # termination probability
| r_max = 10 # recursions | |||
| # l = 500 | |||
| alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 2 # k nearest neighbors | |||
| epsilon = 1e-6 | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| c_ei=1 | |||
| c_er=1 | |||
| c_es=1 | |||
| Gn_mix = Gn + [g1.copy(), g2.copy()] | |||
| # compute | |||
| time0 = time.time() | |||
| km = compute_kernel(Gn_mix, gkernel, True) | |||
| time_km = time.time() - time0 | |||
| time_list = [] | |||
| sod_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list = [] | |||
| g_best = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('\n-------------------------------------------------------\n') | |||
| print('alpha =', alpha) | |||
| time0 = time.time() | |||
| dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2], | |||
| [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max, | |||
| gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon, | |||
| ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL) | |||
| time_total = time.time() - time0 + time_km | |||
| print('time: ', time_total) | |||
| time_list.append(time_total) | |||
| sod_ks_min_list.append(dhat) | |||
| g_best.append(ghat_list) | |||
| nb_updated_list.append(nb_updated) | |||
| # show best graphs and save them to file. | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', sod_ks_min_list[idx]) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'), | |||
| with_labels=True) | |||
| plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG") | |||
| plt.show() | |||
| print(g_best[idx][0].nodes(data=True)) | |||
| print(g_best[idx][0].edges(data=True)) | |||
| # for g in g_best[idx]: | |||
| # draw_Letter_graph(g, savepath='results/gk_iam/') | |||
| ## nx.draw_networkx(g) | |||
| ## plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| for idx, item in enumerate(alpha_range): | |||
| sod_tmp, _ = ged_median(g_best[0], [g1, g2], ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list) | |||
| print('\nsmallest sod in kernel space for each alpha: ', sod_ks_min_list) | |||
| print('\nnumber of updates for each alpha: ', nb_updated_list) | |||
| print('\ntimes:', time_list) | |||
| if __name__ == '__main__': | |||
| # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| # 'extra_params': {}} # node/edge symb | |||
| # ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||
| # 'extra_params': {}} # node nsymb | |||
| # ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds', | |||
| # 'extra_params': {}} | |||
| # ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| # 'extra_params': {}} # node symb | |||
| # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:20] | |||
| # import networkx.algorithms.isomorphism as iso | |||
| # G1 = nx.MultiDiGraph() | |||
| # G2 = nx.MultiDiGraph() | |||
| # G1.add_nodes_from([1,2,3], fill='red') | |||
| # G2.add_nodes_from([10,20,30,40], fill='red') | |||
| # nx.add_path(G1, [1,2,3,4], weight=3, linewidth=2.5) | |||
| # nx.add_path(G2, [10,20,30,40], weight=3) | |||
| # nm = iso.categorical_node_match('fill', 'red') | |||
| # print(nx.is_isomorphic(G1, G2, node_match=nm)) | |||
| # | |||
| # test_new_IAM_allGraph_deleteNodes(Gn) | |||
| # test_will_IAM_give_the_median_graph_we_wanted(Gn) | |||
| # test_who_is_the_closest_in_GED_space(Gn) | |||
| # test_who_is_the_closest_in_kernel_space(Gn) | |||
| # test_the_simple_two(Gn, 'untilhpathkernel') | |||
| # test_remove_bests(Gn, 'untilhpathkernel') | |||
| # test_gkiam_letter_h() | |||
| # test_iam_letter_h() | |||
# test_random_preimage_letter_h()
| ############################################################################### | |||
| # retests. | |||
| retest_the_simple_two() | |||
| @@ -1,620 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Sep 5 15:59:00 2019 | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| import time | |||
| import random | |||
| #from tqdm import tqdm | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| from gklearn.preimage.utils import remove_edges, compute_kernel, get_same_item_indices | |||
| from gklearn.preimage.ged import ged_median | |||
| from gklearn.preimage.preimage_iam import preimage_iam | |||
| ############################################################################### | |||
| # tests on different values on grid of median-sets and k. | |||
| def test_preimage_iam_grid_k_median_nb(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
lmbda = 0.03 # termination probability
| r_max = 5 # iteration limit for pre-image. | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| # k = 5 # k nearest neighbors | |||
| epsilon = 1e-6 | |||
| InitIAMWithAllDk = True | |||
| # parameters for GED function | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| # parameters for IAM function | |||
| c_ei=1 | |||
| c_er=1 | |||
| c_es=1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = True | |||
| connected_iam = False | |||
# number of graphs; we want to compute the median of these graphs.
| nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| # number of nearest neighbors. | |||
| k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100] | |||
| # find out all the graphs classified to positive group 1. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| Gn = [Gn[i] for i in idx_dict[1]] | |||
| # # compute Gram matrix. | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # # write Gram matrix to file. | |||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list = [] | |||
| nb_updated_k_list = [] | |||
| g_best = [] | |||
| for idx_nb, nb_median in enumerate(nb_median_range): | |||
| print('\n-------------------------------------------------------') | |||
| print('number of median graphs =', nb_median) | |||
| random.seed(1) | |||
| idx_rdm = random.sample(range(len(Gn)), nb_median) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn[idx].copy() for idx in idx_rdm] | |||
| # for g in Gn_median: | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| ## plt.savefig("results/preimage_mix/mutag.png", format="PNG") | |||
| # plt.show() | |||
| # plt.clf() | |||
| ################################################################### | |||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| km_tmp = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| # modify mixed gram matrix. | |||
| km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| km[i, j] = km_tmp[i, j] | |||
| km[j, i] = km[i, j] | |||
| for i in range(len(Gn)): | |||
| for j, idx in enumerate(idx_rdm): | |||
| km[i, len(Gn) + j] = km[i, idx] | |||
| km[len(Gn) + j, i] = km[i, idx] | |||
| for i, idx1 in enumerate(idx_rdm): | |||
| for j, idx2 in enumerate(idx_rdm): | |||
| km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
| ################################################################### | |||
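| # The Gram-matrix surgery above recurs in every test in this file; a minimal | |||
| # reusable sketch of the same idea (a hypothetical helper, not part of the | |||
| # original module, assuming km_tmp is already symmetric) could look like: | |||
| # | |||
| # def build_mixed_gram(km_base, idx_rdm): | |||
| #     '''Extend a precomputed Gram matrix with rows/columns for the median | |||
| #     graphs, copied from their original indices, so the kernel need not | |||
| #     be recomputed.''' | |||
| #     n, m = km_base.shape[0], len(idx_rdm) | |||
| #     km = np.zeros((n + m, n + m)) | |||
| #     km[:n, :n] = km_base | |||
| #     km[:n, n:] = km_base[:, idx_rdm] | |||
| #     km[n:, :n] = km_base[idx_rdm, :] | |||
| #     km[n:, n:] = km_base[np.ix_(idx_rdm, idx_rdm)] | |||
| #     return km | |||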
| alpha_range = [1 / nb_median] * nb_median | |||
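| # uniform weights, so the pre-image targets the unweighted median of the chosen graphs. | |||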
| time_list.append([]) | |||
| dis_ks_min_list.append([]) | |||
| sod_gs_list.append([]) | |||
| sod_gs_min_list.append([]) | |||
| nb_updated_list.append([]) | |||
| nb_updated_k_list.append([]) | |||
| g_best.append([]) | |||
| for k in k_range: | |||
| print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') | |||
| print('k =', k) | |||
| time0 = time.time() | |||
| dhat, ghat_list, dis_of_each_itr, nb_updated, nb_updated_k = \ | |||
| preimage_iam(Gn, Gn_median, | |||
| alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max, | |||
| gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, | |||
| params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, | |||
| 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, | |||
| 'removeNodes': removeNodes, 'connected': connected_iam}, | |||
| params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, | |||
| 'saveGXL': saveGXL}) | |||
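| # The quantity minimized in kernel space is presumably the RKHS distance | |||
| # d(g)^2 = k(g, g) - 2 * sum_i alpha_i * k(g, g_i) | |||
| #          + sum_{i,j} alpha_i * alpha_j * k(g_i, g_j), | |||
| # which is why km must hold entries for candidates and median graphs alike. | |||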
| time_total = time.time() - time0 + time_km | |||
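| # add the Gram-matrix time loaded from file, so totals stay comparable | |||
| # to runs that compute the kernel in-line. | |||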
| print('time: ', time_total) | |||
| time_list[idx_nb].append(time_total) | |||
| print('\nsmallest distance in kernel space: ', dhat) | |||
| dis_ks_min_list[idx_nb].append(dhat) | |||
| g_best[idx_nb].append(ghat_list) | |||
| print('\nnumber of updates of the best graph by IAM: ', nb_updated) | |||
| nb_updated_list[idx_nb].append(nb_updated) | |||
| print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k) | |||
| nb_updated_k_list[idx_nb].append(nb_updated_k) | |||
| # show the best graph and save it to file. | |||
| print('the shortest distance is', dhat) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), | |||
| with_labels=True) | |||
| plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) + | |||
| '_k' + str(k) + '.png', format="PNG") | |||
| # plt.show() | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_gs_list[idx_nb].append(sod_tmp) | |||
| sod_gs_min_list[idx_nb].append(np.min(sod_tmp)) | |||
| print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each set of median graphs and k: ', | |||
| sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each set of median graphs and k: ', | |||
| dis_ks_min_list) | |||
| print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ', | |||
| nb_updated_list) | |||
| print('\nnumber of updates of k nearest graphs for each set of median graphs and k by IAM: ', | |||
| nb_updated_k_list) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| # tests on different numbers of median-sets. | |||
| def test_preimage_iam_median_nb(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| lmbda = 0.03 # termination probability | |||
| r_max = 3 # iteration limit for pre-image. | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 5 # k nearest neighbors | |||
| epsilon = 1e-6 | |||
| InitIAMWithAllDk = True | |||
| # parameters for IAM function | |||
| # c_vi = 0.037 | |||
| # c_vr = 0.038 | |||
| # c_vs = 0.075 | |||
| # c_ei = 0.001 | |||
| # c_er = 0.001 | |||
| # c_es = 0.0 | |||
| c_vi = 4 | |||
| c_vr = 4 | |||
| c_vs = 2 | |||
| c_ei = 1 | |||
| c_er = 1 | |||
| c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = True | |||
| connected_iam = False | |||
| # parameters for GED function | |||
| # ged_cost='CHEM_1' | |||
| ged_cost = 'CONSTANT' | |||
| ged_method = 'IPFP' | |||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
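| # judging by the variable names, the 'CONSTANT' cost model expects the order: | |||
| # node insertion, node removal, node substitution, edge insertion, | |||
| # edge removal, edge substitution. | |||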
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
| # number of graphs; we want to compute the median of these graphs. | |||
| # nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| nb_median_range = [2] | |||
| # find all the graphs classified into positive group 1. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| Gn = [Gn[i] for i in idx_dict[1]] | |||
| # # compute Gram matrix. | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # # write Gram matrix to file. | |||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list = [] | |||
| nb_updated_k_list = [] | |||
| g_best = [] | |||
| for nb_median in nb_median_range: | |||
| print('\n-------------------------------------------------------') | |||
| print('number of median graphs =', nb_median) | |||
| random.seed(1) | |||
| idx_rdm = random.sample(range(len(Gn)), nb_median) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn[idx].copy() for idx in idx_rdm] | |||
| # for g in Gn_median: | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| ## plt.savefig("results/preimage_mix/mutag.png", format="PNG") | |||
| # plt.show() | |||
| # plt.clf() | |||
| ################################################################### | |||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| km_tmp = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| # modify mixed gram matrix. | |||
| km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| km[i, j] = km_tmp[i, j] | |||
| km[j, i] = km[i, j] | |||
| for i in range(len(Gn)): | |||
| for j, idx in enumerate(idx_rdm): | |||
| km[i, len(Gn) + j] = km[i, idx] | |||
| km[len(Gn) + j, i] = km[i, idx] | |||
| for i, idx1 in enumerate(idx_rdm): | |||
| for j, idx2 in enumerate(idx_rdm): | |||
| km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
| ################################################################### | |||
| alpha_range = [1 / nb_median] * nb_median | |||
| time0 = time.time() | |||
| dhat, ghat_list, dis_of_each_itr, nb_updated, nb_updated_k = \ | |||
| preimage_iam(Gn, Gn_median, | |||
| alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max, | |||
| gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, | |||
| params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, | |||
| 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, | |||
| 'removeNodes': removeNodes, 'connected': connected_iam}, | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 + time_km | |||
| print('\ntime: ', time_total) | |||
| time_list.append(time_total) | |||
| print('\nsmallest distance in kernel space: ', dhat) | |||
| dis_ks_min_list.append(dhat) | |||
| g_best.append(ghat_list) | |||
| print('\nnumber of updates of the best graph: ', nb_updated) | |||
| nb_updated_list.append(nb_updated) | |||
| print('\nnumber of updates of k nearest graphs: ', nb_updated_k) | |||
| nb_updated_k_list.append(nb_updated_k) | |||
| # show the best graph and save it to file. | |||
| print('the shortest distance is', dhat) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), | |||
| with_labels=True) | |||
| plt.show() | |||
| # plt.savefig('results/preimage_iam/mutag_median_cs.001_nb' + str(nb_median) + | |||
| # '.png', format="PNG") | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, params_ged=params_ged) | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each set of median graphs: ', | |||
| dis_ks_min_list) | |||
| print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', | |||
| nb_updated_list) | |||
| print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', | |||
| nb_updated_k_list) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| # test on the combination of two randomly chosen graphs (the same setting | |||
| # as in the random pre-image paper). | |||
| def test_gkiam_2combination_all_pairs(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| lmbda = 0.03 # termination probability | |||
| r_max = 10 # iteration limit for pre-image. | |||
| alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 5 # k nearest neighbors | |||
| epsilon = 1e-6 | |||
| InitIAMWithAllDk = False | |||
| # parameters for GED function | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| # parameters for IAM function | |||
| c_ei=1 | |||
| c_er=1 | |||
| c_es=1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = True | |||
| connected_iam = False | |||
| nb_update_mat = np.full((len(Gn), len(Gn)), np.inf) | |||
| # test on each pair of graphs. | |||
| # for idx1 in range(len(Gn) - 1, -1, -1): | |||
| # for idx2 in range(idx1, -1, -1): | |||
| for idx1 in range(187, 188): | |||
| for idx2 in range(167, 168): | |||
| g1 = Gn[idx1].copy() | |||
| g2 = Gn[idx2].copy() | |||
| # Gn[10] = [] | |||
| # Gn[10] = [] | |||
| nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True) | |||
| plt.savefig("results/gk_iam/all_pairs/mutag187.png", format="PNG") | |||
| plt.show() | |||
| plt.clf() | |||
| nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True) | |||
| plt.savefig("results/gk_iam/all_pairs/mutag167.png", format="PNG") | |||
| plt.show() | |||
| plt.clf() | |||
| ################################################################### | |||
| # Gn_mix = [g.copy() for g in Gn] | |||
| # Gn_mix.append(g1.copy()) | |||
| # Gn_mix.append(g2.copy()) | |||
| # | |||
| # # compute | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn_mix, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # | |||
| # # write Gram matrix to file and read it. | |||
| # np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km) | |||
| ################################################################### | |||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz') | |||
| km = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| # modify mixed gram matrix. | |||
| for i in range(len(Gn)): | |||
| km[i, len(Gn)] = km[i, idx1] | |||
| km[i, len(Gn) + 1] = km[i, idx2] | |||
| km[len(Gn), i] = km[i, idx1] | |||
| km[len(Gn) + 1, i] = km[i, idx2] | |||
| km[len(Gn), len(Gn)] = km[idx1, idx1] | |||
| km[len(Gn), len(Gn) + 1] = km[idx1, idx2] | |||
| km[len(Gn) + 1, len(Gn)] = km[idx2, idx1] | |||
| km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2] | |||
| ################################################################### | |||
| # # use only the two graphs in median set as candidates. | |||
| # Gn = [g1.copy(), g2.copy()] | |||
| # Gn_mix = Gn + [g1.copy(), g2.copy()] | |||
| # # compute | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn_mix, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list = [] | |||
| nb_updated_k_list = [] | |||
| g_best = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('\n-------------------------------------------------------\n') | |||
| print('alpha =', alpha) | |||
| time0 = time.time() | |||
| dhat, ghat_list, sod_ks, nb_updated, nb_updated_k = \ | |||
| preimage_iam(Gn, [g1, g2], | |||
| [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max, | |||
| gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, | |||
| params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, | |||
| 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, | |||
| 'removeNodes': removeNodes, 'connected': connected_iam}, | |||
| params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, | |||
| 'saveGXL': saveGXL}) | |||
| time_total = time.time() - time0 + time_km | |||
| print('time: ', time_total) | |||
| time_list.append(time_total) | |||
| dis_ks_min_list.append(dhat) | |||
| g_best.append(ghat_list) | |||
| nb_updated_list.append(nb_updated) | |||
| nb_updated_k_list.append(nb_updated_k) | |||
| # show best graphs and save them to file. | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx]) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'), | |||
| with_labels=True) | |||
| plt.savefig('results/gk_iam/mutag' + str(idx1) + '_' + str(idx2) | |||
| + '_alpha' + str(item) + '.png', format="PNG") | |||
| # plt.show() | |||
| plt.clf() | |||
| # print(g_best[idx][0].nodes(data=True)) | |||
| # print(g_best[idx][0].edges(data=True)) | |||
| # for g in g_best[idx]: | |||
| # draw_Letter_graph(g, savepath='results/gk_iam/') | |||
| ## nx.draw_networkx(g) | |||
| ## plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| for idx, item in enumerate(alpha_range): | |||
| sod_tmp, _ = ged_median([g_best[idx][0]], [g1, g2], ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list) | |||
| print('\nnumber of updates of the best graph for each alpha: ', | |||
| nb_updated_list) | |||
| print('\nnumber of updates of the k nearest graphs for each alpha: ', | |||
| nb_updated_k_list) | |||
| print('\ntimes:', time_list) | |||
| nb_update_mat[idx1, idx2] = nb_updated_list[0] | |||
| str_fw = 'graphs %d and %d: %d.\n' % (idx1, idx2, nb_updated_list[0]) | |||
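| # prepend the result line to the log; note that mode 'r+' assumes the file already exists. | |||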
| with open('results/gk_iam/all_pairs/nb_updates.txt', 'r+') as file: | |||
| content = file.read() | |||
| file.seek(0, 0) | |||
| file.write(str_fw + content) | |||
| def test_gkiam_2combination(): | |||
| from gk_iam import gk_iam_nearest_multi | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| lmbda = 0.03 # termination probability | |||
| r_max = 10 # iteration limit for pre-image. | |||
| alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 20 # k nearest neighbors | |||
| epsilon = 1e-6 | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| c_ei=1 | |||
| c_er=1 | |||
| c_es=1 | |||
| # randomly select two molecules | |||
| np.random.seed(1) | |||
| idx_gi = [10, 11] # np.random.randint(0, len(Gn), 2) | |||
| g1 = Gn[idx_gi[0]].copy() | |||
| g2 = Gn[idx_gi[1]].copy() | |||
| # Gn[10] = [] | |||
| # Gn[10] = [] | |||
| # nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True) | |||
| # plt.savefig("results/random_preimage/mutag10.png", format="PNG") | |||
| # plt.show() | |||
| # nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True) | |||
| # plt.savefig("results/random_preimage/mutag11.png", format="PNG") | |||
| # plt.show() | |||
| Gn_mix = [g.copy() for g in Gn] | |||
| Gn_mix.append(g1.copy()) | |||
| Gn_mix.append(g2.copy()) | |||
| # compute | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn_mix, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # write Gram matrix to file and read it. | |||
| # np.savez('results/gram_matrix.gm', gm=km, gmtime=time_km) | |||
| gmfile = np.load('results/gram_matrix.gm.npz') | |||
| km = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list = [] | |||
| g_best = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('\n-------------------------------------------------------\n') | |||
| print('alpha =', alpha) | |||
| time0 = time.time() | |||
| dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2], | |||
| [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max, | |||
| gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon, | |||
| ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL) | |||
| time_total = time.time() - time0 + time_km | |||
| print('time: ', time_total) | |||
| time_list.append(time_total) | |||
| dis_ks_min_list.append(dhat) | |||
| g_best.append(ghat_list) | |||
| nb_updated_list.append(nb_updated) | |||
| # show best graphs and save them to file. | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx]) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'), | |||
| with_labels=True) | |||
| plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG") | |||
| plt.show() | |||
| print(g_best[idx][0].nodes(data=True)) | |||
| print(g_best[idx][0].edges(data=True)) | |||
| # for g in g_best[idx]: | |||
| # draw_Letter_graph(g, savepath='results/gk_iam/') | |||
| ## nx.draw_networkx(g) | |||
| ## plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| for idx, item in enumerate(alpha_range): | |||
| sod_tmp, _ = ged_median([g_best[idx][0]], [g1, g2], ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list) | |||
| print('\nnumber of updates for each alpha: ', nb_updated_list) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| if __name__ == '__main__': | |||
| ############################################################################### | |||
| # test on the combination of two randomly chosen graphs (the same setting | |||
| # as in the random pre-image paper). | |||
| # test_gkiam_2combination() | |||
| # test_gkiam_2combination_all_pairs() | |||
| ############################################################################### | |||
| # tests on different numbers of median-sets. | |||
| test_preimage_iam_median_nb() | |||
| ############################################################################### | |||
| # tests on a grid of median-set sizes and values of k. | |||
| # test_preimage_iam_grid_k_median_nb() | |||
| @@ -1,539 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Sep 5 15:59:00 2019 | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| import time | |||
| import random | |||
| #from tqdm import tqdm | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| from gklearn.preimage.ged import ged_median | |||
| from gklearn.preimage.utils import compute_kernel, get_same_item_indices, remove_edges | |||
| from gklearn.preimage.preimage_iam import preimage_iam_random_mix | |||
| ############################################################################### | |||
| # tests on a grid of median-set sizes and values of k. | |||
| def test_preimage_mix_grid_k_median_nb(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| lmbda = 0.03 # termination probability | |||
| r_max = 5 # iteration limit for pre-image. | |||
| l_max = 500 # update limit for random generation | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| # k = 5 # k nearest neighbors | |||
| epsilon = 1e-6 | |||
| InitIAMWithAllDk = True | |||
| InitRandomWithAllDk = True | |||
| # parameters for GED function | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| # parameters for IAM function | |||
| c_ei=1 | |||
| c_er=1 | |||
| c_es=1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = True | |||
| connected_iam = False | |||
| # number of graphs; we want to compute the median of these graphs. | |||
| nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| # number of nearest neighbors. | |||
| k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100] | |||
| # find all the graphs classified into positive group 1. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| Gn = [Gn[i] for i in idx_dict[1]] | |||
| # # compute Gram matrix. | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # # write Gram matrix to file. | |||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list_iam = [] | |||
| nb_updated_list_random = [] | |||
| nb_updated_k_list_iam = [] | |||
| nb_updated_k_list_random = [] | |||
| g_best = [] | |||
| for idx_nb, nb_median in enumerate(nb_median_range): | |||
| print('\n-------------------------------------------------------') | |||
| print('number of median graphs =', nb_median) | |||
| random.seed(1) | |||
| idx_rdm = random.sample(range(len(Gn)), nb_median) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn[idx].copy() for idx in idx_rdm] | |||
| # for g in Gn_median: | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| ## plt.savefig("results/preimage_mix/mutag.png", format="PNG") | |||
| # plt.show() | |||
| # plt.clf() | |||
| ################################################################### | |||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| km_tmp = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| # modify mixed gram matrix. | |||
| km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| km[i, j] = km_tmp[i, j] | |||
| km[j, i] = km[i, j] | |||
| for i in range(len(Gn)): | |||
| for j, idx in enumerate(idx_rdm): | |||
| km[i, len(Gn) + j] = km[i, idx] | |||
| km[len(Gn) + j, i] = km[i, idx] | |||
| for i, idx1 in enumerate(idx_rdm): | |||
| for j, idx2 in enumerate(idx_rdm): | |||
| km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
| ################################################################### | |||
| alpha_range = [1 / nb_median] * nb_median | |||
| time_list.append([]) | |||
| dis_ks_min_list.append([]) | |||
| sod_gs_list.append([]) | |||
| sod_gs_min_list.append([]) | |||
| nb_updated_list_iam.append([]) | |||
| nb_updated_list_random.append([]) | |||
| nb_updated_k_list_iam.append([]) | |||
| nb_updated_k_list_random.append([]) | |||
| g_best.append([]) | |||
| for k in k_range: | |||
| print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') | |||
| print('k =', k) | |||
| time0 = time.time() | |||
| dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \ | |||
| nb_updated_k_iam, nb_updated_k_random = \ | |||
| preimage_iam_random_mix(Gn, Gn_median, | |||
| alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max, | |||
| l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, | |||
| InitRandomWithAllDk=InitRandomWithAllDk, | |||
| params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, | |||
| 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, | |||
| 'removeNodes': removeNodes, 'connected': connected_iam}, | |||
| params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, | |||
| 'saveGXL': saveGXL}) | |||
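| # judging by the returned counters, the mixed strategy interleaves IAM | |||
| # refinement with random candidate generation and keeps whichever update | |||
| # improves the kernel-space distance. | |||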
| time_total = time.time() - time0 + time_km | |||
| print('time: ', time_total) | |||
| time_list[idx_nb].append(time_total) | |||
| print('\nsmallest distance in kernel space: ', dhat) | |||
| dis_ks_min_list[idx_nb].append(dhat) | |||
| g_best[idx_nb].append(ghat_list) | |||
| print('\nnumber of updates of the best graph by IAM: ', nb_updated_iam) | |||
| nb_updated_list_iam[idx_nb].append(nb_updated_iam) | |||
| print('\nnumber of updates of the best graph by random generation: ', | |||
| nb_updated_random) | |||
| nb_updated_list_random[idx_nb].append(nb_updated_random) | |||
| print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k_iam) | |||
| nb_updated_k_list_iam[idx_nb].append(nb_updated_k_iam) | |||
| print('\nnumber of updates of k nearest graphs by random generation: ', | |||
| nb_updated_k_random) | |||
| nb_updated_k_list_random[idx_nb].append(nb_updated_k_random) | |||
| # show the best graph and save it to file. | |||
| print('the shortest distance is', dhat) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), | |||
| with_labels=True) | |||
| plt.savefig('results/preimage_mix/mutag_median_nb' + str(nb_median) + | |||
| '_k' + str(k) + '.png', format="PNG") | |||
| # plt.show() | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_gs_list[idx_nb].append(sod_tmp) | |||
| sod_gs_min_list[idx_nb].append(np.min(sod_tmp)) | |||
| print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each set of median graphs and k: ', | |||
| sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each set of median graphs and k: ', | |||
| dis_ks_min_list) | |||
| print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ', | |||
| nb_updated_list_iam) | |||
| print('\nnumber of updates of the best graph for each set of median graphs and k by random generation: ', | |||
| nb_updated_list_random) | |||
| print('\nnumber of updates of k nearest graphs for each set of median graphs and k by IAM: ', | |||
| nb_updated_k_list_iam) | |||
| print('\nnumber of updates of k nearest graphs for each set of median graphs and k by random generation: ', | |||
| nb_updated_k_list_random) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| # tests on different numbers of median-sets. | |||
| def test_preimage_mix_median_nb(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| lmbda = 0.03 # termination probability | |||
| r_max = 5 # iteration limit for pre-image. | |||
| l_max = 500 # update limit for random generation | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 5 # k nearest neighbors | |||
| epsilon = 1e-6 | |||
| InitIAMWithAllDk = True | |||
| InitRandomWithAllDk = True | |||
| # parameters for GED function | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| # parameters for IAM function | |||
| c_ei=1 | |||
| c_er=1 | |||
| c_es=1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = True | |||
| connected_iam = False | |||
| # number of graphs; we want to compute the median of these graphs. | |||
| nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| # find all the graphs classified into positive group 1. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| Gn = [Gn[i] for i in idx_dict[1]] | |||
| # # compute Gram matrix. | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # # write Gram matrix to file. | |||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list_iam = [] | |||
| nb_updated_list_random = [] | |||
| nb_updated_k_list_iam = [] | |||
| nb_updated_k_list_random = [] | |||
| g_best = [] | |||
| for nb_median in nb_median_range: | |||
| print('\n-------------------------------------------------------') | |||
| print('number of median graphs =', nb_median) | |||
| random.seed(1) | |||
| idx_rdm = random.sample(range(len(Gn)), nb_median) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn[idx].copy() for idx in idx_rdm] | |||
| # for g in Gn_median: | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| ## plt.savefig("results/preimage_mix/mutag.png", format="PNG") | |||
| # plt.show() | |||
| # plt.clf() | |||
| ################################################################### | |||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| km_tmp = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| # modify mixed gram matrix. | |||
| km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| km[i, j] = km_tmp[i, j] | |||
| km[j, i] = km[i, j] | |||
| for i in range(len(Gn)): | |||
| for j, idx in enumerate(idx_rdm): | |||
| km[i, len(Gn) + j] = km[i, idx] | |||
| km[len(Gn) + j, i] = km[i, idx] | |||
| for i, idx1 in enumerate(idx_rdm): | |||
| for j, idx2 in enumerate(idx_rdm): | |||
| km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
| ################################################################### | |||
| alpha_range = [1 / nb_median] * nb_median | |||
| time0 = time.time() | |||
| dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \ | |||
| nb_updated_k_iam, nb_updated_k_random = \ | |||
| preimage_iam_random_mix(Gn, Gn_median, | |||
| alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max, | |||
| l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, | |||
| InitRandomWithAllDk=InitRandomWithAllDk, | |||
| params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, | |||
| 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, | |||
| 'removeNodes': removeNodes, 'connected': connected_iam}, | |||
| params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, | |||
| 'saveGXL': saveGXL}) | |||
| time_total = time.time() - time0 + time_km | |||
| print('time: ', time_total) | |||
| time_list.append(time_total) | |||
| print('\nsmallest distance in kernel space: ', dhat) | |||
| dis_ks_min_list.append(dhat) | |||
| g_best.append(ghat_list) | |||
| print('\nnumber of updates of the best graph by IAM: ', nb_updated_iam) | |||
| nb_updated_list_iam.append(nb_updated_iam) | |||
| print('\nnumber of updates of the best graph by random generation: ', | |||
| nb_updated_random) | |||
| nb_updated_list_random.append(nb_updated_random) | |||
| print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k_iam) | |||
| nb_updated_k_list_iam.append(nb_updated_k_iam) | |||
| print('\nnumber of updates of k nearest graphs by random generation: ', | |||
| nb_updated_k_random) | |||
| nb_updated_k_list_random.append(nb_updated_k_random) | |||
| # show the best graph and save it to file. | |||
| print('the shortest distance is', dhat) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), | |||
| with_labels=True) | |||
| plt.savefig('results/preimage_mix/mutag_median_nb' + str(nb_median) + | |||
| '.png', format="PNG") | |||
| # plt.show() | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each set of median graphs: ', | |||
| dis_ks_min_list) | |||
| print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', | |||
| nb_updated_list_iam) | |||
| print('\nnumber of updates of the best graph for each set of median graphs by random generation: ', | |||
| nb_updated_list_random) | |||
| print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', | |||
| nb_updated_k_list_iam) | |||
| print('\nnumber of updates of k nearest graphs for each set of median graphs by random generation: ', | |||
| nb_updated_k_list_random) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| # test on the combination of two randomly chosen graphs (the same setting | |||
| # as in the random pre-image paper). | |||
| def test_preimage_mix_2combination_all_pairs(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| lmbda = 0.03 # termination probability | |||
| r_max = 10 # iteration limit for pre-image. | |||
| l_max = 500 # update limit for random generation | |||
| alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 5 # k nearest neighbors | |||
| epsilon = 1e-6 | |||
| InitIAMWithAllDk = True | |||
| InitRandomWithAllDk = True | |||
| # parameters for GED function | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| # parameters for IAM function | |||
| c_ei=1 | |||
| c_er=1 | |||
| c_es=1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = True | |||
| connected_iam = False | |||
| nb_update_mat_iam = np.full((len(Gn), len(Gn)), np.inf) | |||
| nb_update_mat_random = np.full((len(Gn), len(Gn)), np.inf) | |||
| # test on each pair of graphs. | |||
| # for idx1 in range(len(Gn) - 1, -1, -1): | |||
| # for idx2 in range(idx1, -1, -1): | |||
| for idx1 in range(187, 188): | |||
| for idx2 in range(167, 168): | |||
| g1 = Gn[idx1].copy() | |||
| g2 = Gn[idx2].copy() | |||
| # Gn[10] = [] | |||
| # Gn[10] = [] | |||
| nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True) | |||
| plt.savefig("results/preimage_mix/mutag187.png", format="PNG") | |||
| plt.show() | |||
| plt.clf() | |||
| nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True) | |||
| plt.savefig("results/preimage_mix/mutag167.png", format="PNG") | |||
| plt.show() | |||
| plt.clf() | |||
| ################################################################### | |||
| # Gn_mix = [g.copy() for g in Gn] | |||
| # Gn_mix.append(g1.copy()) | |||
| # Gn_mix.append(g2.copy()) | |||
| # | |||
| # # compute | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn_mix, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # | |||
| # # write Gram matrix to file and read it. | |||
| # np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km) | |||
| ################################################################### | |||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz') | |||
| km = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| # modify mixed gram matrix. | |||
| for i in range(len(Gn)): | |||
| km[i, len(Gn)] = km[i, idx1] | |||
| km[i, len(Gn) + 1] = km[i, idx2] | |||
| km[len(Gn), i] = km[i, idx1] | |||
| km[len(Gn) + 1, i] = km[i, idx2] | |||
| km[len(Gn), len(Gn)] = km[idx1, idx1] | |||
| km[len(Gn), len(Gn) + 1] = km[idx1, idx2] | |||
| km[len(Gn) + 1, len(Gn)] = km[idx2, idx1] | |||
| km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2] | |||
| ################################################################### | |||
| # # use only the two graphs in median set as candidates. | |||
| # Gn = [g1.copy(), g2.copy()] | |||
| # Gn_mix = Gn + [g1.copy(), g2.copy()] | |||
| # # compute | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn_mix, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list_iam = [] | |||
| nb_updated_list_random = [] | |||
| nb_updated_k_list_iam = [] | |||
| nb_updated_k_list_random = [] | |||
| g_best = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('\n-------------------------------------------------------\n') | |||
| print('alpha =', alpha) | |||
| time0 = time.time() | |||
| dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \ | |||
| nb_updated_k_iam, nb_updated_k_random = \ | |||
| preimage_iam_random_mix(Gn, [g1, g2], | |||
| [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max, | |||
| l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, | |||
| InitRandomWithAllDk=InitRandomWithAllDk, | |||
| params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, | |||
| 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, | |||
| 'removeNodes': removeNodes, 'connected': connected_iam}, | |||
| params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, | |||
| 'saveGXL': saveGXL}) | |||
| time_total = time.time() - time0 + time_km | |||
| print('time: ', time_total) | |||
| time_list.append(time_total) | |||
| dis_ks_min_list.append(dhat) | |||
| g_best.append(ghat_list) | |||
| nb_updated_list_iam.append(nb_updated_iam) | |||
| nb_updated_list_random.append(nb_updated_random) | |||
| nb_updated_k_list_iam.append(nb_updated_k_iam) | |||
| nb_updated_k_list_random.append(nb_updated_k_random) | |||
| # show best graphs and save them to file. | |||
| for idx, item in enumerate(alpha_range): | |||
| print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx]) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'), | |||
| with_labels=True) | |||
| plt.savefig('results/preimage_mix/mutag' + str(idx1) + '_' + str(idx2) | |||
| + '_alpha' + str(item) + '.png', format="PNG") | |||
| # plt.show() | |||
| plt.clf() | |||
| # print(g_best[idx][0].nodes(data=True)) | |||
| # print(g_best[idx][0].edges(data=True)) | |||
| # for g in g_best[idx]: | |||
| # draw_Letter_graph(g, savepath='results/gk_iam/') | |||
| ## nx.draw_networkx(g) | |||
| ## plt.show() | |||
| # print(g.nodes(data=True)) | |||
| # print(g.edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| for idx, item in enumerate(alpha_range): | |||
| sod_tmp, _ = ged_median([g_best[idx][0]], [g1, g2], ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list) | |||
| print('\nnumber of updates of the best graph for each alpha by IAM: ', nb_updated_list_iam) | |||
| print('\nnumber of updates of the best graph for each alpha by random generation: ', | |||
| nb_updated_list_random) | |||
| print('\nnumber of updates of k nearest graphs for each alpha by IAM: ', | |||
| nb_updated_k_list_iam) | |||
| print('\nnumber of updates of k nearest graphs for each alpha by random generation: ', | |||
| nb_updated_k_list_random) | |||
| print('\ntimes:', time_list) | |||
| nb_update_mat_iam[idx1, idx2] = nb_updated_list_iam[0] | |||
| nb_update_mat_random[idx1, idx2] = nb_updated_list_random[0] | |||
| str_fw = 'graphs %d and %d: %d times by IAM, %d times by random generation.\n' \ | |||
| % (idx1, idx2, nb_updated_list_iam[0], nb_updated_list_random[0]) | |||
| with open('results/preimage_mix/nb_updates.txt', 'r+') as file: | |||
| content = file.read() | |||
| file.seek(0, 0) | |||
| file.write(str_fw + content) | |||
| ############################################################################### | |||
| if __name__ == '__main__': | |||
| ############################################################################### | |||
| # test on the combination of two randomly chosen graphs (the same setting | |||
| # as in the random pre-image paper). | |||
| # test_preimage_mix_2combination_all_pairs() | |||
| ############################################################################### | |||
| # tests on different numbers of median-sets. | |||
| # test_preimage_mix_median_nb() | |||
| ############################################################################### | |||
| # tests on a grid of median-set sizes and values of k. | |||
| test_preimage_mix_grid_k_median_nb() | |||
| @@ -1,398 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Sep 5 15:59:00 2019 | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| import time | |||
| import random | |||
| #from tqdm import tqdm | |||
| from gklearn.utils.graphfiles import loadDataset | |||
| from gklearn.preimage.preimage_random import preimage_random | |||
| from gklearn.preimage.ged import ged_median | |||
| from gklearn.preimage.utils import compute_kernel, get_same_item_indices, remove_edges | |||
| ############################################################################### | |||
| # tests on a grid of median-set sizes and values of k. | |||
| def test_preimage_random_grid_k_median_nb(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| lmbda = 0.03 # termination probability | |||
| r_max = 5 # iteration limit for pre-image. | |||
| l = 500 # update limit for random generation | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| # k = 5 # k nearest neighbors | |||
| # parameters for GED function | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| # number of graphs; we want to compute the median of these graphs. | |||
| nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| # number of nearest neighbors. | |||
| k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100] | |||
| # find all the graphs classified into positive group 1. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| Gn = [Gn[i] for i in idx_dict[1]] | |||
| # # compute Gram matrix. | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # # write Gram matrix to file. | |||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list = [] | |||
| g_best = [] | |||
| for idx_nb, nb_median in enumerate(nb_median_range): | |||
| print('\n-------------------------------------------------------') | |||
| print('number of median graphs =', nb_median) | |||
| random.seed(1) | |||
| idx_rdm = random.sample(range(len(Gn)), nb_median) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn[idx].copy() for idx in idx_rdm] | |||
| # for g in Gn_median: | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| ## plt.savefig("results/preimage_mix/mutag.png", format="PNG") | |||
| # plt.show() | |||
| # plt.clf() | |||
| ################################################################### | |||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| km_tmp = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| # modify mixed gram matrix. | |||
| km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| km[i, j] = km_tmp[i, j] | |||
| km[j, i] = km[i, j] | |||
| for i in range(len(Gn)): | |||
| for j, idx in enumerate(idx_rdm): | |||
| km[i, len(Gn) + j] = km[i, idx] | |||
| km[len(Gn) + j, i] = km[i, idx] | |||
| for i, idx1 in enumerate(idx_rdm): | |||
| for j, idx2 in enumerate(idx_rdm): | |||
| km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
| ################################################################### | |||
| alpha_range = [1 / nb_median] * nb_median | |||
| time_list.append([]) | |||
| dis_ks_min_list.append([]) | |||
| sod_gs_list.append([]) | |||
| sod_gs_min_list.append([]) | |||
| nb_updated_list.append([]) | |||
| g_best.append([]) | |||
| for k in k_range: | |||
| print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') | |||
| print('k =', k) | |||
| time0 = time.time() | |||
| dhat, ghat, nb_updated = preimage_random(Gn, Gn_median, alpha_range, | |||
| range(len(Gn), len(Gn) + nb_median), km, k, r_max, l, gkernel) | |||
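| # presumably the random method draws up to l candidate graphs per iteration | |||
| # (l is the 'update limit for random generation' above) and keeps a candidate | |||
| # only when it improves the kernel-space distance. | |||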
| time_total = time.time() - time0 + time_km | |||
| print('time: ', time_total) | |||
| time_list[idx_nb].append(time_total) | |||
| print('\nsmallest distance in kernel space: ', dhat) | |||
| dis_ks_min_list[idx_nb].append(dhat) | |||
| g_best[idx_nb].append(ghat) | |||
| print('\nnumber of updates of the best graph: ', nb_updated) | |||
| nb_updated_list[idx_nb].append(nb_updated) | |||
| # show the best graph and save it to file. | |||
| print('the shortest distance is', dhat) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(ghat, labels=nx.get_node_attributes(ghat, 'atom'), | |||
| with_labels=True) | |||
| plt.savefig('results/preimage_random/mutag_median_nb' + str(nb_median) + | |||
| '_k' + str(k) + '.png', format="PNG") | |||
| # plt.show() | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| sod_tmp, _ = ged_median([ghat], Gn_median, ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_gs_list[idx_nb].append(sod_tmp) | |||
| sod_gs_min_list[idx_nb].append(np.min(sod_tmp)) | |||
| print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each set of median graphs and k: ', | |||
| sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each set of median graphs and k: ', | |||
| dis_ks_min_list) | |||
| print('\nnumber of updates of the best graph for each set of median graphs and k: ', | |||
| nb_updated_list) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| # tests on different numbers of median-sets. | |||
| def test_preimage_random_median_nb(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| lmbda = 0.03 # termination probability | |||
| r_max = 5 # iteration limit for pre-image. | |||
| l = 500 # update limit for random generation | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 5 # k nearest neighbors | |||
| # parameters for GED function | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| # number of graphs; we want to compute the median of these graphs. | |||
| nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| # find all the graphs classified into positive group 1. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| Gn = [Gn[i] for i in idx_dict[1]] | |||
| # # compute Gram matrix. | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # # write Gram matrix to file. | |||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list = [] | |||
| g_best = [] | |||
| for nb_median in nb_median_range: | |||
| print('\n-------------------------------------------------------') | |||
| print('number of median graphs =', nb_median) | |||
| random.seed(1) | |||
| idx_rdm = random.sample(range(len(Gn)), nb_median) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn[idx].copy() for idx in idx_rdm] | |||
| # for g in Gn_median: | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| ## plt.savefig("results/preimage_mix/mutag.png", format="PNG") | |||
| # plt.show() | |||
| # plt.clf() | |||
| ################################################################### | |||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| km_tmp = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| # modify mixed gram matrix. | |||
| km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| km[i, j] = km_tmp[i, j] | |||
| km[j, i] = km[i, j] | |||
| for i in range(len(Gn)): | |||
| for j, idx in enumerate(idx_rdm): | |||
| km[i, len(Gn) + j] = km[i, idx] | |||
| km[len(Gn) + j, i] = km[i, idx] | |||
| for i, idx1 in enumerate(idx_rdm): | |||
| for j, idx2 in enumerate(idx_rdm): | |||
| km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
| ################################################################### | |||
| alpha_range = [1 / nb_median] * nb_median | |||
| time0 = time.time() | |||
| dhat, ghat, nb_updated = preimage_random(Gn, Gn_median, alpha_range, | |||
| range(len(Gn), len(Gn) + nb_median), km, k, r_max, l, gkernel) | |||
| time_total = time.time() - time0 + time_km | |||
| print('time: ', time_total) | |||
| time_list.append(time_total) | |||
| print('\nsmallest distance in kernel space: ', dhat) | |||
| dis_ks_min_list.append(dhat) | |||
| g_best.append(ghat) | |||
| print('\nnumber of updates of the best graph: ', nb_updated) | |||
| nb_updated_list.append(nb_updated) | |||
| # show the best graph and save it to file. | |||
| print('the shortest distance is', dhat) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(ghat, labels=nx.get_node_attributes(ghat, 'atom'), | |||
| with_labels=True) | |||
| plt.savefig('results/preimage_random/mutag_median_nb' + str(nb_median) + | |||
| '.png', format="PNG") | |||
| # plt.show() | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| sod_tmp, _ = ged_median([ghat], Gn_median, ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each set of median graphs: ', | |||
| dis_ks_min_list) | |||
| print('\nnumber of updates of the best graph for each set of median graphs: ', | |||
| nb_updated_list) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| # test on the combination of two randomly chosen graphs (the same setting | |||
| # as in the random pre-image paper). | |||
| def test_random_preimage_2combination(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:12] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, gkernel=gkernel) | |||
| # print(dis_max, dis_min, dis_mean) | |||
| lmbda = 0.03 # termination probability | |||
| r_max = 10 # iteration limit for pre-image. | |||
| l = 500 | |||
| alpha_range = np.linspace(0, 1, 11) | |||
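| # sweep the convex combination alpha * phi(g1) + (1 - alpha) * phi(g2) | |||
| # from 0 to 1 in steps of 0.1. | |||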
| k = 5 # k nearest neighbors | |||
| # randomly select two molecules | |||
| np.random.seed(1) | |||
| idx_gi = [187, 167] # np.random.randint(0, len(Gn), 2) | |||
| g1 = Gn[idx_gi[0]].copy() | |||
| g2 = Gn[idx_gi[1]].copy() | |||
| # nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True) | |||
| # plt.savefig("results/random_preimage/mutag10.png", format="PNG") | |||
| # plt.show() | |||
| # nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True) | |||
| # plt.savefig("results/random_preimage/mutag11.png", format="PNG") | |||
| # plt.show() | |||
| ###################################################################### | |||
| # Gn_mix = [g.copy() for g in Gn] | |||
| # Gn_mix.append(g1.copy()) | |||
| # Gn_mix.append(g2.copy()) | |||
| # | |||
| ## g_tmp = iam([g1, g2]) | |||
| ## nx.draw_networkx(g_tmp) | |||
| ## plt.show() | |||
| # | |||
| # # compute | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn_mix, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| ################################################################### | |||
| idx1 = idx_gi[0] | |||
| idx2 = idx_gi[1] | |||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz') | |||
| km = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| # modify mixed gram matrix. | |||
| for i in range(len(Gn)): | |||
| km[i, len(Gn)] = km[i, idx1] | |||
| km[i, len(Gn) + 1] = km[i, idx2] | |||
| km[len(Gn), i] = km[i, idx1] | |||
| km[len(Gn) + 1, i] = km[i, idx2] | |||
| km[len(Gn), len(Gn)] = km[idx1, idx1] | |||
| km[len(Gn), len(Gn) + 1] = km[idx1, idx2] | |||
| km[len(Gn) + 1, len(Gn)] = km[idx2, idx1] | |||
| km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2] | |||
| ################################################################### | |||
| time_list = [] | |||
| nb_updated_list = [] | |||
| g_best = [] | |||
| dis_ks_min_list = [] | |||
| # for each alpha | |||
| for alpha in alpha_range: | |||
| print('\n-------------------------------------------------------\n') | |||
| print('alpha =', alpha) | |||
| time0 = time.time() | |||
| dhat, ghat, nb_updated = preimage_random(Gn, [g1, g2], [alpha, 1 - alpha], | |||
| range(len(Gn), len(Gn) + 2), km, | |||
| k, r_max, l, gkernel) | |||
| time_total = time.time() - time0 + time_km | |||
| print('time: ', time_total) | |||
| time_list.append(time_total) | |||
| dis_ks_min_list.append(dhat) | |||
| g_best.append(ghat) | |||
| nb_updated_list.append(nb_updated) | |||
| # show best graphs and save them to file. | |||
    for idx, item in enumerate(alpha_range):
        print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
        print('one of the possible corresponding pre-images is')
        nx.draw(g_best[idx], labels=nx.get_node_attributes(g_best[idx], 'atom'),
                with_labels=True)
        # save before showing; plt.show() clears the current figure, so saving
        # afterwards would write an empty image.
        plt.savefig('results/random_preimage/mutag_alpha' + str(item) + '.png', format="PNG")
        plt.show()
        plt.clf()
| print(g_best[idx].nodes(data=True)) | |||
| print(g_best[idx].edges(data=True)) | |||
| # # compute the corresponding sod in graph space. (alpha range not considered.) | |||
| # sod_tmp, _ = median_distance(g_best[0], Gn_let) | |||
| # sod_gs_list.append(sod_tmp) | |||
| # sod_gs_min_list.append(np.min(sod_tmp)) | |||
| # sod_ks_min_list.append(sod_ks) | |||
| # nb_updated_list.append(nb_updated) | |||
| # print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list) | |||
| print('\nnumber of updates for each alpha: ', nb_updated_list) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| if __name__ == '__main__': | |||
| ############################################################################### | |||
| # test on the combination of the two randomly chosen graphs. (the same as in the | |||
| # random pre-image paper.) | |||
| # test_random_preimage_2combination() | |||
| ############################################################################### | |||
| # tests all algorithms on different numbers of median-sets. | |||
| test_preimage_random_median_nb() | |||
| ############################################################################### | |||
| # tests all algorithms on different values on grid of median-sets and k. | |||
| # test_preimage_random_grid_k_median_nb() | |||
| @@ -1,40 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Mon Mar 23 09:52:50 2020 | |||
| @author: ljia | |||
| """ | |||
| import time | |||
| class Timer(object): | |||
| """A timer class that can be used by methods that support time limits. | |||
| Note | |||
| ---- | |||
| This is the Python implementation of `the C++ code in GEDLIB <https://github.com/dbblumenthal/gedlib/blob/master/src/env/timer.hpp>`__. | |||
| """ | |||
| def __init__(self, time_limit_in_sec): | |||
| """Constructs a timer for a given time limit. | |||
| Parameters | |||
| ---------- | |||
        time_limit_in_sec : float
            The time limit in seconds. A non-positive value means no time limit.
| """ | |||
| self.__time_limit_in_sec = time_limit_in_sec | |||
| self.__start_time = time.time() | |||
| def expired(self): | |||
| """Checks if the time limit has expired. | |||
| Return | |||
| ------ | |||
        Boolean. True if the time limit has expired and False otherwise.
| """ | |||
| if self.__time_limit_in_sec > 0: | |||
| runtime = time.time() - self.__start_time | |||
| return runtime >= self.__time_limit_in_sec | |||
| return False | |||
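if __name__ == '__main__':
    # Minimal usage sketch (added for illustration, not part of GEDLIB): spin
    # until a 0.1-second limit expires, then report the number of iterations.
    timer = Timer(0.1)
    nb_iterations = 0
    while not timer.expired():
        nb_iterations += 1
    print('iterations before the time limit expired:', nb_iterations)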
| @@ -1,151 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Oct 17 19:05:07 2019 | |||
| Useful functions. | |||
| @author: ljia | |||
| """ | |||
#import networkx as nx
import sys
import multiprocessing
import numpy as np
from tqdm import tqdm
| from gklearn.kernels.marginalizedKernel import marginalizedkernel | |||
| from gklearn.kernels.untilHPathKernel import untilhpathkernel | |||
| from gklearn.kernels.spKernel import spkernel | |||
| import functools | |||
| from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel | |||
| from gklearn.kernels.structuralspKernel import structuralspkernel | |||
| from gklearn.kernels.treeletKernel import treeletkernel | |||
| from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||
| def remove_edges(Gn): | |||
| for G in Gn: | |||
| for _, _, attrs in G.edges(data=True): | |||
| attrs.clear() | |||
def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
    """Compute the kernel-space distance between graph idx_g and the weighted
    combination g* = sum_i alpha[i] * g_{idx_gi[i]}, i.e.
    sqrt(k(g, g) - 2 * sum_i alpha[i] * k(g, g_i)
         + sum_{i,j} alpha[i] * alpha[j] * k(g_i, g_j)).
    If withterm3 is True, the last (constant) term is taken from the argument
    term3; otherwise it is computed here.
    """
    term1 = Kmatrix[idx_g, idx_g]
    term2 = 0
    for i, a in enumerate(alpha):
        term2 += a * Kmatrix[idx_g, idx_gi[i]]
    term2 *= 2
    if not withterm3:
        for i1, a1 in enumerate(alpha):
            for i2, a2 in enumerate(alpha):
                term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    return np.sqrt(term1 - term2 + term3)
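# Worked example for dis_gstar (illustrative, using an assumed 2x2 Gram matrix):
# with K = np.array([[1.0, 0.5], [0.5, 1.0]]) and alpha = [0.5, 0.5],
# dis_gstar(0, [0, 1], alpha, K, withterm3=False)
# = sqrt(K[0,0] - 2 * (0.5 * K[0,0] + 0.5 * K[0,1])
#        + (0.25 * K[0,0] + 0.5 * K[0,1] + 0.25 * K[1,1]))
# = sqrt(1 - 1.5 + 0.75) = 0.5.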
| def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='imap_unordered'): | |||
| if graph_kernel == 'marginalizedkernel': | |||
| Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label, | |||
| p_quit=0.03, n_iteration=10, remove_totters=False, | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'untilhpathkernel': | |||
| Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, | |||
| depth=7, k_func='MinMax', compute_method='trie', | |||
| parallel=parallel, | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'spkernel': | |||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||
| Kmatrix = np.empty((len(Gn), len(Gn))) | |||
| # Kmatrix[:] = np.nan | |||
| Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels= | |||
| {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| # for i, row in enumerate(idx): | |||
| # for j, col in enumerate(idx): | |||
| # Kmatrix[row, col] = Kmatrix_tmp[i, j] | |||
| elif graph_kernel == 'structuralspkernel': | |||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||
| sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} | |||
| Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, | |||
| edge_label=edge_label, node_kernels=sub_kernels, | |||
| edge_kernels=sub_kernels, | |||
| parallel=parallel, n_jobs=multiprocessing.cpu_count(), | |||
| verbose=verbose) | |||
| elif graph_kernel == 'treeletkernel': | |||
| pkernel = functools.partial(polynomialkernel, d=2, c=1e5) | |||
| # pkernel = functools.partial(gaussiankernel, gamma=1e-6) | |||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||
| Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, | |||
| sub_kernel=pkernel, parallel=parallel, | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'weisfeilerlehmankernel': | |||
| Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label, | |||
| height=4, base_kernel='subtree', parallel=None, | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
    else:
        raise ValueError('unknown graph kernel: ' + str(graph_kernel))
    # cosine normalization: K[i, j] /= sqrt(K[i, i] * K[j, j]).
| Kmatrix_diag = Kmatrix.diagonal().copy() | |||
| for i in range(len(Kmatrix)): | |||
| for j in range(i, len(Kmatrix)): | |||
| Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| return Kmatrix | |||
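# Usage sketch for compute_kernel (illustrative; the label names are the ones
# used for the MUTAG-like datasets in this module):
# K = compute_kernel(Gn, 'treeletkernel', 'atom', 'bond_type', verbose=True)
# Because of the cosine normalization above, K[i, i] == 1 for every graph.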
| def gram2distances(Kmatrix): | |||
| dmatrix = np.zeros((len(Kmatrix), len(Kmatrix))) | |||
| for i1 in range(len(Kmatrix)): | |||
| for i2 in range(len(Kmatrix)): | |||
| dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2] | |||
| dmatrix = np.sqrt(dmatrix) | |||
| return dmatrix | |||
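# gram2distances implements the standard kernel-induced distance
# d(i, j) = sqrt(k(i, i) + k(j, j) - 2 * k(i, j)); e.g. for a normalized Gram
# matrix with K[i, j] = 0.5 off the diagonal, d(i, j) = sqrt(1 + 1 - 1) = 1.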
| def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, | |||
| gkernel=None, verbose=True): | |||
| dis_mat = np.empty((len(Gn), len(Gn))) | |||
| if Kmatrix is None: | |||
| Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j] | |||
| if dis < 0: | |||
| if dis > -1e-10: | |||
| dis = 0 | |||
| else: | |||
| raise ValueError('The distance is negative.') | |||
| dis_mat[i, j] = np.sqrt(dis) | |||
| dis_mat[j, i] = dis_mat[i, j] | |||
| dis_max = np.max(np.max(dis_mat)) | |||
| dis_min = np.min(np.min(dis_mat[dis_mat != 0])) | |||
| dis_mean = np.mean(np.mean(dis_mat)) | |||
| return dis_mat, dis_max, dis_min, dis_mean | |||
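# Usage sketch (illustrative): compute all pairwise kernel distances and their
# statistics from a precomputed Gram matrix K:
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(
#     Gn, None, None, Kmatrix=K)
# Note that dis_min is taken over the non-zero entries only.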
| def get_same_item_indices(ls): | |||
| """Get the indices of the same items in a list. Return a dict keyed by items. | |||
| """ | |||
| idx_dict = {} | |||
| for idx, item in enumerate(ls): | |||
| if item in idx_dict: | |||
| idx_dict[item].append(idx) | |||
| else: | |||
| idx_dict[item] = [idx] | |||
| return idx_dict | |||
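# Example for get_same_item_indices (illustrative):
# get_same_item_indices(['a', 'b', 'a']) returns {'a': [0, 2], 'b': [1]}.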
def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
                                                  node_label=None, edge_label=None):
    """Compute the distance in kernel space between the (implicit) median of
    Gn and each graph in Gn. The original body referred to undefined names
    (idx_gi, Gn_init, dis_all); they are replaced here by their obvious
    intended values for a median over the whole of Gn. Returns the list of
    distances and the indices of Gn sorted by increasing distance (take the
    first k of them for the k nearest neighbors).
    """
    dis_k_all = [] # distance between g_star and each graph.
    alpha = [1 / len(Gn)] * len(Gn)
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
    idx_gi = range(len(Gn))
    # the third term of the distance is constant over all graphs.
    term3 = 0
    for i1, a1 in enumerate(alpha):
        for i2, a2 in enumerate(alpha):
            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
        dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
        dis_k_all.append(dtemp)
    return dis_k_all, np.argsort(dis_k_all)
| def normalize_distance_matrix(D): | |||
| max_value = np.amax(D) | |||
| min_value = np.amin(D) | |||
| return (D - min_value) / (max_value - min_value) | |||
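# normalize_distance_matrix applies min-max scaling, mapping the smallest
# entry of D to 0 and the largest to 1; it assumes D is not constant
# (max_value > min_value), otherwise it divides by zero.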
| @@ -1,585 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Dec 19 17:16:23 2019 | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| from sklearn.manifold import TSNE, Isomap | |||
| import matplotlib.pyplot as plt | |||
| from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, mark_inset | |||
| from tqdm import tqdm | |||
| from gklearn.utils.graphfiles import loadDataset, loadGXL | |||
| from gklearn.preimage.utils import kernel_distance_matrix, compute_kernel, dis_gstar, get_same_item_indices | |||
| def visualize_graph_dataset(dis_measure, visual_method, draw_figure, | |||
| draw_params={}, dis_mat=None, Gn=None, | |||
| median_set=None): | |||
| def draw_zoomed_axes(Gn_embedded, ax): | |||
| margin = 0.01 | |||
| if dis_measure == 'graph-kernel': | |||
| index = -2 | |||
| elif dis_measure == 'ged': | |||
| index = -1 | |||
| x1 = np.min(Gn_embedded[median_set + [index], 0]) - margin * np.max(Gn_embedded) | |||
| x2 = np.max(Gn_embedded[median_set + [index], 0]) + margin * np.max(Gn_embedded) | |||
| y1 = np.min(Gn_embedded[median_set + [index], 1]) - margin * np.max(Gn_embedded) | |||
| y2 = np.max(Gn_embedded[median_set + [index], 1]) + margin * np.max(Gn_embedded) | |||
        if (x1 < 0 and y1 < 0) or (x1 > 0 and y1 > 0):
| loc = 2 | |||
| else: | |||
| loc = 3 | |||
        axins = zoomed_inset_axes(ax, 4, loc=loc) # zoom factor: 4, location chosen above.
| draw_figure(axins, Gn_embedded, dis_measure=dis_measure, | |||
| median_set=median_set, **draw_params) | |||
| axins.set_xlim(x1, x2) # apply the x-limits | |||
| axins.set_ylim(y1, y2) # apply the y-limits | |||
| plt.yticks(visible=False) | |||
| plt.xticks(visible=False) | |||
        loc1 = 1 if loc == 2 else 3
        mark_inset(ax, axins, loc1=loc1, loc2=4, fc="none", ec="0.5")
| if dis_mat is None: | |||
| if dis_measure == 'graph-kernel': | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, | |||
| Kmatrix=None, gkernel=gkernel) | |||
| elif dis_measure == 'ged': | |||
| pass | |||
| if visual_method == 'tsne': | |||
| Gn_embedded = TSNE(n_components=2, metric='precomputed').fit_transform(dis_mat) | |||
| elif visual_method == 'isomap': | |||
| Gn_embedded = Isomap(n_components=2, metric='precomputed').fit_transform(dis_mat) | |||
| print(Gn_embedded.shape) | |||
| fig, ax = plt.subplots() | |||
| draw_figure(plt, Gn_embedded, dis_measure=dis_measure, legend=True, | |||
| median_set=median_set, **draw_params) | |||
| # draw_zoomed_axes(Gn_embedded, ax) | |||
| plt.show() | |||
| plt.clf() | |||
| return | |||
| def draw_figure(ax, Gn_embedded, dis_measure=None, y_idx=None, legend=False, | |||
| median_set=None): | |||
| from matplotlib import colors as mcolors | |||
| colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)) | |||
| # colors = ['#08306b', '#08519c', '#2171b5', '#4292c6', '#6baed6', '#9ecae1', | |||
| # '#c6dbef', '#deebf7'] | |||
| # for i, values in enumerate(y_idx.values()): | |||
| # for item in values: | |||
| ## ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c=colors[i]) # , c='b') | |||
| # ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c='b') | |||
| # ax.scatter(Gn_embedded[:,0], Gn_embedded[:,1], c='b') | |||
| h1 = ax.scatter(Gn_embedded[median_set, 0], Gn_embedded[median_set, 1], c='b') | |||
| if dis_measure == 'graph-kernel': | |||
| h2 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='darkorchid') # \psi | |||
| h3 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='gold') # gen median | |||
| h4 = ax.scatter(Gn_embedded[-3, 0], Gn_embedded[-3, 1], c='r') #c='g', marker='+') # set median | |||
| elif dis_measure == 'ged': | |||
| h3 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='gold') # gen median | |||
| h4 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='r') #c='g', marker='+') # set median | |||
| if legend: | |||
| # fig.subplots_adjust(bottom=0.17) | |||
| if dis_measure == 'graph-kernel': | |||
| ax.legend([h1, h2, h3, h4], | |||
| ['k closest graphs', 'true median', 'gen median', 'set median']) | |||
| elif dis_measure == 'ged': | |||
| ax.legend([h1, h3, h4], ['k closest graphs', 'gen median', 'set median']) | |||
| # fig.legend(handles, labels, loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6) | |||
| # plt.savefig('symbolic_and_non_comparison_vertical_short.eps', format='eps', dpi=300, transparent=True, | |||
| # bbox_inches='tight') | |||
| # plt.show() | |||
| ############################################################################### | |||
| def visualize_distances_in_kernel(): | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| fname_medians = 'expert.treelet' | |||
| # add set median. | |||
| fname_sm = 'results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl' | |||
| set_median = loadGXL(fname_sm) | |||
| Gn.append(set_median) | |||
| # add generalized median (estimated pre-image.) | |||
| fname_gm = 'results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl' | |||
| gen_median = loadGXL(fname_gm) | |||
| Gn.append(gen_median) | |||
| # compute distance matrix | |||
| median_set = [22, 29, 54, 74] | |||
| gkernel = 'treeletkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| Gn_median_set = [Gn[i].copy() for i in median_set] | |||
| Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label, | |||
| edge_label, True) | |||
| Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)] | |||
| dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, | |||
| Kmatrix=Kmatrix, gkernel=gkernel) | |||
| print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn)-2, 0:len(Gn)-2]))) | |||
| print('min distances: ', np.min(np.min(dis_mat[0:len(Gn)-2, 0:len(Gn)-2]))) | |||
| print('max distances: ', np.max(np.max(dis_mat[0:len(Gn)-2, 0:len(Gn)-2]))) | |||
| # add distances for the image of exact median \psi. | |||
| dis_k_median_list = [] | |||
| for idx, g in enumerate(Gn): | |||
| dis_k_median_list.append(dis_gstar(idx, range(len(Gn), len(Gn) + len(Gn_median_set)), | |||
| [1 / len(Gn_median_set)] * len(Gn_median_set), | |||
| Kmatrix_median, withterm3=False)) | |||
| dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1)) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| dis_mat_median[i, j] = dis_mat[i, j] | |||
| dis_mat_median[j, i] = dis_mat_median[i, j] | |||
| for i in range(len(Gn)): | |||
| dis_mat_median[i, -1] = dis_k_median_list[i] | |||
| dis_mat_median[-1, i] = dis_k_median_list[i] | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| # visualization. | |||
| # visualize_graph_dataset('graph-kernel', 'tsne', Gn) | |||
| # visualize_graph_dataset('graph-kernel', 'tsne', draw_figure, | |||
| # draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median) | |||
| visualize_graph_dataset('graph-kernel', 'tsne', draw_figure, | |||
| draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median, | |||
| median_set=median_set) | |||
| def visualize_distances_in_ged(): | |||
| from gklearn.preimage.fitDistance import compute_geds | |||
| from gklearn.preimage.ged import GED | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| # add set median. | |||
| fname_medians = 'expert.treelet' | |||
| fname_sm = 'preimage/results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl' | |||
| set_median = loadGXL(fname_sm) | |||
| Gn.append(set_median) | |||
| # add generalized median (estimated pre-image.) | |||
| fname_gm = 'preimage/results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl' | |||
| gen_median = loadGXL(fname_gm) | |||
| Gn.append(gen_median) | |||
| # compute/load ged matrix. | |||
| # # compute. | |||
| ## k = 4 | |||
| ## edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297] | |||
| # edit_costs = [3, 3, 1, 3, 3, 1] | |||
| ## edit_costs = [7, 3, 5, 9, 2, 6] | |||
| # algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| # params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', | |||
| # 'algo_options': algo_options, 'stabilizer': None, | |||
| # 'edit_cost_constant': edit_costs} | |||
| # _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True) | |||
| # np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm', ged_mat=ged_mat) | |||
| # load from file. | |||
| gmfile = np.load('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm.npz') | |||
| ged_mat = gmfile['ged_mat'] | |||
| # # change medians. | |||
| # edit_costs = [3, 3, 1, 3, 3, 1] | |||
| # algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| # params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', | |||
| # 'algo_options': algo_options, 'stabilizer': None, | |||
| # 'edit_cost_constant': edit_costs} | |||
| # for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout): | |||
| # dis, _, _ = GED(Gn[idx], set_median, **params_ged) | |||
| # ged_mat[idx, -2] = dis | |||
| # ged_mat[-2, idx] = dis | |||
| # dis, _, _ = GED(Gn[idx], gen_median, **params_ged) | |||
| # ged_mat[idx, -1] = dis | |||
| # ged_mat[-1, idx] = dis | |||
| # np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm', | |||
| # ged_mat=ged_mat) | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| # visualization. | |||
| median_set = [22, 29, 54, 74] | |||
| visualize_graph_dataset('ged', 'tsne', draw_figure, | |||
| draw_params={'y_idx': y_idx}, dis_mat=ged_mat, | |||
| median_set=median_set) | |||
| ############################################################################### | |||
| def visualize_distances_in_kernel_monoterpenoides(): | |||
| import os | |||
| ds = {'dataset': '../datasets/monoterpenoides/dataset_10+.ds', | |||
          'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'} # node/edge symb
| Gn_original, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| # compute distance matrix | |||
| # median_set = [22, 29, 54, 74] | |||
| gkernel = 'treeletkernel' | |||
| fit_method = 'expert' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| ds_name = 'monoterpenoides' | |||
| fname_medians = fit_method + '.' + gkernel | |||
| dir_output = 'results/xp_monoterpenoides/' | |||
| repeat = 0 | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
| print('\ny =', y) | |||
| k = len(values) | |||
| Gn = [Gn_original[g].copy() for g in values] | |||
| # add set median. | |||
| fname_sm = dir_output + 'medians/' + str(int(y)) + '/set_median.k' + str(int(k)) \ | |||
| + '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl' | |||
| set_median = loadGXL(fname_sm) | |||
| Gn.append(set_median) | |||
| # add generalized median (estimated pre-image.) | |||
| fname_gm = dir_output + 'medians/' + str(int(y)) + '/gen_median.k' + str(int(k)) \ | |||
| + '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl' | |||
| gen_median = loadGXL(fname_gm) | |||
| Gn.append(gen_median) | |||
| # compute distance matrix | |||
| median_set = range(0, len(values)) | |||
| Gn_median_set = [Gn[i].copy() for i in median_set] | |||
| Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label, | |||
| edge_label, False) | |||
| Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)] | |||
| dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, | |||
| Kmatrix=Kmatrix, gkernel=gkernel) | |||
| print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn)-2, 0:len(Gn)-2]))) | |||
| print('min distances: ', np.min(np.min(dis_mat[0:len(Gn)-2, 0:len(Gn)-2]))) | |||
| print('max distances: ', np.max(np.max(dis_mat[0:len(Gn)-2, 0:len(Gn)-2]))) | |||
| # add distances for the image of exact median \psi. | |||
| dis_k_median_list = [] | |||
| for idx, g in enumerate(Gn): | |||
| dis_k_median_list.append(dis_gstar(idx, range(len(Gn), len(Gn) + len(Gn_median_set)), | |||
| [1 / len(Gn_median_set)] * len(Gn_median_set), | |||
| Kmatrix_median, withterm3=False)) | |||
| dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1)) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| dis_mat_median[i, j] = dis_mat[i, j] | |||
| dis_mat_median[j, i] = dis_mat_median[i, j] | |||
| for i in range(len(Gn)): | |||
| dis_mat_median[i, -1] = dis_k_median_list[i] | |||
| dis_mat_median[-1, i] = dis_k_median_list[i] | |||
| # visualization. | |||
| # visualize_graph_dataset('graph-kernel', 'tsne', Gn) | |||
| # visualize_graph_dataset('graph-kernel', 'tsne', draw_figure, | |||
| # draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median) | |||
| visualize_graph_dataset('graph-kernel', 'tsne', draw_figure, | |||
| draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median, | |||
| median_set=median_set) | |||
| def visualize_distances_in_ged_monoterpenoides(): | |||
| from gklearn.preimage.fitDistance import compute_geds | |||
| from gklearn.preimage.ged import GED | |||
| import os | |||
| ds = {'dataset': '../datasets/monoterpenoides/dataset_10+.ds', | |||
          'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'} # node/edge symb
| Gn_original, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| # compute distance matrix | |||
| # median_set = [22, 29, 54, 74] | |||
| gkernel = 'treeletkernel' | |||
| fit_method = 'expert' | |||
| ds_name = 'monoterpenoides' | |||
| fname_medians = fit_method + '.' + gkernel | |||
| dir_output = 'results/xp_monoterpenoides/' | |||
| repeat = 0 | |||
| # edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297] | |||
| edit_costs = [3, 3, 1, 3, 3, 1] | |||
| # edit_costs = [7, 3, 5, 9, 2, 6] | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
| print('\ny =', y) | |||
| k = len(values) | |||
| Gn = [Gn_original[g].copy() for g in values] | |||
| # add set median. | |||
| fname_sm = dir_output + 'medians/' + str(int(y)) + '/set_median.k' + str(int(k)) \ | |||
| + '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl' | |||
| set_median = loadGXL(fname_sm) | |||
| Gn.append(set_median) | |||
| # add generalized median (estimated pre-image.) | |||
| fname_gm = dir_output + 'medians/' + str(int(y)) + '/gen_median.k' + str(int(k)) \ | |||
| + '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl' | |||
| gen_median = loadGXL(fname_gm) | |||
| Gn.append(gen_median) | |||
| # compute/load ged matrix. | |||
| # compute. | |||
| algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'dataset': ds_name, 'lib': 'gedlibpy', 'cost': 'CONSTANT', | |||
| 'method': 'IPFP', 'algo_options': algo_options, | |||
| 'stabilizer': None, 'edit_cost_constant': edit_costs} | |||
| _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True) | |||
| np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + str(int(y)) \ | |||
| + '.with_medians.gm', ged_mat=ged_mat) | |||
| # # load from file. | |||
# gmfile = np.load(dir_output + 'ged_mat.' + fname_medians + '.y' + str(int(y)) + '.with_medians.gm.npz')
| # ged_mat = gmfile['ged_mat'] | |||
| # # change medians. | |||
| # algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| # params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', | |||
| # 'algo_options': algo_options, 'stabilizer': None, | |||
| # 'edit_cost_constant': edit_costs} | |||
| # for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout): | |||
| # dis, _, _ = GED(Gn[idx], set_median, **params_ged) | |||
| # ged_mat[idx, -2] = dis | |||
| # ged_mat[-2, idx] = dis | |||
| # dis, _, _ = GED(Gn[idx], gen_median, **params_ged) | |||
| # ged_mat[idx, -1] = dis | |||
| # ged_mat[-1, idx] = dis | |||
| # np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + str(int(y)) + '.with_medians.gm', | |||
| # ged_mat=ged_mat) | |||
| # visualization. | |||
| median_set = range(0, len(values)) | |||
| visualize_graph_dataset('ged', 'tsne', draw_figure, | |||
| draw_params={'y_idx': y_idx}, dis_mat=ged_mat, | |||
| median_set=median_set) | |||
| ############################################################################### | |||
def visualize_distances_in_kernel_letter_h():
    import os
    ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
          'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
| Gn_original, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) | |||
| # Gn = Gn[0:50] | |||
| # compute distance matrix | |||
| # median_set = [22, 29, 54, 74] | |||
| gkernel = 'structuralspkernel' | |||
| fit_method = 'expert' | |||
| node_label = None | |||
| edge_label = None | |||
| ds_name = 'letter-h' | |||
| fname_medians = fit_method + '.' + gkernel | |||
| dir_output = 'results/xp_letter_h/' | |||
| k = 150 | |||
| repeat = 0 | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
| print('\ny =', y) | |||
| Gn = [Gn_original[g].copy() for g in values] | |||
| # add set median. | |||
| fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \ | |||
| + '.y' + y + '.repeat' + str(repeat) + '.gxl' | |||
| set_median = loadGXL(fname_sm) | |||
| Gn.append(set_median) | |||
| # add generalized median (estimated pre-image.) | |||
| fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \ | |||
| + '.y' + y + '.repeat' + str(repeat) + '.gxl' | |||
| gen_median = loadGXL(fname_gm) | |||
| Gn.append(gen_median) | |||
| # compute distance matrix | |||
| median_set = range(0, len(values)) | |||
| Gn_median_set = [Gn[i].copy() for i in median_set] | |||
| Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label, | |||
| edge_label, False) | |||
| Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)] | |||
| dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, | |||
| Kmatrix=Kmatrix, gkernel=gkernel) | |||
| print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn)-2, 0:len(Gn)-2]))) | |||
| print('min distances: ', np.min(np.min(dis_mat[0:len(Gn)-2, 0:len(Gn)-2]))) | |||
| print('max distances: ', np.max(np.max(dis_mat[0:len(Gn)-2, 0:len(Gn)-2]))) | |||
| # add distances for the image of exact median \psi. | |||
| dis_k_median_list = [] | |||
| for idx, g in enumerate(Gn): | |||
| dis_k_median_list.append(dis_gstar(idx, range(len(Gn), len(Gn) + len(Gn_median_set)), | |||
| [1 / len(Gn_median_set)] * len(Gn_median_set), | |||
| Kmatrix_median, withterm3=False)) | |||
| dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1)) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| dis_mat_median[i, j] = dis_mat[i, j] | |||
| dis_mat_median[j, i] = dis_mat_median[i, j] | |||
| for i in range(len(Gn)): | |||
| dis_mat_median[i, -1] = dis_k_median_list[i] | |||
| dis_mat_median[-1, i] = dis_k_median_list[i] | |||
| # visualization. | |||
| # visualize_graph_dataset('graph-kernel', 'tsne', Gn) | |||
| # visualize_graph_dataset('graph-kernel', 'tsne', draw_figure, | |||
| # draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median) | |||
| visualize_graph_dataset('graph-kernel', 'tsne', draw_figure, | |||
| draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median, | |||
| median_set=median_set) | |||
def visualize_distances_in_ged_letter_h():
    import os
    from gklearn.preimage.fitDistance import compute_geds
    from gklearn.preimage.test_k_closest_graphs import reform_attributes
    ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
          'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
| Gn_original, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) | |||
| # Gn = Gn[0:50] | |||
| # compute distance matrix | |||
| # median_set = [22, 29, 54, 74] | |||
| gkernel = 'structuralspkernel' | |||
| fit_method = 'expert' | |||
| ds_name = 'letter-h' | |||
| fname_medians = fit_method + '.' + gkernel | |||
| dir_output = 'results/xp_letter_h/' | |||
| k = 150 | |||
| repeat = 0 | |||
| # edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297] | |||
| edit_costs = [3, 3, 1, 3, 3, 1] | |||
| # edit_costs = [7, 3, 5, 9, 2, 6] | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
| print('\ny =', y) | |||
| Gn = [Gn_original[g].copy() for g in values] | |||
| # add set median. | |||
| fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \ | |||
| + '.y' + y + '.repeat' + str(repeat) + '.gxl' | |||
| set_median = loadGXL(fname_sm) | |||
| Gn.append(set_median) | |||
| # add generalized median (estimated pre-image.) | |||
| fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \ | |||
| + '.y' + y + '.repeat' + str(repeat) + '.gxl' | |||
| gen_median = loadGXL(fname_gm) | |||
| Gn.append(gen_median) | |||
| # compute/load ged matrix. | |||
| # compute. | |||
| algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'dataset': 'Letter', 'lib': 'gedlibpy', 'cost': 'CONSTANT', | |||
| 'method': 'IPFP', 'algo_options': algo_options, | |||
| 'stabilizer': None, 'edit_cost_constant': edit_costs} | |||
| for g in Gn: | |||
| reform_attributes(g) | |||
| _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True) | |||
| np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm', ged_mat=ged_mat) | |||
| # # load from file. | |||
# gmfile = np.load(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm.npz')
| # ged_mat = gmfile['ged_mat'] | |||
| # # change medians. | |||
| # algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| # params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', | |||
| # 'algo_options': algo_options, 'stabilizer': None, | |||
| # 'edit_cost_constant': edit_costs} | |||
| # for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout): | |||
| # dis, _, _ = GED(Gn[idx], set_median, **params_ged) | |||
| # ged_mat[idx, -2] = dis | |||
| # ged_mat[-2, idx] = dis | |||
| # dis, _, _ = GED(Gn[idx], gen_median, **params_ged) | |||
| # ged_mat[idx, -1] = dis | |||
| # ged_mat[-1, idx] = dis | |||
| # np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm', | |||
| # ged_mat=ged_mat) | |||
| # visualization. | |||
| median_set = range(0, len(values)) | |||
| visualize_graph_dataset('ged', 'tsne', draw_figure, | |||
| draw_params={'y_idx': y_idx}, dis_mat=ged_mat, | |||
| median_set=median_set) | |||
| if __name__ == '__main__': | |||
| visualize_distances_in_kernel_letter_h() | |||
| # visualize_distances_in_ged_letter_h() | |||
| # visualize_distances_in_kernel_monoterpenoides() | |||
# visualize_distances_in_ged_monoterpenoides()
| # visualize_distances_in_kernel() | |||
| # visualize_distances_in_ged() | |||
| #def draw_figure_dis_k(ax, Gn_embedded, y_idx=None, legend=False): | |||
| # from matplotlib import colors as mcolors | |||
| # colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)) | |||
| ## colors = ['#08306b', '#08519c', '#2171b5', '#4292c6', '#6baed6', '#9ecae1', | |||
| ## '#c6dbef', '#deebf7'] | |||
| # for i, values in enumerate(y_idx.values()): | |||
| # for item in values: | |||
| ## ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c=colors[i]) # , c='b') | |||
| # ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c='b') | |||
| # h1 = ax.scatter(Gn_embedded[[12, 13, 22, 29], 0], Gn_embedded[[12, 13, 22, 29], 1], c='r') | |||
| # h2 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='darkorchid') # \psi | |||
| # h3 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='gold') # gen median | |||
| # h4 = ax.scatter(Gn_embedded[-3, 0], Gn_embedded[-3, 1], c='r', marker='+') # set median | |||
| # if legend: | |||
| ## fig.subplots_adjust(bottom=0.17) | |||
# ax.legend([h1, h2, h3, h4], ['k closest graphs', 'true median', 'gen median', 'set median'])
| ## fig.legend(handles, labels, loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6) | |||
| ## plt.savefig('symbolic_and_non_comparison_vertical_short.eps', format='eps', dpi=300, transparent=True, | |||
| ## bbox_inches='tight') | |||
| ## plt.show() | |||
| #def draw_figure_ged(ax, Gn_embedded, y_idx=None, legend=False): | |||
| # from matplotlib import colors as mcolors | |||
| # colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)) | |||
| ## colors = ['#08306b', '#08519c', '#2171b5', '#4292c6', '#6baed6', '#9ecae1', | |||
| ## '#c6dbef', '#deebf7'] | |||
| # for i, values in enumerate(y_idx.values()): | |||
| # for item in values: | |||
| ## ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c=colors[i]) # , c='b') | |||
| # ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c='b') | |||
| # h1 = ax.scatter(Gn_embedded[[12, 13, 22, 29], 0], Gn_embedded[[12, 13, 22, 29], 1], c='r') | |||
| ## h2 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='darkorchid') # \psi | |||
| # h3 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='gold') # gen median | |||
| # h4 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='r', marker='+') # set median | |||
| # if legend: | |||
| ## fig.subplots_adjust(bottom=0.17) | |||
# ax.legend([h1, h3, h4], ['k closest graphs', 'gen median', 'set median'])
| ## fig.legend(handles, labels, loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6) | |||
| ## plt.savefig('symbolic_and_non_comparison_vertical_short.eps', format='eps', dpi=300, transparent=True, | |||
| ## bbox_inches='tight') | |||
| ## plt.show() | |||
| @@ -1,935 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Tue Jan 14 15:39:29 2020 | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| import random | |||
| import csv | |||
| from shutil import copyfile | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| import os | |||
| import time | |||
| from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL | |||
| from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes | |||
| from gklearn.preimage.utils import get_same_item_indices, kernel_distance_matrix, compute_kernel | |||
| from gklearn.preimage.find_best_k import getRelations | |||
| def get_dataset(ds_name): | |||
    if ds_name in ('Letter-high', 'Letter-med', 'Letter-low'): # node non-symb
        dataset = 'cpp_ext/data/collections/Letter.xml'
        level = ds_name.split('-')[1].upper()  # 'HIGH', 'MED' or 'LOW'.
        graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/' + level + '/'
        Gn, y_all = loadDataset(dataset, extra_params=graph_dir)
        for G in Gn:
            reform_attributes(G, na_names=['x', 'y'])
            G.graph['node_labels'] = []
            G.graph['edge_labels'] = []
            G.graph['node_attrs'] = ['x', 'y']
            G.graph['edge_attrs'] = []
| elif ds_name == 'Fingerprint': | |||
| # dataset = 'cpp_ext/data/collections/Fingerprint.xml' | |||
| # graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/Fingerprint/node_attrs/' | |||
| # Gn, y_all = loadDataset(dataset, extra_params=graph_dir) | |||
| # for G in Gn: | |||
| # reform_attributes(G) | |||
| dataset = '../../datasets/Fingerprint/Fingerprint_A.txt' | |||
| graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/Fingerprint/node_attrs/' | |||
| Gn, y_all = loadDataset(dataset) | |||
| elif ds_name == 'SYNTHETIC': | |||
| pass | |||
| elif ds_name == 'SYNTHETICnew': | |||
| dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' | |||
| graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/SYNTHETICnew' | |||
| # dataset = '../../datasets/Letter-high/Letter-high_A.txt' | |||
| # graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/' | |||
| Gn, y_all = loadDataset(dataset) | |||
| elif ds_name == 'Synthie': | |||
| pass | |||
| elif ds_name == 'COIL-DEL': | |||
| dataset = '../../datasets/COIL-DEL/COIL-DEL_A.txt' | |||
| graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/COIL-DEL/' | |||
| Gn, y_all = loadDataset(dataset) | |||
| elif ds_name == 'COIL-RAG': | |||
| pass | |||
| elif ds_name == 'COLORS-3': | |||
| pass | |||
    elif ds_name == 'FRANKENSTEIN':
        pass
    else:
        raise ValueError('unknown dataset name: ' + ds_name)
    # note: the branches above that only pass are unimplemented placeholders;
    # falling through them leaves the return values unassigned.
    return Gn, y_all, graph_dir
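# Usage sketch (illustrative):
# Gn, y_all, graph_dir = get_dataset('Letter-high')
# loads the Letter (HIGH) graphs with their 'x'/'y' node attributes reformed
# for the non-symbolic pipeline below.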
| def init_output_file(ds_name, gkernel, fit_method, dir_output): | |||
| # fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost', | |||
| 'GED method', 'attr distance', 'fit method', 'k', | |||
| 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', 'fitting time', 'generating time', 'total time', | |||
| 'median set']) | |||
| f_detail.close() | |||
| # fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv' | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost', | |||
| 'GED method', 'attr distance', 'fit method', 'k', | |||
| 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', 'fitting time', 'generating time', 'total time', | |||
| '# SOD SM -> GM', '# dis_k SM -> GM', | |||
| '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', | |||
| 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', | |||
| 'repeats better dis_k gi -> GM']) | |||
| f_summary.close() | |||
| return fn_output_detail, fn_output_summary | |||
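# Usage sketch (illustrative; the arguments are assumptions):
# fn_detail, fn_summary = init_output_file('Letter-high', 'structuralspkernel',
#                                          'expert', 'results/xp_fit_method/')
# appends a header row to the two CSV files and returns their file names.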
| def xp_fit_method_for_non_symbolic(parameters, save_results=True, initial_solutions=1, | |||
| Gn_data=None, k_dis_data=None, Kmatrix=None, | |||
| is_separate=False): | |||
| # 1. set parameters. | |||
| print('1. setting parameters...') | |||
| ds_name = parameters['ds_name'] | |||
| gkernel = parameters['gkernel'] | |||
| edit_cost_name = parameters['edit_cost_name'] | |||
| ged_method = parameters['ged_method'] | |||
| attr_distance = parameters['attr_distance'] | |||
| fit_method = parameters['fit_method'] | |||
| init_ecc = parameters['init_ecc'] | |||
| node_label = None | |||
| edge_label = None | |||
| dir_output = 'results/xp_fit_method/' | |||
| # 2. get dataset. | |||
| print('2. getting dataset...') | |||
| if Gn_data is None: | |||
| Gn, y_all, graph_dir = get_dataset(ds_name) | |||
| else: | |||
| Gn = Gn_data[0] | |||
| y_all = Gn_data[1] | |||
| graph_dir = Gn_data[2] | |||
| # 3. compute kernel distance matrix. | |||
| print('3. computing kernel distance matrix...') | |||
| if k_dis_data is None: | |||
| dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, | |||
| None, Kmatrix=Kmatrix, gkernel=gkernel) | |||
| else: | |||
| # dis_mat = k_dis_data[0] | |||
| # dis_max = k_dis_data[1] | |||
| # dis_min = k_dis_data[2] | |||
| # dis_mean = k_dis_data[3] | |||
| # print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min, dis_mean) | |||
| pass | |||
| if save_results: | |||
| # create result files. | |||
| print('creating output files...') | |||
| fn_output_detail, fn_output_summary = init_output_file(ds_name, gkernel, | |||
| fit_method, dir_output) | |||
| # start repeats. | |||
| repeats = 1 | |||
| # k_list = range(2, 11) | |||
| k_list = [0] | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| random.seed(1) | |||
| rdn_seed_list = random.sample(range(0, repeats * 100), repeats) | |||
| for k in k_list: | |||
| # print('\n--------- k =', k, '----------') | |||
| sod_sm_mean_list = [] | |||
| sod_gm_mean_list = [] | |||
| dis_k_sm_mean_list = [] | |||
| dis_k_gm_mean_list = [] | |||
| dis_k_gi_min_mean_list = [] | |||
| time_fitting_mean_list = [] | |||
| time_generating_mean_list = [] | |||
| time_total_mean_list = [] | |||
        # 4. start generating and computing over targets.
        print('4. generating and computing over targets...')
| for i, (y, values) in enumerate(y_idx.items()): | |||
| # y = 'I' | |||
| # values = y_idx[y] | |||
| # values = values[0:10] | |||
| print('\ny =', y) | |||
| # if y.strip() == 'A': | |||
| # continue | |||
| k = len(values) | |||
| print('\n--------- k =', k, '----------') | |||
| if k < 2: | |||
| print('\nk = ', k, ', skip.\n') | |||
| continue | |||
| sod_sm_list = [] | |||
| sod_gm_list = [] | |||
| dis_k_sm_list = [] | |||
| dis_k_gm_list = [] | |||
| dis_k_gi_min_list = [] | |||
| time_fitting_list = [] | |||
| time_generating_list = [] | |||
| time_total_list = [] | |||
| nb_sod_sm2gm = [0, 0, 0] | |||
| nb_dis_k_sm2gm = [0, 0, 0] | |||
| nb_dis_k_gi2sm = [0, 0, 0] | |||
| nb_dis_k_gi2gm = [0, 0, 0] | |||
| repeats_better_sod_sm2gm = [] | |||
| repeats_better_dis_k_sm2gm = [] | |||
| repeats_better_dis_k_gi2sm = [] | |||
| repeats_better_dis_k_gi2gm = [] | |||
| # get Gram matrix for this part of data. | |||
| if Kmatrix is not None: | |||
| if is_separate: | |||
| Kmatrix_sub = Kmatrix[i].copy() | |||
| else: | |||
| Kmatrix_sub = Kmatrix[values,:] | |||
| Kmatrix_sub = Kmatrix_sub[:,values] | |||
| else: | |||
| Kmatrix_sub = None | |||
| for repeat in range(repeats): | |||
| print('\nrepeat =', repeat) | |||
| random.seed(rdn_seed_list[repeat]) | |||
| median_set_idx_idx = random.sample(range(0, len(values)), k) | |||
| median_set_idx = [values[idx] for idx in median_set_idx_idx] | |||
| print('median set: ', median_set_idx) | |||
| Gn_median = [Gn[g] for g in values] | |||
| # from notebooks.utils.plot_all_graphs import draw_Fingerprint_graph | |||
| # for Gn in Gn_median: | |||
| # draw_Fingerprint_graph(Gn, save=None) | |||
| # GENERATING & COMPUTING!! | |||
| res_sods, res_dis_ks, res_times = median_on_k_closest_graphs(Gn_median, | |||
| node_label, edge_label, | |||
| gkernel, k, fit_method=fit_method, graph_dir=graph_dir, | |||
| edit_cost_constants=None, group_min=median_set_idx_idx, | |||
| dataset=ds_name, initial_solutions=initial_solutions, | |||
| edit_cost_name=edit_cost_name, init_ecc=init_ecc, | |||
| Kmatrix=Kmatrix_sub, parallel=False) | |||
| sod_sm = res_sods[0] | |||
| sod_gm = res_sods[1] | |||
| dis_k_sm = res_dis_ks[0] | |||
| dis_k_gm = res_dis_ks[1] | |||
| dis_k_gi = res_dis_ks[2] | |||
| dis_k_gi_min = res_dis_ks[3] | |||
| idx_dis_k_gi_min = res_dis_ks[4] | |||
| time_fitting = res_times[0] | |||
| time_generating = res_times[1] | |||
| # write result detail. | |||
| sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) | |||
| dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) | |||
| dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) | |||
| dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) | |||
| if save_results: | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow([ds_name, gkernel, | |||
| edit_cost_name, ged_method, attr_distance, | |||
| fit_method, k, y, repeat, | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, | |||
| dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, | |||
| dis_k_gi2gm, time_fitting, time_generating, | |||
| time_fitting + time_generating, median_set_idx]) | |||
| f_detail.close() | |||
| # compute result summary. | |||
| sod_sm_list.append(sod_sm) | |||
| sod_gm_list.append(sod_gm) | |||
| dis_k_sm_list.append(dis_k_sm) | |||
| dis_k_gm_list.append(dis_k_gm) | |||
| dis_k_gi_min_list.append(dis_k_gi_min) | |||
| time_fitting_list.append(time_fitting) | |||
| time_generating_list.append(time_generating) | |||
| time_total_list.append(time_fitting + time_generating) | |||
| # # SOD SM -> GM | |||
| if sod_sm > sod_gm: | |||
| nb_sod_sm2gm[0] += 1 | |||
| repeats_better_sod_sm2gm.append(repeat) | |||
| elif sod_sm == sod_gm: | |||
| nb_sod_sm2gm[1] += 1 | |||
| elif sod_sm < sod_gm: | |||
| nb_sod_sm2gm[2] += 1 | |||
| # # dis_k SM -> GM | |||
| if dis_k_sm > dis_k_gm: | |||
| nb_dis_k_sm2gm[0] += 1 | |||
| repeats_better_dis_k_sm2gm.append(repeat) | |||
| elif dis_k_sm == dis_k_gm: | |||
| nb_dis_k_sm2gm[1] += 1 | |||
| elif dis_k_sm < dis_k_gm: | |||
| nb_dis_k_sm2gm[2] += 1 | |||
| # # dis_k gi -> SM | |||
| if dis_k_gi_min > dis_k_sm: | |||
| nb_dis_k_gi2sm[0] += 1 | |||
| repeats_better_dis_k_gi2sm.append(repeat) | |||
| elif dis_k_gi_min == dis_k_sm: | |||
| nb_dis_k_gi2sm[1] += 1 | |||
| elif dis_k_gi_min < dis_k_sm: | |||
| nb_dis_k_gi2sm[2] += 1 | |||
| # # dis_k gi -> GM | |||
| if dis_k_gi_min > dis_k_gm: | |||
| nb_dis_k_gi2gm[0] += 1 | |||
| repeats_better_dis_k_gi2gm.append(repeat) | |||
| elif dis_k_gi_min == dis_k_gm: | |||
| nb_dis_k_gi2gm[1] += 1 | |||
| elif dis_k_gi_min < dis_k_gm: | |||
| nb_dis_k_gi2gm[2] += 1 | |||
| # save median graphs. | |||
| fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' | |||
| fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat) | |||
| copyfile(fname_sm, fn_pre_sm_new + '.gxl') | |||
| fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' | |||
| fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat) | |||
| copyfile(fname_gm, fn_pre_gm_new + '.gxl') | |||
| G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() | |||
| # reform_attributes(G_best_kernel) | |||
| fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat) | |||
| saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='default') | |||
| # plot median graphs. | |||
                if ds_name in ('Letter-high', 'Letter-med', 'Letter-low'):
| set_median = loadGXL(fn_pre_sm_new + '.gxl') | |||
| gen_median = loadGXL(fn_pre_gm_new + '.gxl') | |||
| draw_Letter_graph(set_median, fn_pre_sm_new) | |||
| draw_Letter_graph(gen_median, fn_pre_gm_new) | |||
| draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel) | |||
| # write result summary for each letter. | |||
| sod_sm_mean_list.append(np.mean(sod_sm_list)) | |||
| sod_gm_mean_list.append(np.mean(sod_gm_list)) | |||
| dis_k_sm_mean_list.append(np.mean(dis_k_sm_list)) | |||
| dis_k_gm_mean_list.append(np.mean(dis_k_gm_list)) | |||
| dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list)) | |||
| time_fitting_mean_list.append(np.mean(time_fitting_list)) | |||
| time_generating_mean_list.append(np.mean(time_generating_list)) | |||
| time_total_mean_list.append(np.mean(time_total_list)) | |||
| sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1])) | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| if save_results: | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, | |||
| edit_cost_name, ged_method, attr_distance, | |||
| fit_method, k, y, | |||
| sod_sm_mean_list[-1], sod_gm_mean_list[-1], | |||
| dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], | |||
| dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean, | |||
| time_fitting_mean_list[-1], time_generating_mean_list[-1], | |||
| time_total_mean_list[-1], nb_sod_sm2gm, | |||
| nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, | |||
| repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, | |||
| repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) | |||
| f_summary.close() | |||
        # write overall result summary (means over all classes).
        sod_sm_mean = np.mean(sod_sm_mean_list)
        sod_gm_mean = np.mean(sod_gm_mean_list)
        dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
        dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
        # use the per-class mean lists here, not the per-repeat lists, which
        # only hold the results of the last class.
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
        time_fitting_mean = np.mean(time_fitting_mean_list)
        time_generating_mean = np.mean(time_generating_mean_list)
        time_total_mean = np.mean(time_total_mean_list)
| sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean)) | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) | |||
| if save_results: | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, | |||
| edit_cost_name, ged_method, attr_distance, | |||
| fit_method, k, 'all', | |||
| sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, | |||
| dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean, | |||
| time_fitting_mean, time_generating_mean, time_total_mean]) | |||
| f_summary.close() | |||
| print('\ncomplete.') | |||
# draw the current median graph.
| def draw_Letter_graph(graph, file_prefix): | |||
| plt.figure() | |||
| pos = {} | |||
| for n in graph.nodes: | |||
        # graph.node was removed in networkx >= 2.4; graph.nodes works in 2.x.
        pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
| nx.draw_networkx(graph, pos) | |||
| plt.savefig(file_prefix + '.eps', format='eps', dpi=300) | |||
| # plt.show() | |||
| plt.clf() | |||
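# Usage sketch (illustrative): draw_Letter_graph(Gn[0], 'results/letter0')
# writes results/letter0.eps, placing each node at its (x, y) attributes.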
def compute_gm_for_each_class(Gn, y_all, gkernel, ds_name, parallel='imap_unordered', is_separate=True):
    # ds_name is taken as a parameter here: the original body used it in the
    # output file names without it being defined anywhere in scope.
| if is_separate: | |||
| print('the Gram matrix is computed for each class.') | |||
| y_idx = get_same_item_indices(y_all) | |||
| Kmatrix = [] | |||
| run_time = [] | |||
| k_dis_data = [] | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
            print('class', i, ':')
| Gn_i = [Gn[val] for val in values] | |||
| time0 = time.time() | |||
| Kmatrix.append(compute_kernel(Gn_i, gkernel, None, None, True, parallel=parallel)) | |||
| run_time.append(time.time() - time0) | |||
| k_dis_data.append(kernel_distance_matrix(Gn_i, None, None, | |||
| Kmatrix=Kmatrix[i], gkernel=gkernel, verbose=True)) | |||
| np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', | |||
| Kmatrix=Kmatrix, run_time=run_time, is_separate=is_separate) | |||
| dis_max = np.max([item[1] for item in k_dis_data]) | |||
| dis_min = np.min([item[2] for item in k_dis_data]) | |||
| dis_mean = np.mean([item[3] for item in k_dis_data]) | |||
| print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min, | |||
| dis_mean) | |||
| else: | |||
| time0 = time.time() | |||
| Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel=parallel) | |||
| run_time = time.time() - time0 | |||
| np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', | |||
| Kmatrix=Kmatrix, run_time=run_time, is_separate=is_separate) | |||
| k_dis_data = kernel_distance_matrix(Gn, None, None, | |||
| Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) | |||
| print('the Gram matrix is computed for the whole dataset.') | |||
| print('pair distances - dis_max, dis_min, dis_mean:', k_dis_data[1], | |||
| k_dis_data[2], k_dis_data[3]) | |||
| print('\nTime to compute Gram matrix for the whole dataset: ', run_time) | |||
| # k_dis_data = [dis_mat, dis_max, dis_min, dis_mean] | |||
| return Kmatrix, run_time, k_dis_data | |||
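# Usage sketch (illustrative; the dataset and kernel names are assumptions):
# Kmatrix, run_time, k_dis_data = compute_gm_for_each_class(
#     Gn, y_all, 'structuralspkernel', 'Letter-high', is_separate=True)
# With is_separate=True, Kmatrix is a list with one Gram matrix per class.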
| if __name__ == "__main__": | |||
| # #### xp 1: Letter-high, spkernel. | |||
| # # load dataset. | |||
| # print('getting dataset and computing kernel distance matrix first...') | |||
| # ds_name = 'Letter-high' | |||
| # gkernel = 'spkernel' | |||
| # Gn, y_all, graph_dir = get_dataset(ds_name) | |||
| # # remove graphs without edges. | |||
| # Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] | |||
| # idx = [G[0] for G in Gn] | |||
| # Gn = [G[1] for G in Gn] | |||
| # y_all = [y_all[i] for i in idx] | |||
| ## Gn = Gn[0:50] | |||
| ## y_all = y_all[0:50] | |||
| # # compute pair distances. | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, | |||
| # Kmatrix=None, gkernel=gkernel, verbose=True) | |||
| ## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 | |||
| # # fitting and computing. | |||
| # fit_methods = ['random', 'expert', 'k-graphs'] | |||
| # for fit_method in fit_methods: | |||
| # print('\n-------------------------------------') | |||
| # print('fit method:', fit_method) | |||
| # parameters = {'ds_name': ds_name, | |||
| # 'gkernel': gkernel, | |||
| # 'edit_cost_name': 'LETTER2', | |||
| # 'ged_method': 'mIPFP', | |||
| # 'attr_distance': 'euclidean', | |||
| # 'fit_method': fit_method} | |||
| # xp_fit_method_for_non_symbolic(parameters, save_results=True, | |||
| # initial_solutions=40, | |||
| # Gn_data = [Gn, y_all, graph_dir], | |||
| # k_dis_data = [dis_mat, dis_max, dis_min, dis_mean]) | |||
| # #### xp 2: Letter-high, sspkernel. | |||
| # # load dataset. | |||
| # print('getting dataset and computing kernel distance matrix first...') | |||
| # ds_name = 'Letter-high' | |||
| # gkernel = 'structuralspkernel' | |||
| # Gn, y_all, graph_dir = get_dataset(ds_name) | |||
| ## Gn = Gn[0:50] | |||
| ## y_all = y_all[0:50] | |||
| # # compute pair distances. | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, | |||
| # Kmatrix=None, gkernel=gkernel, verbose=True) | |||
| ## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 | |||
| # # fitting and computing. | |||
| # fit_methods = ['random', 'expert', 'k-graphs'] | |||
| # for fit_method in fit_methods: | |||
| # print('\n-------------------------------------') | |||
| # print('fit method:', fit_method) | |||
| # parameters = {'ds_name': ds_name, | |||
| # 'gkernel': gkernel, | |||
| # 'edit_cost_name': 'LETTER2', | |||
| # 'ged_method': 'mIPFP', | |||
| # 'attr_distance': 'euclidean', | |||
| # 'fit_method': fit_method} | |||
| # print('parameters: ', parameters) | |||
| # xp_fit_method_for_non_symbolic(parameters, save_results=True, | |||
| # initial_solutions=40, | |||
| # Gn_data = [Gn, y_all, graph_dir], | |||
| # k_dis_data = [dis_mat, dis_max, dis_min, dis_mean]) | |||
| # #### xp 3: SYNTHETICnew, sspkernel, using NON_SYMBOLIC. | |||
| # gmfile = np.load('results/xp_fit_method/Kmatrix.SYNTHETICnew.structuralspkernel.gm.npz') | |||
| # Kmatrix = gmfile['Kmatrix'] | |||
| # run_time = gmfile['run_time'] | |||
| # # normalization | |||
| # Kmatrix_diag = Kmatrix.diagonal().copy() | |||
| # for i in range(len(Kmatrix)): | |||
| # for j in range(i, len(Kmatrix)): | |||
| # Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||
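# # vectorized equivalent of the normalization loop above (a sketch, not in
# # the original): Kmatrix /= np.sqrt(np.outer(Kmatrix_diag, Kmatrix_diag))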
| ## np.savez('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm', | |||
| ## Kmatrix=Kmatrix, run_time=run_time) | |||
| # # load dataset. | |||
| # print('getting dataset and computing kernel distance matrix first...') | |||
| # ds_name = 'SYNTHETICnew' | |||
| # gkernel = 'structuralspkernel' | |||
| # Gn, y_all, graph_dir = get_dataset(ds_name) | |||
| # # remove graphs without nodes and edges. | |||
| # Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0 | |||
| # and nx.number_of_edges(G) != 0)] | |||
| # idx = [G[0] for G in Gn] | |||
| # Gn = [G[1] for G in Gn] | |||
| # y_all = [y_all[i] for i in idx] | |||
| ## Gn = Gn[0:10] | |||
| ## y_all = y_all[0:10] | |||
| # for G in Gn: | |||
| # G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl' | |||
| # # compute pair distances. | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, | |||
| # Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) | |||
| ## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 | |||
| # # fitting and computing. | |||
| # fit_methods = ['k-graphs', 'random', 'random', 'random'] | |||
| # for fit_method in fit_methods: | |||
| # print('\n-------------------------------------') | |||
| # print('fit method:', fit_method) | |||
| # parameters = {'ds_name': ds_name, | |||
| # 'gkernel': gkernel, | |||
| # 'edit_cost_name': 'NON_SYMBOLIC', | |||
| # 'ged_method': 'mIPFP', | |||
| # 'attr_distance': 'euclidean', | |||
| # 'fit_method': fit_method} | |||
| # xp_fit_method_for_non_symbolic(parameters, save_results=True, | |||
| # initial_solutions=1, | |||
| # Gn_data = [Gn, y_all, graph_dir], | |||
| # k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], | |||
| # Kmatrix=Kmatrix) | |||
| # ### xp 4: SYNTHETICnew, spkernel, using NON_SYMBOLIC. | |||
| # gmfile = np.load('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm.npz') | |||
| # Kmatrix = gmfile['Kmatrix'] | |||
| # # normalization | |||
| # Kmatrix_diag = Kmatrix.diagonal().copy() | |||
| # for i in range(len(Kmatrix)): | |||
| # for j in range(i, len(Kmatrix)): | |||
| # Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||
| # run_time = 21821.35 | |||
| # np.savez('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm', | |||
| # Kmatrix=Kmatrix, run_time=run_time) | |||
| # | |||
| # # load dataset. | |||
| # print('getting dataset and computing kernel distance matrix first...') | |||
| # ds_name = 'SYNTHETICnew' | |||
| # gkernel = 'spkernel' | |||
| # Gn, y_all, graph_dir = get_dataset(ds_name) | |||
| ## # remove graphs without nodes and edges. | |||
## Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0
| ## and nx.number_of_edges(G) != 0)] | |||
| ## idx = [G[0] for G in Gn] | |||
| ## Gn = [G[1] for G in Gn] | |||
| ## y_all = [y_all[i] for i in idx] | |||
| ## Gn = Gn[0:5] | |||
| ## y_all = y_all[0:5] | |||
| # for G in Gn: | |||
| # G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl' | |||
| # | |||
| # # compute/read Gram matrix and pair distances. | |||
| ## Kmatrix = compute_kernel(Gn, gkernel, None, None, True) | |||
| ## np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', | |||
| ## Kmatrix=Kmatrix) | |||
| # gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') | |||
| # Kmatrix = gmfile['Kmatrix'] | |||
| # run_time = gmfile['run_time'] | |||
| ## Kmatrix = Kmatrix[[0,1,2,3,4],:] | |||
| ## Kmatrix = Kmatrix[:,[0,1,2,3,4]] | |||
| # print('\nTime to compute Gram matrix for the whole dataset: ', run_time) | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, | |||
| # Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) | |||
| ## Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| ## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 | |||
| # | |||
| # # fitting and computing. | |||
| # fit_methods = ['k-graphs', 'random', 'random', 'random'] | |||
| # for fit_method in fit_methods: | |||
| # print('\n-------------------------------------') | |||
| # print('fit method:', fit_method) | |||
| # parameters = {'ds_name': ds_name, | |||
| # 'gkernel': gkernel, | |||
| # 'edit_cost_name': 'NON_SYMBOLIC', | |||
| # 'ged_method': 'mIPFP', | |||
| # 'attr_distance': 'euclidean', | |||
| # 'fit_method': fit_method} | |||
| # xp_fit_method_for_non_symbolic(parameters, save_results=True, | |||
| # initial_solutions=1, | |||
| # Gn_data=[Gn, y_all, graph_dir], | |||
| # k_dis_data=[dis_mat, dis_max, dis_min, dis_mean], | |||
| # Kmatrix=Kmatrix) | |||
| # #### xp 5: Fingerprint, sspkernel, using LETTER2, only node attrs. | |||
| # # load dataset. | |||
| # print('getting dataset and computing kernel distance matrix first...') | |||
| # ds_name = 'Fingerprint' | |||
| # gkernel = 'structuralspkernel' | |||
| # Gn, y_all, graph_dir = get_dataset(ds_name) | |||
| # # remove graphs without nodes and edges. | |||
| # Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 0] | |||
| ## and nx.number_of_edges(G) != 0)] | |||
| # idx = [G[0] for G in Gn] | |||
| # Gn = [G[1] for G in Gn] | |||
| # y_all = [y_all[i] for i in idx] | |||
| # y_idx = get_same_item_indices(y_all) | |||
| # # remove unused labels. | |||
| # for G in Gn: | |||
| # G.graph['edge_attrs'] = [] | |||
| # for edge in G.edges: | |||
| # del G.edges[edge]['attributes'] | |||
| # del G.edges[edge]['orient'] | |||
| # del G.edges[edge]['angle'] | |||
| ## Gn = Gn[805:815] | |||
| ## y_all = y_all[805:815] | |||
| # for G in Gn: | |||
| # G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl' | |||
| # | |||
| # # compute/read Gram matrix and pair distances. | |||
| ## Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') | |||
| ## np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', | |||
| ## Kmatrix=Kmatrix) | |||
| # gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') | |||
| # Kmatrix = gmfile['Kmatrix'] | |||
| ## run_time = gmfile['run_time'] | |||
| ## Kmatrix = Kmatrix[[0,1,2,3,4],:] | |||
| ## Kmatrix = Kmatrix[:,[0,1,2,3,4]] | |||
| ## print('\nTime to compute Gram matrix for the whole dataset: ', run_time) | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, | |||
| # Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) | |||
| ## Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| ## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 | |||
| # | |||
| # # fitting and computing. | |||
| # fit_methods = ['k-graphs', 'random', 'random', 'random'] | |||
| # for fit_method in fit_methods: | |||
| # print('\n-------------------------------------') | |||
| # print('fit method:', fit_method) | |||
| # parameters = {'ds_name': ds_name, | |||
| # 'gkernel': gkernel, | |||
| # 'edit_cost_name': 'LETTER2', | |||
| # 'ged_method': 'mIPFP', | |||
| # 'attr_distance': 'euclidean', | |||
| # 'fit_method': fit_method, | |||
| # 'init_ecc': [1,1,1,1,1]} # [0.525, 0.525, 0.001, 0.125, 0.125]} | |||
| # xp_fit_method_for_non_symbolic(parameters, save_results=True, | |||
| # initial_solutions=40, | |||
| # Gn_data = [Gn, y_all, graph_dir], | |||
| # k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], | |||
| # Kmatrix=Kmatrix) | |||
| # #### xp 6: Letter-med, sspkernel. | |||
| # # load dataset. | |||
| # print('getting dataset and computing kernel distance matrix first...') | |||
| # ds_name = 'Letter-med' | |||
| # gkernel = 'structuralspkernel' | |||
| # Gn, y_all, graph_dir = get_dataset(ds_name) | |||
| ## Gn = Gn[0:50] | |||
| ## y_all = y_all[0:50] | |||
| # | |||
| # # compute/read Gram matrix and pair distances. | |||
| # Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') | |||
| # np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', | |||
| # Kmatrix=Kmatrix) | |||
| ## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') | |||
| ## Kmatrix = gmfile['Kmatrix'] | |||
| ## run_time = gmfile['run_time'] | |||
| ## Kmatrix = Kmatrix[[0,1,2,3,4],:] | |||
| ## Kmatrix = Kmatrix[:,[0,1,2,3,4]] | |||
| ## print('\nTime to compute Gram matrix for the whole dataset: ', run_time) | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, | |||
| # Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) | |||
| ## Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| ## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 | |||
| # | |||
| # # fitting and computing. | |||
| # fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random'] | |||
| # for fit_method in fit_methods: | |||
| # print('\n-------------------------------------') | |||
| # print('fit method:', fit_method) | |||
| # parameters = {'ds_name': ds_name, | |||
| # 'gkernel': gkernel, | |||
| # 'edit_cost_name': 'LETTER2', | |||
| # 'ged_method': 'mIPFP', | |||
| # 'attr_distance': 'euclidean', | |||
| # 'fit_method': fit_method, | |||
| # 'init_ecc': [0.525, 0.525, 0.75, 0.475, 0.475]} | |||
| # print('parameters: ', parameters) | |||
| # xp_fit_method_for_non_symbolic(parameters, save_results=True, | |||
| # initial_solutions=40, | |||
| # Gn_data = [Gn, y_all, graph_dir], | |||
| # k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], | |||
| # Kmatrix=Kmatrix) | |||
| # #### xp 7: Letter-low, sspkernel. | |||
| # # load dataset. | |||
| # print('getting dataset and computing kernel distance matrix first...') | |||
| # ds_name = 'Letter-low' | |||
| # gkernel = 'structuralspkernel' | |||
| # Gn, y_all, graph_dir = get_dataset(ds_name) | |||
| ## Gn = Gn[0:50] | |||
| ## y_all = y_all[0:50] | |||
| # | |||
| # # compute/read Gram matrix and pair distances. | |||
| # Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') | |||
| # np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', | |||
| # Kmatrix=Kmatrix) | |||
| ## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') | |||
| ## Kmatrix = gmfile['Kmatrix'] | |||
| ## run_time = gmfile['run_time'] | |||
| ## Kmatrix = Kmatrix[[0,1,2,3,4],:] | |||
| ## Kmatrix = Kmatrix[:,[0,1,2,3,4]] | |||
| ## print('\nTime to compute Gram matrix for the whole dataset: ', run_time) | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, | |||
| # Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) | |||
| ## Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| ## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 | |||
| # | |||
| # # fitting and computing. | |||
| # fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random'] | |||
| # for fit_method in fit_methods: | |||
| # print('\n-------------------------------------') | |||
| # print('fit method:', fit_method) | |||
| # parameters = {'ds_name': ds_name, | |||
| # 'gkernel': gkernel, | |||
| # 'edit_cost_name': 'LETTER2', | |||
| # 'ged_method': 'mIPFP', | |||
| # 'attr_distance': 'euclidean', | |||
| # 'fit_method': fit_method, | |||
| # 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075]} | |||
| # print('parameters: ', parameters) | |||
| # xp_fit_method_for_non_symbolic(parameters, save_results=True, | |||
| # initial_solutions=40, | |||
| # Gn_data = [Gn, y_all, graph_dir], | |||
| # k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], | |||
| # Kmatrix=Kmatrix) | |||
| # #### xp 8: Letter-med, spkernel. | |||
| # # load dataset. | |||
| # print('getting dataset and computing kernel distance matrix first...') | |||
| # ds_name = 'Letter-med' | |||
| # gkernel = 'spkernel' | |||
| # Gn, y_all, graph_dir = get_dataset(ds_name) | |||
| # # remove graphs without nodes and edges. | |||
| # Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0 | |||
| # and nx.number_of_edges(G) != 0)] | |||
| # idx = [G[0] for G in Gn] | |||
| # Gn = [G[1] for G in Gn] | |||
| # y_all = [y_all[i] for i in idx] | |||
| ## Gn = Gn[0:50] | |||
| ## y_all = y_all[0:50] | |||
| # | |||
| # # compute/read Gram matrix and pair distances. | |||
| # Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') | |||
| # np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', | |||
| # Kmatrix=Kmatrix) | |||
| ## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') | |||
| ## Kmatrix = gmfile['Kmatrix'] | |||
| ## run_time = gmfile['run_time'] | |||
| ## Kmatrix = Kmatrix[[0,1,2,3,4],:] | |||
| ## Kmatrix = Kmatrix[:,[0,1,2,3,4]] | |||
| ## print('\nTime to compute Gram matrix for the whole dataset: ', run_time) | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, | |||
| # Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) | |||
| ## Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| ## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 | |||
| # | |||
| # # fitting and computing. | |||
| # fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random'] | |||
| # for fit_method in fit_methods: | |||
| # print('\n-------------------------------------') | |||
| # print('fit method:', fit_method) | |||
| # parameters = {'ds_name': ds_name, | |||
| # 'gkernel': gkernel, | |||
| # 'edit_cost_name': 'LETTER2', | |||
| # 'ged_method': 'mIPFP', | |||
| # 'attr_distance': 'euclidean', | |||
| # 'fit_method': fit_method, | |||
| # 'init_ecc': [0.525, 0.525, 0.75, 0.475, 0.475]} | |||
| # print('parameters: ', parameters) | |||
| # xp_fit_method_for_non_symbolic(parameters, save_results=True, | |||
| # initial_solutions=40, | |||
| # Gn_data = [Gn, y_all, graph_dir], | |||
| # k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], | |||
| # Kmatrix=Kmatrix) | |||
| # #### xp 9: Letter-low, spkernel. | |||
| # # load dataset. | |||
| # print('getting dataset and computing kernel distance matrix first...') | |||
| # ds_name = 'Letter-low' | |||
| # gkernel = 'spkernel' | |||
| # Gn, y_all, graph_dir = get_dataset(ds_name) | |||
| # # remove graphs without nodes and edges. | |||
| # Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0 | |||
| # and nx.number_of_edges(G) != 0)] | |||
| # idx = [G[0] for G in Gn] | |||
| # Gn = [G[1] for G in Gn] | |||
| # y_all = [y_all[i] for i in idx] | |||
| ## Gn = Gn[0:50] | |||
| ## y_all = y_all[0:50] | |||
| # | |||
| # # compute/read Gram matrix and pair distances. | |||
| # Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') | |||
| # np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', | |||
| # Kmatrix=Kmatrix) | |||
| ## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') | |||
| ## Kmatrix = gmfile['Kmatrix'] | |||
| ## run_time = gmfile['run_time'] | |||
| ## Kmatrix = Kmatrix[[0,1,2,3,4],:] | |||
| ## Kmatrix = Kmatrix[:,[0,1,2,3,4]] | |||
| ## print('\nTime to compute Gram matrix for the whole dataset: ', run_time) | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, | |||
| # Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) | |||
| ## Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| ## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 | |||
| # | |||
| # # fitting and computing. | |||
| # fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random'] | |||
| # for fit_method in fit_methods: | |||
| # print('\n-------------------------------------') | |||
| # print('fit method:', fit_method) | |||
| # parameters = {'ds_name': ds_name, | |||
| # 'gkernel': gkernel, | |||
| # 'edit_cost_name': 'LETTER2', | |||
| # 'ged_method': 'mIPFP', | |||
| # 'attr_distance': 'euclidean', | |||
| # 'fit_method': fit_method, | |||
| # 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075]} | |||
| # print('parameters: ', parameters) | |||
| # xp_fit_method_for_non_symbolic(parameters, save_results=True, | |||
| # initial_solutions=40, | |||
| # Gn_data = [Gn, y_all, graph_dir], | |||
| # k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], | |||
| # Kmatrix=Kmatrix) | |||
#### xp 10: COIL-DEL, sspkernel, using LETTER2, only node attrs.
| # load dataset. | |||
| print('getting dataset and computing kernel distance matrix first...') | |||
| ds_name = 'COIL-DEL' | |||
| gkernel = 'structuralspkernel' | |||
| Gn, y_all, graph_dir = get_dataset(ds_name) | |||
# remove graphs without nodes (the edge filter below is commented out).
| Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 0] | |||
| # and nx.number_of_edges(G) != 0)] | |||
| idx = [G[0] for G in Gn] | |||
| Gn = [G[1] for G in Gn] | |||
| y_all = [y_all[i] for i in idx] | |||
| # remove unused labels. | |||
| for G in Gn: | |||
| G.graph['edge_labels'] = [] | |||
| for edge in G.edges: | |||
| del G.edges[edge]['bond_type'] | |||
| del G.edges[edge]['valence'] | |||
| # Gn = Gn[805:815] | |||
| # y_all = y_all[805:815] | |||
| for G in Gn: | |||
| G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl' | |||
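# (the .gxl filename is presumably how the GEDLIB-based backend locates each graph in graph_dir.)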
| # compute/read Gram matrix and pair distances. | |||
| is_separate = True | |||
Kmatrix, run_time, k_dis_data = compute_gm_for_each_class(Gn,
y_all,
gkernel,
ds_name,
parallel='imap_unordered',
is_separate=is_separate)
| # Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') | |||
| # np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', | |||
| # Kmatrix=Kmatrix) | |||
| # gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') | |||
| # Kmatrix = gmfile['Kmatrix'] | |||
| # run_time = gmfile['run_time'] | |||
| # Kmatrix = Kmatrix[[0,1,2,3,4],:] | |||
| # Kmatrix = Kmatrix[:,[0,1,2,3,4]] | |||
| # print('\nTime to compute Gram matrix for the whole dataset: ', run_time) | |||
| # dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, | |||
| # Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) | |||
| # Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| # dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 | |||
| # fitting and computing. | |||
| fit_methods = ['k-graphs', 'random', 'random', 'random'] | |||
| for fit_method in fit_methods: | |||
| print('\n-------------------------------------') | |||
| print('fit method:', fit_method) | |||
| parameters = {'ds_name': ds_name, | |||
| 'gkernel': gkernel, | |||
| 'edit_cost_name': 'LETTER2', | |||
| 'ged_method': 'mIPFP', | |||
| 'attr_distance': 'euclidean', | |||
| 'fit_method': fit_method, | |||
| 'init_ecc': [3,3,1,3,3]} # [0.525, 0.525, 0.001, 0.125, 0.125]} | |||
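# init_ecc above: the initial edit cost constants handed to the fitting step
# (assumed to be the five LETTER2 cost constants, in the order the fitting code expects).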
| xp_fit_method_for_non_symbolic(parameters, save_results=True, | |||
| initial_solutions=40, | |||
| Gn_data=[Gn, y_all, graph_dir], | |||
| k_dis_data=k_dis_data, | |||
| Kmatrix=Kmatrix, | |||
| is_separate=is_separate) | |||
| @@ -1,476 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Tue Jan 14 15:39:29 2020 | |||
| @author: ljia | |||
| """ | |||
import os
import numpy as np
| import random | |||
| import csv | |||
| from shutil import copyfile | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL | |||
| from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes | |||
| from gklearn.preimage.utils import get_same_item_indices, kernel_distance_matrix | |||
| from gklearn.preimage.find_best_k import getRelations | |||
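# Abbreviations used in the result files below (as understood from the pipeline):
# SOD = sum of distances, SM = set median, GM = generalized median,
# dis_k = distance in the graph-kernel feature space,
# 'min dis_k gi' = smallest kernel distance achieved by any input graph gi.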
| def xp_letter_h_LETTER2_cost(): | |||
| ds = {'dataset': 'cpp_ext/data/collections/Letter.xml', | |||
| 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) | |||
| dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, Kmatrix=None, gkernel='structuralspkernel') | |||
| for G in Gn: | |||
| reform_attributes(G) | |||
| # ds = {'name': 'Letter-high', | |||
| # 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb | |||
| # Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| gkernel = 'structuralspkernel' | |||
| node_label = None | |||
| edge_label = None | |||
| ds_name = 'letter-h' | |||
| dir_output = 'results/xp_letter_h/' | |||
| save_results = True | |||
| cost = 'LETTER2' | |||
| repeats = 1 | |||
| # k_list = range(2, 11) | |||
| k_list = [150] | |||
| fit_method = 'k-graphs' | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
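# e.g. get_same_item_indices(['A', 'B', 'A']) would give {'A': [0, 2], 'B': [1]}
# (a sketch of the assumed behavior: dataset indices grouped by target value).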
| if save_results: | |||
| # create result files. | |||
| fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', 'median set']) | |||
| f_detail.close() | |||
| fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', | |||
| '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', | |||
| 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', | |||
| 'repeats better dis_k gi -> GM']) | |||
| f_summary.close() | |||
| random.seed(1) | |||
| rdn_seed_list = random.sample(range(0, repeats * 100), repeats) | |||
| for k in k_list: | |||
| print('\n--------- k =', k, '----------') | |||
| sod_sm_mean_list = [] | |||
| sod_gm_mean_list = [] | |||
| dis_k_sm_mean_list = [] | |||
| dis_k_gm_mean_list = [] | |||
| dis_k_gi_min_mean_list = [] | |||
| # nb_sod_sm2gm = [0, 0, 0] | |||
| # nb_dis_k_sm2gm = [0, 0, 0] | |||
| # nb_dis_k_gi2sm = [0, 0, 0] | |||
| # nb_dis_k_gi2gm = [0, 0, 0] | |||
| # repeats_better_sod_sm2gm = [] | |||
| # repeats_better_dis_k_sm2gm = [] | |||
| # repeats_better_dis_k_gi2sm = [] | |||
| # repeats_better_dis_k_gi2gm = [] | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
| print('\ny =', y) | |||
| # y = 'F' | |||
| # values = y_idx[y] | |||
| # values = values[0:10] | |||
| k = len(values) | |||
| sod_sm_list = [] | |||
| sod_gm_list = [] | |||
| dis_k_sm_list = [] | |||
| dis_k_gm_list = [] | |||
| dis_k_gi_min_list = [] | |||
| nb_sod_sm2gm = [0, 0, 0] | |||
| nb_dis_k_sm2gm = [0, 0, 0] | |||
| nb_dis_k_gi2sm = [0, 0, 0] | |||
| nb_dis_k_gi2gm = [0, 0, 0] | |||
| repeats_better_sod_sm2gm = [] | |||
| repeats_better_dis_k_sm2gm = [] | |||
| repeats_better_dis_k_gi2sm = [] | |||
| repeats_better_dis_k_gi2gm = [] | |||
| for repeat in range(repeats): | |||
| print('\nrepeat =', repeat) | |||
| random.seed(rdn_seed_list[repeat]) | |||
| median_set_idx_idx = random.sample(range(0, len(values)), k) | |||
| median_set_idx = [values[idx] for idx in median_set_idx_idx] | |||
| print('median set: ', median_set_idx) | |||
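# since k was reset to len(values) above, this "sample" is a permutation of
# the whole class: every graph of the class enters the median set.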
| Gn_median = [Gn[g] for g in values] | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \ | |||
| = median_on_k_closest_graphs(Gn_median, node_label, edge_label, | |||
| gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'], | |||
| edit_costs=None, group_min=median_set_idx_idx, | |||
| dataset='Letter', cost=cost, parallel=False) | |||
| # write result detail. | |||
| sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) | |||
| dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) | |||
| dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) | |||
| dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) | |||
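# getRelations presumably maps the sign (-1/0/+1) to a textual relation
# (better / equal / worse) for the CSV columns below.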
| if save_results: | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, | |||
| y, repeat, | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, | |||
| dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, | |||
| dis_k_gi2gm, median_set_idx]) | |||
| f_detail.close() | |||
| # compute result summary. | |||
| sod_sm_list.append(sod_sm) | |||
| sod_gm_list.append(sod_gm) | |||
| dis_k_sm_list.append(dis_k_sm) | |||
| dis_k_gm_list.append(dis_k_gm) | |||
| dis_k_gi_min_list.append(dis_k_gi_min) | |||
| # # SOD SM -> GM | |||
| if sod_sm > sod_gm: | |||
| nb_sod_sm2gm[0] += 1 | |||
| repeats_better_sod_sm2gm.append(repeat) | |||
| elif sod_sm == sod_gm: | |||
| nb_sod_sm2gm[1] += 1 | |||
| elif sod_sm < sod_gm: | |||
| nb_sod_sm2gm[2] += 1 | |||
| # # dis_k SM -> GM | |||
| if dis_k_sm > dis_k_gm: | |||
| nb_dis_k_sm2gm[0] += 1 | |||
| repeats_better_dis_k_sm2gm.append(repeat) | |||
| elif dis_k_sm == dis_k_gm: | |||
| nb_dis_k_sm2gm[1] += 1 | |||
| elif dis_k_sm < dis_k_gm: | |||
| nb_dis_k_sm2gm[2] += 1 | |||
| # # dis_k gi -> SM | |||
| if dis_k_gi_min > dis_k_sm: | |||
| nb_dis_k_gi2sm[0] += 1 | |||
| repeats_better_dis_k_gi2sm.append(repeat) | |||
| elif dis_k_gi_min == dis_k_sm: | |||
| nb_dis_k_gi2sm[1] += 1 | |||
| elif dis_k_gi_min < dis_k_sm: | |||
| nb_dis_k_gi2sm[2] += 1 | |||
| # # dis_k gi -> GM | |||
| if dis_k_gi_min > dis_k_gm: | |||
| nb_dis_k_gi2gm[0] += 1 | |||
| repeats_better_dis_k_gi2gm.append(repeat) | |||
| elif dis_k_gi_min == dis_k_gm: | |||
| nb_dis_k_gi2gm[1] += 1 | |||
| elif dis_k_gi_min < dis_k_gm: | |||
| nb_dis_k_gi2gm[2] += 1 | |||
| # save median graphs. | |||
| fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' | |||
| fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| copyfile(fname_sm, fn_pre_sm_new + '.gxl') | |||
| fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' | |||
| fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| copyfile(fname_gm, fn_pre_gm_new + '.gxl') | |||
| G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() | |||
| reform_attributes(G_best_kernel) | |||
| fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter') | |||
| # plot median graphs. | |||
| set_median = loadGXL(fn_pre_sm_new + '.gxl') | |||
| gen_median = loadGXL(fn_pre_gm_new + '.gxl') | |||
| draw_Letter_graph(set_median, fn_pre_sm_new) | |||
| draw_Letter_graph(gen_median, fn_pre_gm_new) | |||
| draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel) | |||
| # write result summary for each letter. | |||
| sod_sm_mean_list.append(np.mean(sod_sm_list)) | |||
| sod_gm_mean_list.append(np.mean(sod_gm_list)) | |||
| dis_k_sm_mean_list.append(np.mean(dis_k_sm_list)) | |||
| dis_k_gm_mean_list.append(np.mean(dis_k_gm_list)) | |||
| dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list)) | |||
| sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1])) | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| if save_results: | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, | |||
| sod_sm_mean_list[-1], sod_gm_mean_list[-1], | |||
| dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], | |||
| dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, | |||
| nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, | |||
| repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, | |||
| repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) | |||
| f_summary.close() | |||
# write overall result summary across all letters.
sod_sm_mean = np.mean(sod_sm_mean_list)
sod_gm_mean = np.mean(sod_gm_mean_list)
dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
| sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean)) | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) | |||
| if save_results: | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all', | |||
| sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, | |||
| dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean]) | |||
| f_summary.close() | |||
| print('\ncomplete.') | |||
| def xp_letter_h(): | |||
| ds = {'dataset': 'cpp_ext/data/collections/Letter.xml', | |||
| 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) | |||
| for G in Gn: | |||
| reform_attributes(G) | |||
| # ds = {'name': 'Letter-high', | |||
| # 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb | |||
| # Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| gkernel = 'structuralspkernel' | |||
| node_label = None | |||
| edge_label = None | |||
| ds_name = 'letter-h' | |||
| dir_output = 'results/xp_letter_h/' | |||
| save_results = False | |||
| repeats = 1 | |||
| # k_list = range(2, 11) | |||
| k_list = [150] | |||
| fit_method = 'k-graphs' | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| if save_results: | |||
| # create result files. | |||
| fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', 'median set']) | |||
| f_detail.close() | |||
| fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', | |||
| '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', | |||
| 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', | |||
| 'repeats better dis_k gi -> GM']) | |||
| f_summary.close() | |||
| random.seed(1) | |||
| rdn_seed_list = random.sample(range(0, repeats * 100), repeats) | |||
| for k in k_list: | |||
| print('\n--------- k =', k, '----------') | |||
| sod_sm_mean_list = [] | |||
| sod_gm_mean_list = [] | |||
| dis_k_sm_mean_list = [] | |||
| dis_k_gm_mean_list = [] | |||
| dis_k_gi_min_mean_list = [] | |||
| # nb_sod_sm2gm = [0, 0, 0] | |||
| # nb_dis_k_sm2gm = [0, 0, 0] | |||
| # nb_dis_k_gi2sm = [0, 0, 0] | |||
| # nb_dis_k_gi2gm = [0, 0, 0] | |||
| # repeats_better_sod_sm2gm = [] | |||
| # repeats_better_dis_k_sm2gm = [] | |||
| # repeats_better_dis_k_gi2sm = [] | |||
| # repeats_better_dis_k_gi2gm = [] | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
| print('\ny =', y) | |||
| # y = 'N' | |||
| # values = y_idx[y] | |||
| # values = values[0:10] | |||
| k = len(values) | |||
| sod_sm_list = [] | |||
| sod_gm_list = [] | |||
| dis_k_sm_list = [] | |||
| dis_k_gm_list = [] | |||
| dis_k_gi_min_list = [] | |||
| nb_sod_sm2gm = [0, 0, 0] | |||
| nb_dis_k_sm2gm = [0, 0, 0] | |||
| nb_dis_k_gi2sm = [0, 0, 0] | |||
| nb_dis_k_gi2gm = [0, 0, 0] | |||
| repeats_better_sod_sm2gm = [] | |||
| repeats_better_dis_k_sm2gm = [] | |||
| repeats_better_dis_k_gi2sm = [] | |||
| repeats_better_dis_k_gi2gm = [] | |||
| for repeat in range(repeats): | |||
| print('\nrepeat =', repeat) | |||
| random.seed(rdn_seed_list[repeat]) | |||
| median_set_idx_idx = random.sample(range(0, len(values)), k) | |||
| median_set_idx = [values[idx] for idx in median_set_idx_idx] | |||
| print('median set: ', median_set_idx) | |||
| Gn_median = [Gn[g] for g in values] | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \ | |||
| = median_on_k_closest_graphs(Gn_median, node_label, edge_label, | |||
| gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'], | |||
| edit_costs=None, group_min=median_set_idx_idx, | |||
| dataset='Letter', parallel=False) | |||
| # write result detail. | |||
| sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) | |||
| dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) | |||
| dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) | |||
| dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) | |||
| if save_results: | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, | |||
| y, repeat, | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, | |||
| dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, | |||
| dis_k_gi2gm, median_set_idx]) | |||
| f_detail.close() | |||
| # compute result summary. | |||
| sod_sm_list.append(sod_sm) | |||
| sod_gm_list.append(sod_gm) | |||
| dis_k_sm_list.append(dis_k_sm) | |||
| dis_k_gm_list.append(dis_k_gm) | |||
| dis_k_gi_min_list.append(dis_k_gi_min) | |||
| # # SOD SM -> GM | |||
| if sod_sm > sod_gm: | |||
| nb_sod_sm2gm[0] += 1 | |||
| repeats_better_sod_sm2gm.append(repeat) | |||
| elif sod_sm == sod_gm: | |||
| nb_sod_sm2gm[1] += 1 | |||
| elif sod_sm < sod_gm: | |||
| nb_sod_sm2gm[2] += 1 | |||
| # # dis_k SM -> GM | |||
| if dis_k_sm > dis_k_gm: | |||
| nb_dis_k_sm2gm[0] += 1 | |||
| repeats_better_dis_k_sm2gm.append(repeat) | |||
| elif dis_k_sm == dis_k_gm: | |||
| nb_dis_k_sm2gm[1] += 1 | |||
| elif dis_k_sm < dis_k_gm: | |||
| nb_dis_k_sm2gm[2] += 1 | |||
| # # dis_k gi -> SM | |||
| if dis_k_gi_min > dis_k_sm: | |||
| nb_dis_k_gi2sm[0] += 1 | |||
| repeats_better_dis_k_gi2sm.append(repeat) | |||
| elif dis_k_gi_min == dis_k_sm: | |||
| nb_dis_k_gi2sm[1] += 1 | |||
| elif dis_k_gi_min < dis_k_sm: | |||
| nb_dis_k_gi2sm[2] += 1 | |||
| # # dis_k gi -> GM | |||
| if dis_k_gi_min > dis_k_gm: | |||
| nb_dis_k_gi2gm[0] += 1 | |||
| repeats_better_dis_k_gi2gm.append(repeat) | |||
| elif dis_k_gi_min == dis_k_gm: | |||
| nb_dis_k_gi2gm[1] += 1 | |||
| elif dis_k_gi_min < dis_k_gm: | |||
| nb_dis_k_gi2gm[2] += 1 | |||
| # save median graphs. | |||
| fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' | |||
| fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| copyfile(fname_sm, fn_pre_sm_new + '.gxl') | |||
| fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' | |||
| fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| copyfile(fname_gm, fn_pre_gm_new + '.gxl') | |||
| G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() | |||
| reform_attributes(G_best_kernel) | |||
| fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter') | |||
| # plot median graphs. | |||
| set_median = loadGXL(fn_pre_sm_new + '.gxl') | |||
| gen_median = loadGXL(fn_pre_gm_new + '.gxl') | |||
| draw_Letter_graph(set_median, fn_pre_sm_new) | |||
| draw_Letter_graph(gen_median, fn_pre_gm_new) | |||
| draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel) | |||
| # write result summary for each letter. | |||
| sod_sm_mean_list.append(np.mean(sod_sm_list)) | |||
| sod_gm_mean_list.append(np.mean(sod_gm_list)) | |||
| dis_k_sm_mean_list.append(np.mean(dis_k_sm_list)) | |||
| dis_k_gm_mean_list.append(np.mean(dis_k_gm_list)) | |||
| dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list)) | |||
| sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1])) | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| if save_results: | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, | |||
| sod_sm_mean_list[-1], sod_gm_mean_list[-1], | |||
| dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], | |||
| dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, | |||
| nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, | |||
| repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, | |||
| repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) | |||
| f_summary.close() | |||
# write overall result summary across all letters.
sod_sm_mean = np.mean(sod_sm_mean_list)
sod_gm_mean = np.mean(sod_gm_mean_list)
dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
| sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean)) | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) | |||
| if save_results: | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all', | |||
| sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, | |||
| dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean]) | |||
| f_summary.close() | |||
| print('\ncomplete.') | |||
# Draw the current median graph.
def draw_Letter_graph(graph, file_prefix):
	plt.figure()
	pos = {}
	for n in graph.nodes:
		pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
	nx.draw_networkx(graph, pos)
	plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
#	plt.show()
	plt.clf()
| if __name__ == "__main__": | |||
| # xp_letter_h() | |||
| xp_letter_h_LETTER2_cost() | |||
| @@ -1,249 +0,0 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Thu Jan 16 11:03:11 2020 | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| import random | |||
| import csv | |||
| from shutil import copyfile | |||
| import networkx as nx | |||
| import matplotlib.pyplot as plt | |||
| from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL | |||
| from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes | |||
| from gklearn.preimage.utils import get_same_item_indices | |||
| from gklearn.preimage.find_best_k import getRelations | |||
| def xp_monoterpenoides(): | |||
| import os | |||
| ds = {'dataset': '../../datasets/monoterpenoides/dataset_10+.ds', | |||
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'} # node/edge symb
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # ds = {'name': 'Letter-high', | |||
| # 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb | |||
| # Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| gkernel = 'treeletkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| ds_name = 'monoterpenoides' | |||
| dir_output = 'results/xp_monoterpenoides/' | |||
| repeats = 1 | |||
| # k_list = range(2, 11) | |||
| k_list = [0] | |||
| fit_method = 'k-graphs' | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| # create result files. | |||
| fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', 'median set']) | |||
| f_detail.close() | |||
| fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', | |||
| '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', | |||
| 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', | |||
| 'repeats better dis_k gi -> GM']) | |||
| f_summary.close() | |||
| random.seed(1) | |||
| rdn_seed_list = random.sample(range(0, repeats * 100), repeats) | |||
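# draw one seed per repeat up front so each repeat's median-set sampling is reproducible.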
| for k in k_list: | |||
| print('\n--------- k =', k, '----------') | |||
| sod_sm_mean_list = [] | |||
| sod_gm_mean_list = [] | |||
| dis_k_sm_mean_list = [] | |||
| dis_k_gm_mean_list = [] | |||
| dis_k_gi_min_mean_list = [] | |||
| # nb_sod_sm2gm = [0, 0, 0] | |||
| # nb_dis_k_sm2gm = [0, 0, 0] | |||
| # nb_dis_k_gi2sm = [0, 0, 0] | |||
| # nb_dis_k_gi2gm = [0, 0, 0] | |||
| # repeats_better_sod_sm2gm = [] | |||
| # repeats_better_dis_k_sm2gm = [] | |||
| # repeats_better_dis_k_gi2sm = [] | |||
| # repeats_better_dis_k_gi2gm = [] | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
| print('\ny =', y) | |||
| # y = 'I' | |||
| # values = y_idx[y] | |||
| k = len(values) | |||
| # k = kkk | |||
| sod_sm_list = [] | |||
| sod_gm_list = [] | |||
| dis_k_sm_list = [] | |||
| dis_k_gm_list = [] | |||
| dis_k_gi_min_list = [] | |||
| nb_sod_sm2gm = [0, 0, 0] | |||
| nb_dis_k_sm2gm = [0, 0, 0] | |||
| nb_dis_k_gi2sm = [0, 0, 0] | |||
| nb_dis_k_gi2gm = [0, 0, 0] | |||
| repeats_better_sod_sm2gm = [] | |||
| repeats_better_dis_k_sm2gm = [] | |||
| repeats_better_dis_k_gi2sm = [] | |||
| repeats_better_dis_k_gi2gm = [] | |||
| for repeat in range(repeats): | |||
| print('\nrepeat =', repeat) | |||
| random.seed(rdn_seed_list[repeat]) | |||
| median_set_idx_idx = random.sample(range(0, len(values)), k) | |||
| median_set_idx = [values[idx] for idx in median_set_idx_idx] | |||
| print('median set: ', median_set_idx) | |||
| Gn_median = [Gn[g] for g in values] | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \ | |||
| = median_on_k_closest_graphs(Gn_median, node_label, edge_label, | |||
| gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'], | |||
| edit_costs=None, group_min=median_set_idx_idx, | |||
| dataset=ds_name, parallel=False) | |||
| # write result detail. | |||
| sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) | |||
| dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) | |||
| dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) | |||
| dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, | |||
| y, repeat, | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, | |||
| dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, | |||
| dis_k_gi2gm, median_set_idx]) | |||
| f_detail.close() | |||
| # compute result summary. | |||
| sod_sm_list.append(sod_sm) | |||
| sod_gm_list.append(sod_gm) | |||
| dis_k_sm_list.append(dis_k_sm) | |||
| dis_k_gm_list.append(dis_k_gm) | |||
| dis_k_gi_min_list.append(dis_k_gi_min) | |||
| # # SOD SM -> GM | |||
| if sod_sm > sod_gm: | |||
| nb_sod_sm2gm[0] += 1 | |||
| repeats_better_sod_sm2gm.append(repeat) | |||
| elif sod_sm == sod_gm: | |||
| nb_sod_sm2gm[1] += 1 | |||
| elif sod_sm < sod_gm: | |||
| nb_sod_sm2gm[2] += 1 | |||
| # # dis_k SM -> GM | |||
| if dis_k_sm > dis_k_gm: | |||
| nb_dis_k_sm2gm[0] += 1 | |||
| repeats_better_dis_k_sm2gm.append(repeat) | |||
| elif dis_k_sm == dis_k_gm: | |||
| nb_dis_k_sm2gm[1] += 1 | |||
| elif dis_k_sm < dis_k_gm: | |||
| nb_dis_k_sm2gm[2] += 1 | |||
| # # dis_k gi -> SM | |||
| if dis_k_gi_min > dis_k_sm: | |||
| nb_dis_k_gi2sm[0] += 1 | |||
| repeats_better_dis_k_gi2sm.append(repeat) | |||
| elif dis_k_gi_min == dis_k_sm: | |||
| nb_dis_k_gi2sm[1] += 1 | |||
| elif dis_k_gi_min < dis_k_sm: | |||
| nb_dis_k_gi2sm[2] += 1 | |||
| # # dis_k gi -> GM | |||
| if dis_k_gi_min > dis_k_gm: | |||
| nb_dis_k_gi2gm[0] += 1 | |||
| repeats_better_dis_k_gi2gm.append(repeat) | |||
| elif dis_k_gi_min == dis_k_gm: | |||
| nb_dis_k_gi2gm[1] += 1 | |||
| elif dis_k_gi_min < dis_k_gm: | |||
| nb_dis_k_gi2gm[2] += 1 | |||
| # save median graphs. | |||
| fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' | |||
| fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat) | |||
| copyfile(fname_sm, fn_pre_sm_new + '.gxl') | |||
| fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' | |||
| fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat) | |||
| copyfile(fname_gm, fn_pre_gm_new + '.gxl') | |||
| G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() | |||
| # reform_attributes(G_best_kernel) | |||
| fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat) | |||
| saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib') | |||
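# note: method='gedlib' here (not 'gedlib-letter' as in the Letter scripts),
# since monoterpenoides graphs carry symbolic atom/bond labels rather than
# Letter-style (x, y) coordinates.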
| # # plot median graphs. | |||
| # set_median = loadGXL(fn_pre_sm_new + '.gxl') | |||
| # gen_median = loadGXL(fn_pre_gm_new + '.gxl') | |||
| # draw_Letter_graph(set_median, fn_pre_sm_new) | |||
| # draw_Letter_graph(gen_median, fn_pre_gm_new) | |||
| # draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel) | |||
| # write result summary for each letter. | |||
| sod_sm_mean_list.append(np.mean(sod_sm_list)) | |||
| sod_gm_mean_list.append(np.mean(sod_gm_list)) | |||
| dis_k_sm_mean_list.append(np.mean(dis_k_sm_list)) | |||
| dis_k_gm_mean_list.append(np.mean(dis_k_gm_list)) | |||
| dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list)) | |||
| sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1])) | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, | |||
| sod_sm_mean_list[-1], sod_gm_mean_list[-1], | |||
| dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], | |||
| dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, | |||
| nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, | |||
| repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, | |||
| repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) | |||
| f_summary.close() | |||
# write overall result summary across all classes.
sod_sm_mean = np.mean(sod_sm_mean_list)
sod_gm_mean = np.mean(sod_gm_mean_list)
dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
| sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean)) | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all', | |||
| sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, | |||
| dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean]) | |||
| f_summary.close() | |||
| print('\ncomplete.') | |||
# Draw the current median graph.
def draw_Letter_graph(graph, file_prefix):
	plt.figure()
	pos = {}
	for n in graph.nodes:
		pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
	nx.draw_networkx(graph, pos)
	plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
#	plt.show()
	plt.clf()
| if __name__ == "__main__": | |||
| xp_monoterpenoides() | |||