@@ -0,0 +1,312 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 6 15:35:32 2018

@author: ljia
"""
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.gridspec as gridspec
# import pickle
import os
import sys
from tqdm import tqdm
# Importing Axes3D registers the '3d' projection used below (required on
# older matplotlib versions; harmless on newer ones).
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

root_dir = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/ged/stability/outputs/'
root_dir_criann = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/ged/stability/outputs/CRIANN/'

Dataset_List = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
Legend_Labels = ['common walk', 'marginalized', 'Sylvester equation', 'conjugate gradient', 'fixed-point iterations', 'spectral decomposition', 'shortest path', 'structural sp', 'path up to length $h$', 'treelet', 'WL subtree']
# Colors = ['#084594', '#2171b5', '#4292c6', '#6baed6', '#9ecae1', '#c6dbef',
#           '#54278f', '#756bb1', '#9e9ac8', '#de2d26', '#fc9272']
Colors = [
    '#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a',
    '#d62728', '#ff9896', '#9467bd', '#c5b0d5', '#8c564b', '#c49c94',
    '#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7', '#bcbd22', '#dbdb8d',
    '#17becf', '#9edae5']

SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 12
def read_trials_group(save_dir, ds_name, num_sols, ratio, label):
    file_name = save_dir + 'groups/ged_mats.' + ds_name + '.' + label + '_' + str(num_sols) + '.ratio_' + "{:.2f}".format(ratio) + '.npy'
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            ged_mats = np.load(f)
        return ged_mats
    else:
        return []
#     ged_mats = []
#     for trial in range(1, 101):
#         file_name = file_prefix + '.trial_' + str(trial) + '.pkl'
#         if os.path.isfile(file_name):
#             ged_matrix = pickle.load(open(file_name, 'rb'))
#             ged_mats.append(ged_matrix)
#         else:
#             # print(trial)
#             pass
# Average relative error between corresponding elements of two GED matrices:
# the sum of absolute element-wise differences divided by the sum of the
# element-wise mean magnitudes.
def matrices_ave_relative_error(m1, m2):
    error = 0
    base = 0
    for i in range(m1.shape[0]):
        for j in range(m1.shape[1]):
            error += np.abs(m1[i, j] - m2[i, j])
            base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2
    return error / base
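
# A minimal worked example (illustrative only, not part of the experiment):
# >>> m1 = np.array([[0., 2.], [2., 0.]])
# >>> m2 = np.array([[0., 3.], [2., 0.]])
# >>> matrices_ave_relative_error(m1, m2)  # 1 / ((2 + 3) / 2 + (2 + 2) / 2)
# 0.2222222222222222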
def compute_relative_error(ged_mats):
    if len(ged_mats) != 0:
        # Use the element-wise minimum over all trials as the reference
        # ("pseudo-optimal") GED matrix: the methods compute upper bounds of
        # the exact GED, so the smallest observed value is the best estimate.
        ged_mat_s = np.ones(ged_mats[0].shape) * np.inf
        for i in range(ged_mats[0].shape[0]):
            for j in range(ged_mats[0].shape[1]):
                ged_mat_s[i, j] = np.min([mat[i, j] for mat in ged_mats])

        # Compute the average error of each trial against the reference.
        errors = []
        for i, mat in enumerate(ged_mats):
            err = matrices_ave_relative_error(mat, ged_mat_s)
            # if not per_correct:
            #     print('matrix # ', str(i))
            #     pass
            errors.append(err)
    else:
        errors = [0]

    return np.mean(errors)
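
# Illustrative example: two trials that agree everywhere except one entry.
# The element-wise minimum serves as the reference, so only the trial holding
# the larger value contributes a non-zero error, and the mean over both
# trials is 0.2222... / 2:
# >>> mats = np.array([[[0., 2.], [2., 0.]], [[0., 3.], [2., 0.]]])
# >>> compute_relative_error(mats)
# 0.1111111111111111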
# plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
plt.rc('axes', titlesize=15)     # fontsize of the axes title
plt.rc('axes', labelsize=15)     # fontsize of the x and y labels
plt.rc('xtick', labelsize=15)    # fontsize of the x tick labels
plt.rc('ytick', labelsize=15)    # fontsize of the y tick labels
plt.rc('legend', fontsize=15)    # legend fontsize
plt.rc('figure', titlesize=15)   # fontsize of the figure title

# fig, _ = plt.subplots(2, 2, figsize=(13, 12))
# ax1 = plt.subplot(221)
# ax2 = plt.subplot(222)
# ax3 = plt.subplot(223)
# ax4 = plt.subplot(224)
gs = gridspec.GridSpec(2, 2)
gs.update(hspace=0.3)
fig = plt.figure(figsize=(11, 12))
ax = fig.add_subplot(111)  # the big subplot, used only for common labels
ax1 = fig.add_subplot(gs[0, 0], projection='3d')
ax2 = fig.add_subplot(gs[0, 1], projection='3d')
ax3 = fig.add_subplot(gs[1, 0], projection='3d')
ax4 = fig.add_subplot(gs[1, 1], projection='3d')
# ax5 = fig.add_subplot(gs[2, 0])
# ax6 = fig.add_subplot(gs[2, 1])

# Turn off axis lines and ticks of the big subplot.
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_color('none')
ax.spines['left'].set_color('none')
ax.spines['right'].set_color('none')
ax.tick_params(labelcolor='w', top=False, bottom=False, left=False, right=False)
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

# Set common labels.
# ax.set_xlabel('accuracy(%)')
ax.yaxis.set_label_coords(-0.105, 0.5)
# ax.set_ylabel('runtime($s$)')
# -------------- num_sols, IPFP --------------

def get_num_sol_results():
    save_dir = root_dir_criann + 'edit_costs.num_sols.ratios.IPFP/'
    errors = {}

    print('-------- num_sols, IPFP --------')
    for ds_name in Dataset_List:
        print(ds_name)
        errors[ds_name] = []
        for num_sols in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
            errors[ds_name].append([])
            for ratio in tqdm([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], desc='num_sols = ' + str(num_sols), file=sys.stdout):
                ged_mats = read_trials_group(save_dir, ds_name, num_sols, ratio, 'num_sols')
                error = compute_relative_error(ged_mats)
                errors[ds_name][-1].append(error)

    return errors
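
# errors[ds_name] is an 11 x 19 nested list: one row per number of solutions,
# one column per ratio. np.array(errors[ds_name]).T therefore has shape
# (19, 11), matching the meshgrid built from 11 x-values and 19 y-values below.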
x_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
y_values = range(0, 19)
X, Y = np.meshgrid(x_values, y_values)

errors = get_num_sol_results()
for i, ds_name in enumerate(Dataset_List):
    if ds_name in errors:
        z_values = np.array(errors[ds_name])
        ax1.plot_wireframe(X, Y, z_values.T, label=Dataset_List[i], color=Colors[i])

# ax1.set_yscale('squareroot')
# ax1.grid(axis='y')
ax1.set_xlabel('# of solutions')
ax1.set_ylabel('ratios')
ax1.set_zlabel('average relative errors (%)')
ax1.set_title('(a) num_sols, IPFP')
ax1.set_yticks(range(0, 19, 2))
ax1.set_yticklabels([0.1, 0.3, 0.5, 0.7, 0.9, 2, 4, 6, 8, 10])
# ax1.set_axisbelow(True)
# ax1.spines['top'].set_visible(False)
# ax1.spines['bottom'].set_visible(False)
# ax1.spines['right'].set_visible(False)
# ax1.spines['left'].set_visible(False)
# ax1.xaxis.set_ticks_position('none')
# ax1.yaxis.set_ticks_position('none')
# ax1.set_ylim(bottom=-1000)
handles, labels = ax1.get_legend_handles_labels()
# -------------- repeats, IPFP --------------

def get_repeats_results():
    save_dir = root_dir_criann + 'edit_costs.repeats.ratios.IPFP/'
    errors = {}

    print('-------- repeats, IPFP --------')
    for ds_name in Dataset_List:
        print(ds_name)
        errors[ds_name] = []
        for repeats in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
            errors[ds_name].append([])
            for ratio in tqdm([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], desc='repeats = ' + str(repeats), file=sys.stdout):
                ged_mats = read_trials_group(save_dir, ds_name, repeats, ratio, 'repeats')
                error = compute_relative_error(ged_mats)
                errors[ds_name][-1].append(error)

    return errors

x_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
y_values = range(0, 19)
X, Y = np.meshgrid(x_values, y_values)

errors = get_repeats_results()
for i, ds_name in enumerate(Dataset_List):
    if ds_name in errors:
        z_values = np.array(errors[ds_name])
        ax2.plot_wireframe(X, Y, z_values.T, label=Dataset_List[i], color=Colors[i])

# ax2.set_yscale('squareroot')
# ax2.grid(axis='y')
ax2.set_xlabel('# of repeats')
ax2.set_ylabel('ratios')
ax2.set_zlabel('average relative errors (%)')
ax2.set_title('(b) repeats, IPFP')
ax2.set_yticks(range(0, 19, 2))
ax2.set_yticklabels([0.1, 0.3, 0.5, 0.7, 0.9, 2, 4, 6, 8, 10])
# ax2.set_axisbelow(True)
# ax2.spines['top'].set_visible(False)
# ax2.spines['bottom'].set_visible(False)
# ax2.spines['right'].set_visible(False)
# ax2.spines['left'].set_visible(False)
# ax2.xaxis.set_ticks_position('none')
# ax2.yaxis.set_ticks_position('none')
# ax2.set_ylim(bottom=-1000)
handles, labels = ax2.get_legend_handles_labels()
# The two commented-out sections below are kept from the graph-kernel runtime
# plots; they reference names (Graph_Kernel_List, Graph_Kernel_List_VSym) that
# are not defined in this script.

# # -------------- degrees --------------

# def get_degree_results():
#     save_dir = root_dir_criann + '28 cores/synthesized_graphs_degrees/'
#     run_times = {}
#     for kernel_name in Graph_Kernel_List:
#         run_times[kernel_name] = []
#         for num in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
#             file_name = save_dir + 'run_time.' + kernel_name + '.' + str(num) + '.pkl'
#             if os.path.isfile(file_name):
#                 run_time = pickle.load(open(file_name, 'rb'))
#             else:
#                 run_time = 0
#             run_times[kernel_name].append(run_time)
#     return run_times

# x_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# run_times = get_degree_results()
# for i, kernel_name in enumerate(Graph_Kernel_List):
#     if kernel_name in run_times:
#         ax3.plot(x_labels, run_times[kernel_name], '.-', label=Legend_Labels[i], color=Colors[i])
# ax3.set_yscale('log', nonposy='clip')
# ax3.grid(axis='y')
# ax3.set_xlabel('degrees')
# ax3.set_ylabel('runtime($s$)')
# # ax3.set_ylabel('runtime($s$) per pair of graphs')
# ax3.set_title('(c) degrees')
# ax3.set_axisbelow(True)
# ax3.spines['top'].set_visible(False)
# ax3.spines['bottom'].set_visible(False)
# ax3.spines['right'].set_visible(False)
# ax3.spines['left'].set_visible(False)
# ax3.xaxis.set_ticks_position('none')
# ax3.yaxis.set_ticks_position('none')

# # -------------- node labels --------------

# def get_node_label_results():
#     save_dir = root_dir_criann + '28 cores/synthesized_graphs_num_node_label_alphabet/'
#     run_times = {}
#     for kernel_name in Graph_Kernel_List_VSym:
#         run_times[kernel_name] = []
#         for num in [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]:
#             file_name = save_dir + 'run_time.' + kernel_name + '.' + str(num) + '.pkl'
#             if os.path.isfile(file_name):
#                 run_time = pickle.load(open(file_name, 'rb'))
#             else:
#                 run_time = 0
#             run_times[kernel_name].append(run_time)
#     return run_times

# # save_dir = root_dir_criann + 'synthesized_graphs_num_node_label_alphabet/'
# # run_times = pickle.load(open(save_dir + 'run_times.pkl', 'rb'))
# # return run_times

# x_labels = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
# run_times = get_node_label_results()
# for i, kernel_name in enumerate(Graph_Kernel_List):
#     if kernel_name in run_times:
#         ax4.plot(x_labels[1:], run_times[kernel_name][1:], '.-', label=Legend_Labels[i], color=Colors[i])
# ax4.set_yscale('log', nonposy='clip')
# ax4.grid(axis='y')
# ax4.set_xlabel('# of alphabets')
# ax4.set_ylabel('runtime($s$)')
# # ax4.set_ylabel('runtime($s$) per pair of graphs')
# ax4.set_title('(d) alphabet size of vertex labels')
# ax4.set_axisbelow(True)
# ax4.spines['top'].set_visible(False)
# ax4.spines['bottom'].set_visible(False)
# ax4.spines['right'].set_visible(False)
# ax4.spines['left'].set_visible(False)
# ax4.xaxis.set_ticks_position('none')
# ax4.yaxis.set_ticks_position('none')
from matplotlib.lines import Line2D

# 'labels' comes from the wireframe legends above (one entry per dataset), so
# build one proxy line per dataset rather than per color.
custom_lines = []
for color in Colors[:len(Dataset_List)]:
    custom_lines.append(Line2D([0], [0], color=color, lw=4))

fig.subplots_adjust(bottom=0.135)
fig.legend(custom_lines, labels, loc='lower center', ncol=4, frameon=False)  # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6)
plt.savefig('stability.real_data.relative_error.eps', format='eps', dpi=300, transparent=True,
            bbox_inches='tight')
plt.show()
@@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 2 16:17:01 2020

@author: ljia
"""
# This script tests the influence of the ratio between node and edge edit
# costs on the stability of GED computation, where the base edit costs are
# [1, 1, 1, 1, 1, 1]. The minimum solution over a given maximum number of
# solutions is computed.
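
# For illustration, with the cost construction used below (the first three
# constants are the node insertion/deletion/substitution costs, the last
# three the edge costs), ratio = 0.5 gives
#     edit_cost_constants = [0.5, 0.5, 0.5, 1, 1, 1],
# i.e. node operations cost half as much as edge operations.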
import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import numpy as np
import time
from utils import get_dataset
import sys
def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial):

    save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)

    """**1. Get dataset.**"""
    # The dataset is loaded once by the caller and passed in; it does not have
    # to be reloaded for each of the 100 trials.

    """**2. Set parameters.**"""
    # Parameters for GED computation.
    ged_options = {'method': 'BIPARTITE',  # use the BIPARTITE heuristic.
                   # 'initialization_method': 'RANDOM',  # or 'NODE', etc. (for GEDEnv)
                   'lsape_model': 'ECBP',
                   # The actual number of computed solutions may be smaller
                   # than the specified value.
                   'max_num_solutions': max_num_solutions,
                   'edit_cost': 'CONSTANT',  # use CONSTANT cost.
                   'greedy_method': 'BASIC',
                   # Distances between non-symbolic node/edge labels are
                   # computed by Euclidean distance.
                   'attr_distance': 'euclidean',
                   'optimal': True,  # if True, the option --greedy-method has no effect.
                   # Number of parallel threads; has no effect if parallel
                   # computation is disabled.
                   'threads': multiprocessing.cpu_count(),
                   'centrality_method': 'NONE',
                   'centrality_weight': 0.7,
                   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
                   }

    # Scale the three node edit costs by the ratio; keep the edge costs at 1.
    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
    # edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
    # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))

    options = ged_options.copy()
    options['edit_cost_constants'] = edit_cost_constants
    options['node_labels'] = dataset.node_labels
    options['edge_labels'] = dataset.edge_labels
    options['node_attrs'] = dataset.node_attrs
    options['edge_attrs'] = dataset.edge_attrs
    parallel = True

    """**5. Compute GED matrix.**"""
    ged_mat = 'error'
    runtime = 0
    try:
        time0 = time.time()
        ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True)
        runtime = time.time() - time0
    except Exception as exp:
        print('An exception occurred when running this experiment:')
        LOG_FILENAME = save_dir + 'error.txt'
        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
        logging.exception(save_file_suffix)
        print(repr(exp))

    """**6. Get results.**"""
    with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(ged_mat, f)
    with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtime, f)

    return ged_mat, runtime
def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio):
    ged_mats = []
    runtimes = []
    for trial in range(1, 101):
        print()
        print('Trial:', trial)
        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial)
        ged_mats.append(ged_mat)
        runtimes.append(runtime)

    save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio)
    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
        np.save(f, np.array(ged_mats))
    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtimes, f)  # save the whole list, not just the last runtime.
def results_for_a_dataset(ds_name):
    """**1. Get dataset.**"""
    dataset = get_dataset(ds_name)

    for max_num_solutions in [1, 20, 40, 60, 80, 100]:
        print()
        print('Max # of solutions:', max_num_solutions)
        for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]:
            print()
            print('Ratio:', ratio)
            save_trials_as_group(dataset, ds_name, max_num_solutions, ratio)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        ds_name_list = sys.argv[1:]
    else:
        ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

    save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(save_dir + 'groups/'):
        os.makedirs(save_dir + 'groups/')

    for ds_name in ds_name_list:
        print()
        print('Dataset:', ds_name)
        results_for_a_dataset(ds_name)
@@ -11,41 +11,16 @@ import os
 import multiprocessing
 import pickle
 import logging
-from gklearn.utils import Dataset
 from gklearn.ged.util import compute_geds
+import numpy as np
+import time
+from utils import get_dataset
+import sys
 
-def get_dataset(ds_name):
-    # The node/edge labels that will not be used in the computation.
-    if ds_name == 'MAO':
-        irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
-    elif ds_name == 'Monoterpenoides':
-        irrelevant_labels = {'edge_labels': ['valence']}
-    elif ds_name == 'MUTAG':
-        irrelevant_labels = {'edge_labels': ['label_0']}
-    elif ds_name == 'AIDS_symb':
-        irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']}
-
-    # Initialize a Dataset.
-    dataset = Dataset()
-    # Load predefined dataset.
-    dataset.load_predefined_dataset(ds_name)
-    # Remove irrelevant labels.
-    dataset.remove_labels(**irrelevant_labels)
-    print('dataset size:', len(dataset.graphs))
-    return dataset
-
-def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial):
-    save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/'
-    if not os.path.exists(save_dir):
-        os.makedirs(save_dir)
+def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
     save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
-
-    """**1. Get dataset.**"""
-    dataset = get_dataset(ds_name)
 
     """**2. Set parameters.**"""
@@ -77,31 +52,71 @@ def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial):
     """**5. Compute GED matrix.**"""
     ged_mat = 'error'
+    runtime = 0
     try:
+        time0 = time.time()
         ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True)
+        runtime = time.time() - time0
     except Exception as exp:
         print('An exception occurred when running this experiment:')
         LOG_FILENAME = save_dir + 'error.txt'
         logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
-        logging.exception('save_file_suffix')
+        logging.exception(save_file_suffix)
         print(repr(exp))
 
     """**6. Get results.**"""
-    pickle.dump(ged_mat, open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb'))
+    with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
+        pickle.dump(ged_mat, f)
+    with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
+        pickle.dump(runtime, f)
+
+    return ged_mat, runtime
+
+def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
+    ged_mats = []
+    runtimes = []
+    for trial in range(1, 101):
+        print()
+        print('Trial:', trial)
+        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial)
+        ged_mats.append(ged_mat)
+        runtimes.append(runtime)
+
+    save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio)
+    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
+        np.save(f, np.array(ged_mats))
+    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
+        pickle.dump(runtimes, f)
+
+def results_for_a_dataset(ds_name):
+    """**1. Get dataset.**"""
+    dataset = get_dataset(ds_name)
+
+    for num_solutions in [1, 20, 40, 60, 80, 100]:
+        print()
+        print('# of solutions:', num_solutions)
+        for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]:
+            print()
+            print('Ratio:', ratio)
+            save_trials_as_group(dataset, ds_name, num_solutions, ratio)
 
 if __name__ == '__main__':
-    for ds_name in ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']:
+    if len(sys.argv) > 1:
+        ds_name_list = sys.argv[1:]
+    else:
+        ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
+
+    save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/'
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    if not os.path.exists(save_dir + 'groups/'):
+        os.makedirs(save_dir + 'groups/')
+
+    for ds_name in ds_name_list:
         print()
         print('Dataset:', ds_name)
-        for num_solutions in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
-            print()
-            print('# of solutions:', num_solutions)
-            for ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
-                print()
-                print('Ratio:', ratio)
-                for trial in range(1, 101):
-                    print()
-                    print('Trial:', trial)
-                    xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial)
+        results_for_a_dataset(ds_name)
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 20 17:48:02 2020

@author: ljia
"""
# This script tests the influence of the ratio between node and edge edit
# costs on the stability of GED computation, where the base edit costs are
# [1, 1, 1, 1, 1, 1]. The minimum solution over a given number of repeats is
# computed.

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import numpy as np
import time
from utils import get_dataset
import sys
def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial):

    save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)

    """**1. Get dataset.**"""
    # The dataset is loaded once by the caller and passed in; it does not have
    # to be reloaded for each of the 100 trials.

    """**2. Set parameters.**"""
    # Parameters for GED computation.
    ged_options = {'method': 'IPFP',  # use the IPFP heuristic.
                   'initialization_method': 'RANDOM',  # or 'NODE', etc.
                   # When bigger than 1, the method is considered mIPFP.
                   'initial_solutions': 1,
                   'edit_cost': 'CONSTANT',  # use CONSTANT cost.
                   # Distances between non-symbolic node/edge labels are
                   # computed by Euclidean distance.
                   'attr_distance': 'euclidean',
                   'ratio_runs_from_initial_solutions': 1,
                   # Number of parallel threads; has no effect if parallel
                   # computation is disabled.
                   'threads': multiprocessing.cpu_count(),
                   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
                   }

    # Scale the three node edit costs by the ratio; keep the edge costs at 1.
    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
    # edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
    # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))

    options = ged_options.copy()
    options['edit_cost_constants'] = edit_cost_constants
    options['node_labels'] = dataset.node_labels
    options['edge_labels'] = dataset.edge_labels
    options['node_attrs'] = dataset.node_attrs
    options['edge_attrs'] = dataset.edge_attrs
    parallel = True

    """**5. Compute GED matrix.**"""
    ged_mat = 'error'
    runtime = 0
    try:
        time0 = time.time()
        ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=repeats, parallel=parallel, verbose=True)
        runtime = time.time() - time0
    except Exception as exp:
        print('An exception occurred when running this experiment:')
        LOG_FILENAME = save_dir + 'error.txt'
        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
        logging.exception(save_file_suffix)
        print(repr(exp))

    """**6. Get results.**"""
    with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(ged_mat, f)
    with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtime, f)

    return ged_mat, runtime
def save_trials_as_group(dataset, ds_name, repeats, ratio):
    ged_mats = []
    runtimes = []
    for trial in range(1, 101):
        print()
        print('Trial:', trial)
        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial)
        ged_mats.append(ged_mat)
        runtimes.append(runtime)

    save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio)
    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
        np.save(f, np.array(ged_mats))
    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtimes, f)  # save the whole list, not just the last runtime.
def results_for_a_dataset(ds_name):
    """**1. Get dataset.**"""
    dataset = get_dataset(ds_name)

    for repeats in [1, 20, 40, 60, 80, 100]:
        print()
        print('Repeats:', repeats)
        for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]:
            print()
            print('Ratio:', ratio)
            save_trials_as_group(dataset, ds_name, repeats, ratio)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        ds_name_list = sys.argv[1:]
    else:
        ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

    save_dir = 'outputs/edit_costs.repeats.ratios.IPFP/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(save_dir + 'groups/'):
        os.makedirs(save_dir + 'groups/')

    for ds_name in ds_name_list:
        print()
        print('Dataset:', ds_name)
        results_for_a_dataset(ds_name)
@@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 20 17:48:02 2020

@author: ljia
"""
# This script tests the influence of the ratio between node and edge edit
# costs on the stability of GED computation, where the base edit costs are
# [1, 1, 1, 1, 1, 1]. The minimum solution over a given number of repeats is
# computed.

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import numpy as np
import time
from utils import get_dataset
import sys
def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial):

    save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)

    """**1. Get dataset.**"""
    # The dataset is loaded once by the caller and passed in; it does not have
    # to be reloaded for each of the 100 trials.

    """**2. Set parameters.**"""
    # Parameters for GED computation.
    ged_options = {'method': 'BIPARTITE',  # use the BIPARTITE heuristic.
                   # 'initialization_method': 'RANDOM',  # or 'NODE', etc. (for GEDEnv)
                   'lsape_model': 'ECBP',
                   # The actual number of computed solutions may be smaller
                   # than the specified value.
                   'max_num_solutions': 1,
                   'edit_cost': 'CONSTANT',  # use CONSTANT cost.
                   'greedy_method': 'BASIC',
                   # Distances between non-symbolic node/edge labels are
                   # computed by Euclidean distance.
                   'attr_distance': 'euclidean',
                   'optimal': True,  # if True, the option --greedy-method has no effect.
                   # Number of parallel threads; has no effect if parallel
                   # computation is disabled.
                   'threads': multiprocessing.cpu_count(),
                   'centrality_method': 'NONE',
                   'centrality_weight': 0.7,
                   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
                   }

    # Scale the three node edit costs by the ratio; keep the edge costs at 1.
    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
    # edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
    # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))

    options = ged_options.copy()
    options['edit_cost_constants'] = edit_cost_constants
    options['node_labels'] = dataset.node_labels
    options['edge_labels'] = dataset.edge_labels
    options['node_attrs'] = dataset.node_attrs
    options['edge_attrs'] = dataset.edge_attrs
    parallel = True

    """**5. Compute GED matrix.**"""
    ged_mat = 'error'
    runtime = 0
    try:
        time0 = time.time()
        ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=repeats, parallel=parallel, verbose=True)
        runtime = time.time() - time0
    except Exception as exp:
        print('An exception occurred when running this experiment:')
        LOG_FILENAME = save_dir + 'error.txt'
        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
        logging.exception(save_file_suffix)
        print(repr(exp))

    """**6. Get results.**"""
    with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(ged_mat, f)
    with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtime, f)

    return ged_mat, runtime
def save_trials_as_group(dataset, ds_name, repeats, ratio):
    ged_mats = []
    runtimes = []
    for trial in range(1, 101):
        print()
        print('Trial:', trial)
        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial)
        ged_mats.append(ged_mat)
        runtimes.append(runtime)

    save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio)
    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
        np.save(f, np.array(ged_mats))
    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtimes, f)  # save the whole list, not just the last runtime.
def results_for_a_dataset(ds_name):
    """**1. Get dataset.**"""
    dataset = get_dataset(ds_name)

    for repeats in [1, 20, 40, 60, 80, 100]:
        print()
        print('Repeats:', repeats)
        for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]:
            print()
            print('Ratio:', ratio)
            save_trials_as_group(dataset, ds_name, repeats, ratio)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        ds_name_list = sys.argv[1:]
    else:
        ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

    save_dir = 'outputs/edit_costs.repeats.ratios.bipartite/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(save_dir + 'groups/'):
        os.makedirs(save_dir + 'groups/')

    for ds_name in ds_name_list:
        print()
        print('Dataset:', ds_name)
        results_for_a_dataset(ds_name)
@@ -0,0 +1,108 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 17:26:43 2020

@author: ljia

This script groups per-trial results into a single file for the sake of
faster searching and loading.
"""
import os
import pickle
import numpy as np
from shutil import copyfile
from tqdm import tqdm
import sys
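
# File-name convention assumed below (matching the experiment scripts):
# per-trial results such as
#     ged_matrix.MAO.num_sols_40.ratio_3.00.trial_7.pkl
# are grouped into a single file such as
#     groups/ged_mats.MAO.num_sols_40.ratio_3.00.npy
# runtime.*.trial_*.pkl files are grouped into groups/runtimes.*.pkl likewise.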
def group_trials(dir_folder, name_prefix, override, clear, backup):
    # Get the group name.
    label_name = name_prefix.split('.')[0]
    if label_name == 'ged_matrix':
        group_label = 'ged_mats'
    elif label_name == 'runtime':
        group_label = 'runtimes'
    else:
        group_label = label_name
    # name_suffix keeps the trailing '.' of name_prefix, so only the file
    # extension has to be appended.
    name_suffix = name_prefix[len(label_name):]
    if label_name == 'ged_matrix':
        name_group = dir_folder + 'groups/' + group_label + name_suffix + 'npy'
    else:
        name_group = dir_folder + 'groups/' + group_label + name_suffix + 'pkl'

    if not override and os.path.isfile(name_group):
        # The group file already exists; only check that all trial files exist.
        trials_complete = True
        for trial in range(1, 101):
            file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
            if not os.path.isfile(file_name):
                trials_complete = False
                break
    else:
        # Get data.
        data_group = []
        for trial in range(1, 101):
            file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
            if os.path.isfile(file_name):
                with open(file_name, 'rb') as f:
                    data = pickle.load(f)
                    data_group.append(data)
            else:  # Not all trials are completed; do not write a group file.
                return

        # Write the group file.
        if label_name == 'ged_matrix':
            data_group = np.array(data_group)
            with open(name_group, 'wb') as f:
                np.save(f, data_group)
        else:
            with open(name_group, 'wb') as f:
                pickle.dump(data_group, f)

        trials_complete = True

    if trials_complete:
        # Back up the per-trial files.
        if backup:
            for trial in range(1, 101):
                src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
                dst = dir_folder + 'backups/' + name_prefix + 'trial_' + str(trial) + '.pkl'
                copyfile(src, dst)
        # Remove the per-trial files.
        if clear:
            for trial in range(1, 101):
                src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
                os.remove(src)
def group_all_in_folder(dir_folder, override=False, clear=True, backup=True):
    # Create folders.
    if not os.path.exists(dir_folder + 'groups/'):
        os.makedirs(dir_folder + 'groups/')
    if backup:
        if not os.path.exists(dir_folder + 'backups/'):
            os.makedirs(dir_folder + 'backups/')

    # Iterate over all files; group each block of trials once, when its
    # prefix is first seen in the sorted listing.
    cur_file_prefix = ''
    for file in tqdm(sorted(os.listdir(dir_folder)), desc='Grouping', file=sys.stdout):
        if os.path.isfile(os.path.join(dir_folder, file)):
            name_prefix = file.split('trial_')[0]
            # print(name)
            # print(name_prefix)
            if name_prefix != cur_file_prefix:
                group_trials(dir_folder, name_prefix, override, clear, backup)
                cur_file_prefix = name_prefix

if __name__ == '__main__':
    dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/'
    group_all_in_folder(dir_folder)

    dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/'
    group_all_in_folder(dir_folder)
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 19:17:36 2020

@author: ljia
"""
from gklearn.utils import Dataset

def get_dataset(ds_name):
    # The node/edge labels that will not be used in the computation.
    if ds_name == 'MAO':
        irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
    elif ds_name == 'Monoterpenoides':
        irrelevant_labels = {'edge_labels': ['valence']}
    elif ds_name == 'MUTAG':
        irrelevant_labels = {'edge_labels': ['label_0']}
    elif ds_name == 'AIDS_symb':
        irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']}
        ds_name = 'AIDS'  # the predefined dataset is registered under 'AIDS'.
    else:
        raise ValueError('Unknown dataset name: ' + ds_name)

    # Initialize a Dataset.
    dataset = Dataset()
    # Load the predefined dataset.
    dataset.load_predefined_dataset(ds_name)
    # Remove irrelevant labels.
    dataset.remove_labels(**irrelevant_labels)
    print('dataset size:', len(dataset.graphs))
    return dataset
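
# Example usage (illustrative; the printed size assumes the standard
# predefined copy of the dataset, in which MAO has 68 graphs):
# >>> dataset = get_dataset('MAO')
# dataset size: 68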