| @@ -0,0 +1,172 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Thu Jan 9 11:54:32 2020 | |||||
| @author: ljia | |||||
| """ | |||||
| import numpy as np | |||||
| import random | |||||
| import csv | |||||
| import sys | |||||
| sys.path.insert(0, "../") | |||||
| from pygraph.utils.graphfiles import loadDataset | |||||
| from preimage.test_k_closest_graphs import median_on_k_closest_graphs | |||||
def _tally_relation(reference, candidate, counts, better_repeats, repeat):
    """Update the ``[better, same, worse]`` counters for one repeat.

    ``counts[0]`` is incremented (and ``repeat`` is recorded in
    ``better_repeats``) when ``candidate`` is strictly smaller than
    ``reference``; ``counts[1]`` when both are equal; ``counts[2]`` when
    ``candidate`` is larger. Nothing is counted when the values are not
    comparable (e.g. NaN).
    """
    if reference > candidate:
        counts[0] += 1
        better_repeats.append(repeat)
    elif reference == candidate:
        counts[1] += 1
    elif reference < candidate:
        counts[2] += 1


def find_best_k():
    """Compare set median and generalized median for several median-set sizes.

    For every ``k`` in ``k_list`` and every repeat, ``k`` graphs are drawn at
    random (with a reproducible seed) from the monoterpenoides dataset,
    ``median_on_k_closest_graphs`` is run on them, and the results are
    appended to two CSV files in ``dir_output``: one row per repeat in the
    detail file and one row per ``k`` in the summary file.
    """
    import os  # local import: this module does not import os at file level

    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])

    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    ds_name = 'mono'
    dir_output = 'results/test_find_best_k/'
    repeats = 50
    k_list = range(2, 11)
    fit_method = 'k-graphs'
    # fitted on the whole dataset - treelet - mono
    # NOTE(review): median_on_k_closest_graphs only reads `edit_costs` when
    # fit_method == 'precomputed'; with 'k-graphs' these values look unused.
    edit_costs = [0.1268873773592978, 0.004084633224249829, 0.0897581955378986, 0.15328856114451297, 0.3109956881625734, 0.0]

    # The result files are opened in append mode, so the directory must exist.
    os.makedirs(dir_output, exist_ok=True)

    # create result files.
    fn_output_detail = 'results_detail.' + fit_method + '.csv'
    # newline='' is what the csv module expects for files it writes to.
    with open(dir_output + fn_output_detail, 'a', newline='') as f_detail:
        csv.writer(f_detail).writerow([
            'dataset', 'graph kernel', 'fit method', 'k',
            'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
            'dis_k gi -> GM'])

    fn_output_summary = 'results_summary.csv'
    with open(dir_output + fn_output_summary, 'a', newline='') as f_summary:
        csv.writer(f_summary).writerow([
            'dataset', 'graph kernel', 'fit method', 'k',
            'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
            'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
            '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
            'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
            'repeats better dis_k gi -> GM'])

    # One seed per repeat so that the same median sets are drawn for every k.
    random.seed(1)
    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)

    for k in k_list:
        print('\n--------- k =', k, '----------')

        sod_sm_list = []
        sod_gm_list = []
        dis_k_sm_list = []
        dis_k_gm_list = []
        dis_k_gi_min_list = []
        # Each counter is [better, same, worse].
        nb_sod_sm2gm = [0, 0, 0]
        nb_dis_k_sm2gm = [0, 0, 0]
        nb_dis_k_gi2sm = [0, 0, 0]
        nb_dis_k_gi2gm = [0, 0, 0]
        repeats_better_sod_sm2gm = []
        repeats_better_dis_k_sm2gm = []
        repeats_better_dis_k_gi2sm = []
        repeats_better_dis_k_gi2gm = []

        for repeat in range(repeats):
            print('\nrepeat =', repeat)
            random.seed(rdn_seed_list[repeat])
            median_set_idx = random.sample(range(0, len(Gn)), k)
            print('median set: ', median_set_idx)

            # The callee returns seven values; the list of per-graph distances
            # and the index of the closest graph are not needed here.
            sod_sm, sod_gm, dis_k_sm, dis_k_gm, _, dis_k_gi_min, _ \
                = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
                                             fit_method=fit_method,
                                             edit_costs=edit_costs,
                                             group_min=median_set_idx,
                                             parallel=False)

            # write result detail.
            sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
            dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
            dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
            dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
            with open(dir_output + fn_output_detail, 'a', newline='') as f_detail:
                csv.writer(f_detail).writerow([
                    ds_name, gkernel, fit_method, k, repeat,
                    median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm,
                    dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
                    dis_k_gi2gm])

            # compute result summary.
            sod_sm_list.append(sod_sm)
            sod_gm_list.append(sod_gm)
            dis_k_sm_list.append(dis_k_sm)
            dis_k_gm_list.append(dis_k_gm)
            dis_k_gi_min_list.append(dis_k_gi_min)
            # # SOD SM -> GM
            _tally_relation(sod_sm, sod_gm, nb_sod_sm2gm,
                            repeats_better_sod_sm2gm, repeat)
            # # dis_k SM -> GM
            _tally_relation(dis_k_sm, dis_k_gm, nb_dis_k_sm2gm,
                            repeats_better_dis_k_sm2gm, repeat)
            # # dis_k gi -> SM
            _tally_relation(dis_k_gi_min, dis_k_sm, nb_dis_k_gi2sm,
                            repeats_better_dis_k_gi2sm, repeat)
            # # dis_k gi -> GM
            _tally_relation(dis_k_gi_min, dis_k_gm, nb_dis_k_gi2gm,
                            repeats_better_dis_k_gi2gm, repeat)

        # write result summary.
        sod_sm_mean = np.mean(sod_sm_list)
        sod_gm_mean = np.mean(sod_gm_list)
        dis_k_sm_mean = np.mean(dis_k_sm_list)
        dis_k_gm_mean = np.mean(dis_k_gm_list)
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        with open(dir_output + fn_output_summary, 'a', newline='') as f_summary:
            csv.writer(f_summary).writerow([
                ds_name, gkernel, fit_method, k,
                sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
                dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
                dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
                nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
                repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
                repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])

    print('\ncomplete.')
    return
def getRelations(sign):
    """Translate a sign value into a textual relation.

    Returns 'better' for -1, 'same' for 0 and 'worse' for 1. Any other
    value (NaN included) matches nothing and yields None.
    """
    relation_by_sign = ((-1, 'better'), (0, 'same'), (1, 'worse'))
    for expected, relation in relation_by_sign:
        if sign == expected:
            return relation
    return None
# Run the experiment only when this file is executed as a script, not when
# it is imported (getRelations is imported from here by another script).
if __name__ == '__main__':
    find_best_k()
| @@ -0,0 +1,336 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Mon Dec 16 11:53:54 2019 | |||||
| @author: ljia | |||||
| """ | |||||
| import numpy as np | |||||
| import math | |||||
| import networkx as nx | |||||
| import matplotlib.pyplot as plt | |||||
| import time | |||||
| import random | |||||
| from tqdm import tqdm | |||||
| from itertools import combinations, islice | |||||
| import multiprocessing | |||||
| from multiprocessing import Pool | |||||
| from functools import partial | |||||
| #import os | |||||
| import sys | |||||
| sys.path.insert(0, "../") | |||||
| from pygraph.utils.graphfiles import loadDataset, loadGXL | |||||
| #from pygraph.utils.logger2file import * | |||||
| from iam import iam_upgraded, iam_bash | |||||
| from utils import compute_kernel, dis_gstar, kernel_distance_matrix | |||||
| from fitDistance import fit_GED_to_kernel_distance | |||||
| #from ged import ged_median | |||||
def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method,
                               graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/',
                               edit_costs=None, group_min=None, dataset='monoterpenoides',
                               parallel=True):
    """Compute set and generalized medians of a group of graphs and compare them.

    Parameters
    ----------
    Gn : list of graphs; each must provide ``copy()`` and ``graph['filename']``.
    node_label, edge_label : labels forwarded to the edit-cost fitting and to
        the k-closest-graphs selection.
    gkernel : name of the graph kernel.
    k : size of the group; only used when ``group_min`` is None.
    fit_method : how the edit costs are obtained, one of 'random', 'expert',
        'k-graphs', 'whole-dataset' or 'precomputed'.
    graph_dir, dataset : forwarded to ``iam_bash``.
    edit_costs : edit costs used as-is when ``fit_method == 'precomputed'``.
    group_min : indices into ``Gn`` of the graphs forming the median set. When
        None, the ``k`` graphs with the smallest pairwise kernel-distance sum
        are selected with ``get_closest_k_graphs``.
    parallel : forwarded to ``get_closest_k_graphs`` when ``group_min`` is None.

    Returns
    -------
    tuple ``(sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx)``:
    the two SOD values reported by ``iam_bash``, the kernel-space distances of
    the set median and of the generalized median to the group, the list of such
    distances for every graph of the group, the smallest of them, and the index
    (into ``Gn``) of the graph achieving it.

    Raises
    ------
    ValueError : if ``fit_method`` is not one of the supported names.
    """
    known_fit_methods = ('random', 'expert', 'k-graphs', 'whole-dataset', 'precomputed')
    if fit_method not in known_fit_methods:
        # Previously an unknown name left the edit costs unbound and failed
        # much later with an unrelated error.
        raise ValueError('unknown fit_method %r, expected one of %s'
                         % (fit_method, ', '.join(known_fit_methods)))

    if group_min is None:
        # No explicit median set: choose the k closest graphs in kernel space.
        dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
                                                  Kmatrix=None, gkernel=gkernel)
        time0 = time.time()
        sod_ks_min, group_min = get_closest_k_graphs(dis_mat, k, parallel)
        time_spent = time.time() - time0
        print('closest graphs:', sod_ks_min, group_min)
        print('time spent:', time_spent)

    Gn_median = [Gn[g].copy() for g in group_min]

    # fit edit costs.
    if fit_method == 'random':  # random
        edit_cost_constant = random.sample(range(1, 10), 6)
        print('edit costs used:', edit_cost_constant)
    elif fit_method == 'expert':  # expert
        edit_cost_constant = [3, 3, 1, 3, 3, 1]
    elif fit_method in ('k-graphs', 'whole-dataset'):
        itr_max = 6
        algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
                      'algo_options': algo_options, 'stabilizer': None}
        # fit on the k-graph subset or on the whole dataset.
        graphs_to_fit = Gn_median if fit_method == 'k-graphs' else Gn
        edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(
            graphs_to_fit, node_label, edge_label, gkernel, itr_max,
            params_ged=params_ged, parallel=True)
    else:  # 'precomputed'
        edit_cost_constant = edit_costs

    # compute set median and gen median using IAM (C++ through bash).
    group_fnames = [Gn[g].graph['filename'] for g in group_min]
    sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant,
                                                  graph_dir=graph_dir, dataset=dataset)

    # compute distances in kernel space.
    # Fresh copies are taken again, as in the original code.
    # NOTE(review): presumably so that changes made during fitting do not leak
    # into the kernel computation - confirm.
    Gn_median = [Gn[g].copy() for g in group_min]
    set_median = loadGXL(fname_sm)
    gen_median = loadGXL(fname_gm)
    is_letter = dataset == 'Letter'
    if is_letter:
        for g in Gn_median:
            reform_attributes(g)
        reform_attributes(set_median)
        reform_attributes(gen_median)

    # NOTE(review): the labels given to compute_kernel are hard-coded
    # ('chem' / 'valence') rather than node_label / edge_label - confirm.
    kernel_node_label = None if is_letter else 'chem'
    kernel_edge_label = None if is_letter else 'valence'
    nb_median = len(Gn_median)
    idx_range = range(1, 1 + nb_median)
    weights = [1 / nb_median] * nb_median  # uniform weights over the group

    # compute distance in kernel space for set median.
    Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel,
                                kernel_node_label, kernel_edge_label, False)
    dis_k_sm = dis_gstar(0, idx_range, weights, Kmatrix_sm, withterm3=False)

    # compute distance in kernel space for generalized median.
    Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel,
                                kernel_node_label, kernel_edge_label, False)
    dis_k_gm = dis_gstar(0, idx_range, weights, Kmatrix_gm, withterm3=False)

    # compute distance in kernel space for each graph in median set.
    # Row/column 0 of Kmatrix_gm is the generalized median, so graph `idx`
    # of the group sits at position idx + 1.
    dis_k_gi = [dis_gstar(idx + 1, idx_range, weights, Kmatrix_gm, withterm3=False)
                for idx in range(nb_median)]

    print('sod_sm:', sod_sm)
    print('sod_gm:', sod_gm)
    print('dis_k_sm:', dis_k_sm)
    print('dis_k_gm:', dis_k_gm)
    print('dis_k_gi:', dis_k_gi)
    idx_dis_k_gi_min = np.argmin(dis_k_gi)
    dis_k_gi_min = dis_k_gi[idx_dis_k_gi_min]
    print('index min dis_k_gi:', group_min[idx_dis_k_gi_min])
    print('min dis_k_gi:', dis_k_gi_min)

    return sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, group_min[idx_dis_k_gi_min]
def reform_attributes(G):
    """Add an 'attributes' entry ``[x, y]`` to every node of ``G``, in place.

    The values are taken unchanged from each node's 'x' and 'y' entries.
    """
    for node_id in G.nodes:
        node_data = G.nodes[node_id]
        coordinates = [node_data['x'], node_data['y']]
        node_data['attributes'] = coordinates
def _init_worker_dis_mat(dis_mat_toshare):
    """Pool initializer: publish the distance matrix as a global of the worker."""
    global G_dis_mat
    G_dis_mat = dis_mat_toshare


def get_closest_k_graphs(dis_mat, k, parallel):
    """Find the group of ``k`` graphs with the smallest sum of pairwise distances.

    Every combination of ``k`` indices of ``dis_mat`` is examined.

    Parameters
    ----------
    dis_mat : square matrix indexable as ``dis_mat[i, j]``.
    k : size of the groups to examine.
    parallel : when true, combinations are evaluated by a process pool, in
        slices of at most 1e7 combinations; otherwise in a plain loop.

    Returns
    -------
    ``(sod_ks_min, group_min)``: the smallest sum found and the tuple of
    indices achieving it, or ``(np.inf, None)`` when no combination exists
    (for instance when ``k`` exceeds the number of graphs).
    """
    k_graph_groups = combinations(range(0, len(dis_mat)), k)
    sod_ks_min = np.inf
    group_min = None  # stays None when there is nothing to compare
    if parallel:
        len_combination = get_combination_length(len(dis_mat), k)
        len_itr_max = int(len_combination if len_combination < 1e7 else 1e7)
        n_jobs = multiprocessing.cpu_count()
        chunksize = int(len_itr_max / n_jobs + 1)
        # A zero slice length would be an invalid step for split_iterable.
        if len_itr_max > 0:
            graph_groups_slices = split_iterable(k_graph_groups, len_itr_max,
                                                 len_combination)
        else:
            graph_groups_slices = ()
        for graph_groups_cur in graph_groups_slices:
            graph_groups_cur_list = list(graph_groups_cur)
            if not graph_groups_cur_list:
                continue  # exhausted iterator, nothing left in this slice
            print('current position:', graph_groups_cur_list[0])
            len_itr_cur = len(graph_groups_cur_list)

            # Each work item carries its position so that results arriving
            # out of order can be stored at the right place.
            itr = zip(graph_groups_cur_list, range(0, len_itr_cur))
            sod_k_list = np.empty(len_itr_cur)
            graphs_list = [None] * len_itr_cur
            # The initializer is a module-level function so that it can be
            # pickled whatever the process start method is.
            pool = Pool(processes=n_jobs, initializer=_init_worker_dis_mat,
                        initargs=(dis_mat,))
            try:
                iterator = pool.imap_unordered(_get_closest_k_graphs_parallel,
                                               itr, chunksize)
                for graphs, i, sod_ks in iterator:
                    sod_k_list[i] = sod_ks
                    graphs_list[i] = graphs
            finally:
                pool.close()
                pool.join()

            arg_min = np.argmin(sod_k_list)
            sod_ks_cur = sod_k_list[arg_min]
            group_cur = graphs_list[arg_min]
            if sod_ks_cur < sod_ks_min:
                sod_ks_min = sod_ks_cur
                group_min = group_cur
                print('get closer graphs:', sod_ks_min, group_min)
    else:
        for items in tqdm(k_graph_groups, desc='Choosing k closest graphs', file=sys.stdout):
            k_graph_pairs = combinations(items, 2)
            sod_ks = 0
            for i1, i2 in k_graph_pairs:
                sod_ks += dis_mat[i1, i2]
            if sod_ks < sod_ks_min:
                sod_ks_min = sod_ks
                group_min = items
                print('get closer graphs:', sod_ks_min, group_min)

    return sod_ks_min, group_min
def _get_closest_k_graphs_parallel(itr):
    """Worker task: sum the pairwise distances inside one group of graphs.

    ``itr`` is a pair ``(group, position)``. Distances are read from the
    global ``G_dis_mat`` set by the pool initializer. Returns
    ``(group, position, sum_of_distances)``.
    """
    group, position = itr[0], itr[1]
    pairwise_total = sum(G_dis_mat[first, second]
                         for first, second in combinations(group, 2))
    return group, position, pairwise_total
def split_iterable(iterable, n, len_iter):
    """Lazily cut ``iterable`` into consecutive pieces of at most ``n`` items.

    One ``islice`` is yielded per step of ``range(0, len_iter, n)``. All
    pieces draw from the same underlying iterator, so each one has to be
    consumed before moving on to the next.
    """
    shared_source = iter(iterable)
    piece_starts = range(0, len_iter, n)
    for _ in piece_starts:
        yield islice(shared_source, n)
def get_combination_length(n, k):
    """Return the number of ways to choose ``k`` items out of ``n``.

    The result is computed with integer arithmetic only, so it stays exact
    for values too large for a float. The product of ``k`` consecutive
    integers is always divisible by ``k!``, so the floor division loses
    nothing.
    """
    len_combination = 1
    for i in range(n, n - k, -1):
        len_combination *= i
    return len_combination // math.factorial(k)
| ############################################################################### | |||||
def test_k_closest_graphs():
    """Run ``median_on_k_closest_graphs`` once on the monoterpenoides dataset.

    Uses the treelet kernel, a median set of five graphs and the 'expert'
    edit costs. Results are only printed by the callee.
    """
    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    # Other kernels that can be tried: 'untilhpathkernel',
    # 'weisfeilerlehmankernel'.
    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    k = 5
    # Only needed when the fit method below is switched to 'precomputed'
    # (then also pass edit_costs=edit_costs).
    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]

    # median_on_k_closest_graphs returns seven values; the last one is the
    # index of the graph closest to the group in kernel space.
    sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
        = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
                                     'expert', parallel=False)
    return
def test_k_closest_graphs_with_cv():
    """Run ``median_on_k_closest_graphs`` on pre-generated collections per class.

    For every class in ``y_all`` and every repeat, the matching collection
    file is loaded and the medians are computed with edit costs fitted on the
    whole collection. Per-class values, per-class means and overall means are
    printed.
    """
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    k = 4

    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'

    # One entry per class: first the list of per-repeat values, replaced by
    # their mean once the class is finished.
    sod_sm_list = []
    sod_gm_list = []
    dis_k_sm_list = []
    dis_k_gm_list = []
    dis_k_gi_min_list = []
    for y in y_all:
        print('\n-------------------------------------------------------')
        print('class of y:', y)

        sod_sm_list.append([])
        sod_gm_list.append([])
        dis_k_sm_list.append([])
        dis_k_gm_list.append([])
        dis_k_gi_min_list.append([])

        for repeat in range(repeats):
            print('\nrepeat ', repeat)
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
            # The callee returns seven values; the list of per-graph distances
            # and the index of the closest graph are not used here.
            sod_sm, sod_gm, dis_k_sm, dis_k_gm, _, dis_k_gi_min, _ \
                = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel,
                                             k, 'whole-dataset', graph_dir=graph_dir,
                                             parallel=False)

            sod_sm_list[-1].append(sod_sm)
            sod_gm_list[-1].append(sod_gm)
            dis_k_sm_list[-1].append(dis_k_sm)
            dis_k_gm_list[-1].append(dis_k_gm)
            dis_k_gi_min_list[-1].append(dis_k_gi_min)

        print('\nsods of the set median for this class:', sod_sm_list[-1])
        print('\nsods of the gen median for this class:', sod_gm_list[-1])
        print('\ndistances in kernel space of set median for this class:',
              dis_k_sm_list[-1])
        print('\ndistances in kernel space of gen median for this class:',
              dis_k_gm_list[-1])
        print('\ndistances in kernel space of min graph for this class:',
              dis_k_gi_min_list[-1])

        sod_sm_list[-1] = np.mean(sod_sm_list[-1])
        sod_gm_list[-1] = np.mean(sod_gm_list[-1])
        dis_k_sm_list[-1] = np.mean(dis_k_sm_list[-1])
        dis_k_gm_list[-1] = np.mean(dis_k_gm_list[-1])
        dis_k_gi_min_list[-1] = np.mean(dis_k_gi_min_list[-1])

    print()
    print('\nmean sods of the set median for each class:', sod_sm_list)
    print('\nmean sods of the gen median for each class:', sod_gm_list)
    print('\nmean distance in kernel space of set median for each class:',
          dis_k_sm_list)
    print('\nmean distances in kernel space of gen median for each class:',
          dis_k_gm_list)
    print('\nmean distances in kernel space of min graph for each class:',
          dis_k_gi_min_list)

    print('\nmean sods of the set median of all:', np.mean(sod_sm_list))
    print('\nmean sods of the gen median of all:', np.mean(sod_gm_list))
    print('\nmean distances in kernel space of set median of all:',
          np.mean(dis_k_sm_list))
    print('\nmean distances in kernel space of gen median of all:',
          np.mean(dis_k_gm_list))
    print('\nmean distances in kernel space of min graph of all:',
          np.mean(dis_k_gi_min_list))

    return
# Script entry point: runs the single-shot experiment. The cross-validation
# variant is kept below, disabled.
if __name__ == '__main__':
    test_k_closest_graphs()
#    test_k_closest_graphs_with_cv()
| @@ -0,0 +1,246 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Tue Jan 14 15:39:29 2020 | |||||
| @author: ljia | |||||
| """ | |||||
| import numpy as np | |||||
| import random | |||||
| import csv | |||||
| from shutil import copyfile | |||||
| import networkx as nx | |||||
| import matplotlib.pyplot as plt | |||||
| import sys | |||||
| sys.path.insert(0, "../") | |||||
| from pygraph.utils.graphfiles import loadDataset, loadGXL, saveGXL | |||||
| from preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes | |||||
| from preimage.utils import get_same_item_indices | |||||
| from preimage.find_best_k import getRelations | |||||
def _tally_relation(reference, candidate, counts, better_repeats, repeat):
    """Update the ``[better, same, worse]`` counters for one repeat.

    ``counts[0]`` is incremented (and ``repeat`` is recorded in
    ``better_repeats``) when ``candidate`` is strictly smaller than
    ``reference``; ``counts[1]`` when both are equal; ``counts[2]`` when
    ``candidate`` is larger. Nothing is counted when the values are not
    comparable (e.g. NaN).
    """
    if reference > candidate:
        counts[0] += 1
        better_repeats.append(repeat)
    elif reference == candidate:
        counts[1] += 1
    elif reference < candidate:
        counts[2] += 1


def xp_letter_h():
    """Compute and compare medians for every class of the Letter-high dataset.

    For each ``k`` in ``k_list``, each class and each repeat, ``k`` graphs of
    the class are drawn at random (with a reproducible seed) and
    ``median_on_k_closest_graphs`` is run on them. Detail and summary rows are
    appended to CSV files in ``dir_output``. The set median, the generalized
    median and the graph of the median set closest to the group in kernel
    space are saved as GXL files and drawn as EPS figures under
    ``dir_output + 'medians/'``.
    """
    import os  # local import: this module does not import os at file level

    ds = {'name': 'Letter-high',
          'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml',
          'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])

    gkernel = 'structuralspkernel'
    node_label = None
    edge_label = None
    ds_name = 'letter-h'
    dir_output = 'results/xp_letter_h/'
    repeats = 1
    k_list = [150]
    fit_method = 'precomputed'
    # get indices by classes.
    y_idx = get_same_item_indices(y_all)

    # Result files and median graphs are written below these directories.
    os.makedirs(dir_output + 'medians/', exist_ok=True)

    # create result files.
    fn_output_detail = 'results_detail.' + fit_method + '.csv'
    # newline='' is what the csv module expects for files it writes to.
    with open(dir_output + fn_output_detail, 'a', newline='') as f_detail:
        csv.writer(f_detail).writerow([
            'dataset', 'graph kernel', 'fit method', 'k',
            'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
            'dis_k gi -> GM', 'median set'])

    fn_output_summary = 'results_summary.csv'
    with open(dir_output + fn_output_summary, 'a', newline='') as f_summary:
        csv.writer(f_summary).writerow([
            'dataset', 'graph kernel', 'fit method', 'k',
            'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
            'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
            '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
            'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
            'repeats better dis_k gi -> GM'])

    # One seed per repeat so that the draws are reproducible.
    random.seed(1)
    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)

    # NOTE(review): presumably the files in which iam_bash leaves the medians
    # it computed - confirm against iam_bash.
    fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
    fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'

    for k in k_list:
        print('\n--------- k =', k, '----------')

        # Per-class means, used for the final 'all' row.
        sod_sm_mean_list = []
        sod_gm_mean_list = []
        dis_k_sm_mean_list = []
        dis_k_gm_mean_list = []
        dis_k_gi_min_mean_list = []

        for y, values in y_idx.items():
            print('\ny =', y)

            sod_sm_list = []
            sod_gm_list = []
            dis_k_sm_list = []
            dis_k_gm_list = []
            dis_k_gi_min_list = []
            # Each counter is [better, same, worse].
            nb_sod_sm2gm = [0, 0, 0]
            nb_dis_k_sm2gm = [0, 0, 0]
            nb_dis_k_gi2sm = [0, 0, 0]
            nb_dis_k_gi2gm = [0, 0, 0]
            repeats_better_sod_sm2gm = []
            repeats_better_dis_k_sm2gm = []
            repeats_better_dis_k_gi2sm = []
            repeats_better_dis_k_gi2gm = []

            # Graphs of the current class; identical for every repeat.
            Gn_median = [Gn[g] for g in values]

            for repeat in range(repeats):
                print('\nrepeat =', repeat)
                random.seed(rdn_seed_list[repeat])
                # Positions inside the class, then the matching dataset indices.
                median_set_idx_idx = random.sample(range(0, len(values)), k)
                median_set_idx = [values[idx] for idx in median_set_idx_idx]
                print('median set: ', median_set_idx)
                sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
                    = median_on_k_closest_graphs(Gn_median, node_label, edge_label,
                                                 gkernel, k, fit_method=fit_method,
                                                 graph_dir=ds['graph_dir'],
                                                 edit_costs=None,
                                                 group_min=median_set_idx_idx,
                                                 dataset='Letter', parallel=False)

                # write result detail.
                sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
                dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
                dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
                dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
                with open(dir_output + fn_output_detail, 'a', newline='') as f_detail:
                    csv.writer(f_detail).writerow([
                        ds_name, gkernel, fit_method, k,
                        y, repeat,
                        sod_sm, sod_gm, dis_k_sm, dis_k_gm,
                        dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
                        dis_k_gi2gm, median_set_idx])

                # compute result summary.
                sod_sm_list.append(sod_sm)
                sod_gm_list.append(sod_gm)
                dis_k_sm_list.append(dis_k_sm)
                dis_k_gm_list.append(dis_k_gm)
                dis_k_gi_min_list.append(dis_k_gi_min)
                # # SOD SM -> GM
                _tally_relation(sod_sm, sod_gm, nb_sod_sm2gm,
                                repeats_better_sod_sm2gm, repeat)
                # # dis_k SM -> GM
                _tally_relation(dis_k_sm, dis_k_gm, nb_dis_k_sm2gm,
                                repeats_better_dis_k_sm2gm, repeat)
                # # dis_k gi -> SM
                _tally_relation(dis_k_gi_min, dis_k_sm, nb_dis_k_gi2sm,
                                repeats_better_dis_k_gi2sm, repeat)
                # # dis_k gi -> GM
                _tally_relation(dis_k_gi_min, dis_k_gm, nb_dis_k_gi2gm,
                                repeats_better_dis_k_gi2gm, repeat)

                # save median graphs.
                fn_suffix = '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
                fn_pre_sm_new = dir_output + 'medians/set_median' + fn_suffix
                copyfile(fname_sm, fn_pre_sm_new + '.gxl')
                fn_pre_gm_new = dir_output + 'medians/gen_median' + fn_suffix
                copyfile(fname_gm, fn_pre_gm_new + '.gxl')
                G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
                reform_attributes(G_best_kernel)
                fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel' + fn_suffix
                saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')

                # plot median graphs.
                set_median = loadGXL(fn_pre_sm_new + '.gxl')
                gen_median = loadGXL(fn_pre_gm_new + '.gxl')
                draw_Letter_graph(set_median, fn_pre_sm_new)
                draw_Letter_graph(gen_median, fn_pre_gm_new)
                draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)

            # write result summary for each letter.
            sod_sm_mean_list.append(np.mean(sod_sm_list))
            sod_gm_mean_list.append(np.mean(sod_gm_list))
            dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
            dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
            dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
            sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
            dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
            dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
            dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
            with open(dir_output + fn_output_summary, 'a', newline='') as f_summary:
                csv.writer(f_summary).writerow([
                    ds_name, gkernel, fit_method, k, y,
                    sod_sm_mean_list[-1], sod_gm_mean_list[-1],
                    dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
                    dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
                    dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
                    nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
                    repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
                    repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])

        # write result summary over all letters.
        sod_sm_mean = np.mean(sod_sm_mean_list)
        sod_gm_mean = np.mean(sod_gm_mean_list)
        dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
        dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
        # Average the per-class means, like the four values above (the
        # per-repeat list only holds the values of the last class).
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        with open(dir_output + fn_output_summary, 'a', newline='') as f_summary:
            csv.writer(f_summary).writerow([
                ds_name, gkernel, fit_method, k, 'all',
                sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
                dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
                dis_k_gi2sm_mean, dis_k_gi2gm_mean])

    print('\ncomplete.')
# Draw the current median graph.
def draw_Letter_graph(graph, file_prefix):
    """Draw ``graph`` and save the figure as ``file_prefix + '.eps'``.

    Every node is placed at the position given by its 'x' and 'y' entries,
    converted to float.
    """
    fig = plt.figure()
    pos = {}
    for n in graph.nodes:
        # `graph.nodes[n]` is the access used elsewhere in this code base; the
        # former `graph.node` alias is not available in recent NetworkX.
        node_data = graph.nodes[n]
        pos[n] = np.array([float(node_data['x']), float(node_data['y'])])
    nx.draw_networkx(graph, pos)
    plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
#    plt.show()
    plt.clf()
    # Close the figure as well: clearing alone keeps it registered in pyplot,
    # so figures would pile up when many graphs are drawn.
    plt.close(fig)
# Run the Letter-high experiment when this file is executed as a script.
if __name__ == "__main__":
    xp_letter_h()