| @@ -15,24 +15,28 @@ import time | |||
| import random | |||
| from scipy import optimize | |||
| from scipy.optimize import minimize | |||
| import cvxpy as cp | |||
| import sys | |||
| #sys.path.insert(0, "../") | |||
| from ged import GED, get_nb_edit_operations | |||
| from utils import kernel_distance_matrix | |||
| sys.path.insert(0, "../") | |||
| from preimage.ged import GED, get_nb_edit_operations, get_nb_edit_operations_letter | |||
| from preimage.utils import kernel_distance_matrix | |||
| def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4, | |||
| def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, | |||
| params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT', | |||
| 'method': 'IPFP', 'stabilizer': None}, | |||
| init_costs=[3, 3, 1, 3, 3, 1], | |||
| dataset='monoterpenoides', | |||
| parallel=True): | |||
| dataset = dataset.lower() | |||
| # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them. | |||
| # random.seed(1) | |||
| # cost_rdm = random.sample(range(1, 10), 6) | |||
| # init_costs = cost_rdm + [0] | |||
| # init_costs = cost_rdm | |||
| init_costs = [3, 3, 1, 3, 3, 1] | |||
| # init_costs = [3, 3, 1, 3, 3, 1] | |||
| # init_costs = [i * 0.01 for i in cost_rdm] + [0] | |||
| # init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] | |||
| # init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0] | |||
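For reference, the cost vectors in this patch follow a fixed order: the full vector is (c_vi, c_vr, c_vs, c_ei, c_er, c_es), i.e. node insertion, removal, substitution followed by the edge counterparts, while the LETTER and LETTER2 models introduced below use 3 and 5 entries respectively. A minimal sketch of how the initial vectors appearing in this patch could be produced; the helper name is illustrative and not part of the code:

```python
import random

def make_init_costs(cost_model='CONSTANT', random_init=False, seed=1):
    """Illustrative helper: build an initial edit-cost vector.

    CONSTANT: [c_vi, c_vr, c_vs, c_ei, c_er, c_es]  (6 entries)
    LETTER:   [c_vir, c_eir, alpha]                 (3 entries)
    LETTER2:  [c_vi, c_vr, c_vs, c_ei, c_er]        (5 entries)
    """
    if cost_model == 'LETTER':
        return [0.9, 1.7, 0.75]                     # value used later in this patch
    if cost_model == 'LETTER2':
        return [0.675, 0.675, 0.75, 0.425, 0.425]   # value used later in this patch
    if random_init:
        random.seed(seed)
        return random.sample(range(1, 10), 6)       # mirrors the commented-out random draw
    return [3, 3, 1, 3, 3, 1]                       # the "expert" default used above
```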
| @@ -51,8 +55,10 @@ def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4 | |||
| # init ged. | |||
| print('\ninitial:') | |||
| time0 = time.time() | |||
| params_ged['dataset'] = dataset | |||
| params_ged['edit_cost_constant'] = init_costs | |||
| ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, | |||
| dataset, | |||
| parallel=parallel) | |||
| residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] | |||
| time_list = [time.time() - time0] | |||
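The residual tracked in `residual_list` is simply the Euclidean distance between the vector of GEDs over all graph pairs and the corresponding kernel-space distances. A tiny NumPy check, assuming the two vectors are aligned pair by pair:

```python
import numpy as np

ged_vec_init = np.array([1.2, 0.8, 2.5])   # toy GEDs for three graph pairs
dis_k_vec    = np.array([1.0, 1.0, 2.0])   # kernel-space distances for the same pairs

residual = np.sqrt(np.sum(np.square(ged_vec_init - dis_k_vec)))
print(residual)   # identical to np.linalg.norm(ged_vec_init - dis_k_vec)
```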
| @@ -67,20 +73,21 @@ def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4 | |||
| time0 = time.time() | |||
| # "fit" geds to distances in feature space by tuning edit costs using the | |||
| # Least Squares Method. | |||
| edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec) | |||
| edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec, | |||
| dataset=dataset, cost=params_ged['cost']) | |||
| for i in range(len(edit_costs_new)): | |||
| if -1e-9 <= edit_costs_new[i] <= 1e-9: | |||
| edit_costs_new[i] = 0 | |||
| if edit_costs_new[i] < 0: | |||
| if edit_costs_new[i] > -1e-9: | |||
| edit_costs_new[i] = 0 | |||
| else: | |||
| raise ValueError('The edit cost is negative.') | |||
| raise ValueError('The edit cost is negative.') | |||
| # for i in range(len(edit_costs_new)): | |||
| # if edit_costs_new[i] < 0: | |||
| # edit_costs_new[i] = 0 | |||
| # compute new GEDs and numbers of edit operations. | |||
| params_ged['edit_cost_constant'] = edit_costs_new | |||
| ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, | |||
| params_ged['edit_cost_constant'] = edit_costs_new # np.array([edit_costs_new[0], edit_costs_new[1], 0.75]) | |||
| ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, | |||
| dataset, | |||
| parallel=parallel) | |||
| residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) | |||
| time_list.append(time.time() - time0) | |||
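Each iteration exploits the fact that, for a fixed node mapping, the GED is linear in the edit costs: d(g1, g2) = sum_k c_k * n_k, where n_k counts how often edit operation k is used. Stacking the counts of all graph pairs into a matrix therefore turns cost fitting into a (constrained) least-squares problem. A compact sketch of the alternating loop, using plain `numpy.linalg.lstsq` in place of the project's `update_costs` (so without the non-negativity and triangle-style constraints):

```python
import numpy as np

def fit_costs_sketch(nb_cost_mat, dis_k_vec, n_iter=6, recompute_counts=None):
    """Alternate between solving for costs and (optionally) recomputing edit-op counts.

    nb_cost_mat      : (n_pairs, n_costs) array of edit-operation counts.
    dis_k_vec        : (n_pairs,) kernel-space distances to fit.
    recompute_counts : optional callback costs -> new count array, standing in for
                       re-running the GED solver with the updated costs.
    """
    for _ in range(n_iter):
        costs, *_ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, rcond=None)
        costs[np.abs(costs) < 1e-9] = 0.0          # clamp numerical noise, as above
        if recompute_counts is not None:
            nb_cost_mat = recompute_counts(costs)  # new optimal mappings -> new counts
        residual = np.linalg.norm(nb_cost_mat @ costs - dis_k_vec)
    return costs, residual

rng = np.random.default_rng(0)
A = rng.integers(0, 4, size=(30, 6)).astype(float)
b = A @ np.array([3., 3., 1., 3., 3., 1.])
print(np.round(fit_costs_sketch(A, b)[0], 3))      # recovers the generating costs
```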
| @@ -94,7 +101,8 @@ def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4 | |||
| time_list, nb_cost_mat_list | |||
| def compute_geds(Gn, params_ged, parallel=False): | |||
| def compute_geds(Gn, params_ged, dataset, parallel=False): | |||
| get_nb_eo = get_nb_edit_operations_letter if dataset == 'letter' else get_nb_edit_operations | |||
| ged_mat = np.zeros((len(Gn), len(Gn))) | |||
| if parallel: | |||
| # print('parallel') | |||
| @@ -112,7 +120,7 @@ def compute_geds(Gn, params_ged, parallel=False): | |||
| def init_worker(gn_toshare): | |||
| global G_gn | |||
| G_gn = gn_toshare | |||
| do_partial = partial(_wrapper_compute_ged_parallel, params_ged) | |||
| do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo) | |||
| pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) | |||
| iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | |||
| desc='computing GEDs', file=sys.stdout) | |||
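The parallel branch shares the graph list with worker processes through a module-level global set in the pool initializer, so the potentially large list is passed to each worker once instead of being re-pickled for every pair. A stripped-down sketch of that pattern with a dummy pairwise function; all names here are illustrative:

```python
import sys
from functools import partial
from itertools import combinations
from multiprocessing import Pool

def init_worker(gn_toshare):
    global G_gn                 # set once per worker at start-up
    G_gn = gn_toshare

def _pairwise(params, pair):
    i, j = pair
    # stand-in for GED(G_gn[i], G_gn[j], **params)
    return i, j, abs(len(G_gn[i]) - len(G_gn[j]))

if __name__ == '__main__':
    Gn = [list(range(n)) for n in (3, 5, 8, 2)]      # dummy "graphs"
    itr = combinations(range(len(Gn)), 2)
    do_partial = partial(_pairwise, {'cost': 'CONSTANT'})
    with Pool(processes=2, initializer=init_worker, initargs=(Gn,)) as pool:
        for i, j, dis in pool.imap_unordered(do_partial, itr, chunksize=1):
            print(i, j, dis, file=sys.stdout)
```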
| @@ -138,26 +146,146 @@ def compute_geds(Gn, params_ged, parallel=False): | |||
| ged_vec.append(dis) | |||
| ged_mat[i][j] = dis | |||
| ged_mat[j][i] = dis | |||
| n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward) | |||
| n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward) | |||
| n_edit_operations.append(n_eo_tmp) | |||
| return ged_vec, ged_mat, n_edit_operations | |||
| def _wrapper_compute_ged_parallel(params_ged, itr): | |||
| def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr): | |||
| i = itr[0] | |||
| j = itr[1] | |||
| dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged) | |||
| dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged, get_nb_eo) | |||
| return i, j, dis, n_eo_tmp | |||
| def _compute_ged_parallel(g1, g2, params_ged): | |||
| def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo): | |||
| dis, pi_forward, pi_backward = GED(g1, g2, **params_ged) | |||
| n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward) | |||
| n_eo_tmp = get_nb_eo(g1, g2, pi_forward, pi_backward) # [0,0,0,0,0,0] | |||
| return dis, n_eo_tmp | |||
| def update_costs(nb_cost_mat, dis_k_vec): | |||
| def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides', | |||
| cost='CONSTANT', rw_constraints='2constraints'): | |||
| if dataset.lower() == 'letter': | |||
| if cost == 'LETTER': | |||
| pass | |||
| # # method 1: set alpha automatically, just tune c_vir and c_eir by | |||
| # # LMS using cvxpy. | |||
| # alpha = 0.5 | |||
| # coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec) | |||
| ## if np.count_nonzero(nb_cost_mat[:,4]) == 0: | |||
| ## alpha = 0.75 | |||
| ## else: | |||
| ## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0]) | |||
| ## alpha = alpha * 0.99 | |||
| # param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1]) | |||
| # param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5]) | |||
| # nb_cost_mat_new = np.column_stack((param_vir, param_eir)) | |||
| # dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3] | |||
| # | |||
| # x = cp.Variable(nb_cost_mat_new.shape[1]) | |||
| # cost = cp.sum_squares(nb_cost_mat_new * x - dis_new) | |||
| # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] | |||
| # prob = cp.Problem(cp.Minimize(cost), constraints) | |||
| # prob.solve() | |||
| # edit_costs_new = x.value | |||
| # edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha]) | |||
| # residual = np.sqrt(prob.value) | |||
| # # method 2: tune c_vir, c_eir and alpha by nonlinear programming by | |||
| # # scipy.optimize.minimize. | |||
| # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] | |||
| # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] | |||
| # w2 = nb_cost_mat[:,3] | |||
| # w3 = dis_k_vec | |||
| # func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ | |||
| # + w2 * x[2] - w3 * x[3]) ** 2) | |||
| # bounds = ((0, None), (0., None), (0.5, 0.5), (0, None)) | |||
| # res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds) | |||
| # edit_costs_new = res.x[0:3] | |||
| # residual = res.fun | |||
| # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy. | |||
| # # method 4: tune c_vir, c_eir and alpha by QP function | |||
| # # scipy.optimize.least_squares. An initial guess is required. | |||
| # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] | |||
| # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] | |||
| # w2 = nb_cost_mat[:,3] | |||
| # w3 = dis_k_vec | |||
| # func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ | |||
| # + w2 * x[2] - w3 * x[3]) ** 2 | |||
| # res = optimize.root(func, [0.9, 1.7, 0.75, 100]) | |||
| # edit_costs_new = res.x | |||
| # residual = None | |||
| elif cost == 'LETTER2': | |||
| # # 1. if c_vi != c_vr, c_ei != c_er. | |||
| # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] | |||
| # x = cp.Variable(nb_cost_mat_new.shape[1]) | |||
| # cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||
| ## # 1.1 no constraints. | |||
| ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] | |||
| # # 1.2 c_vs <= c_vi + c_vr. | |||
| # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], | |||
| # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||
| ## # 2. if c_vi == c_vr, c_ei == c_er. | |||
| ## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]] | |||
| ## nb_cost_mat_new[:,0] += nb_cost_mat[:,1] | |||
| ## nb_cost_mat_new[:,2] += nb_cost_mat[:,5] | |||
| ## x = cp.Variable(nb_cost_mat_new.shape[1]) | |||
| ## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||
| ## # 2.1 no constraints. | |||
| ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] | |||
| ### # 2.2 c_vs <= c_vi + c_vr. | |||
| ### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], | |||
| ### np.array([2.0, -1.0, 0.0]).T@x >= 0.0] | |||
| # | |||
| # prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||
| # prob.solve() | |||
| # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] | |||
| # edit_costs_new = np.array(edit_costs_new) | |||
| # residual = np.sqrt(prob.value) | |||
| if rw_constraints == 'inequality': | |||
| # c_vs <= c_vi + c_vr. | |||
| nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] | |||
| x = cp.Variable(nb_cost_mat_new.shape[1]) | |||
| cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||
| constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], | |||
| np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||
| prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||
| prob.solve() | |||
| edit_costs_new = x.value | |||
| residual = np.sqrt(prob.value) | |||
| elif rw_constraints == '2constraints': | |||
| # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er. | |||
| nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] | |||
| x = cp.Variable(nb_cost_mat_new.shape[1]) | |||
| cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||
| constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], | |||
| np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0, | |||
| np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0, | |||
| np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] | |||
| prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||
| prob.solve() | |||
| edit_costs_new = x.value | |||
| residual = np.sqrt(prob.value) | |||
| # elif method == 'inequality_modified': | |||
| # # c_vs <= c_vi + c_vr. | |||
| # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] | |||
| # x = cp.Variable(nb_cost_mat_new.shape[1]) | |||
| # cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||
| # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], | |||
| # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||
| # prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||
| # prob.solve() | |||
| # # use same costs for insertion and removal rather than the fitted costs. | |||
| # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] | |||
| # edit_costs_new = np.array(edit_costs_new) | |||
| # residual = np.sqrt(prob.value) | |||
| else: | |||
| # # method 1: simple least square method. | |||
| # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, | |||
| # rcond=None) | |||
| @@ -181,16 +309,16 @@ def update_costs(nb_cost_mat, dis_k_vec): | |||
| # G = -1 * np.identity(nb_cost_mat.shape[1]) | |||
| # h = np.array([0 for i in range(nb_cost_mat.shape[1])]) | |||
| x = cp.Variable(nb_cost_mat.shape[1]) | |||
| cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec) | |||
| constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])], | |||
| # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||
| np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | |||
| np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | |||
| prob = cp.Problem(cp.Minimize(cost), constraints) | |||
| prob.solve() | |||
| edit_costs_new = x.value | |||
| residual = np.sqrt(prob.value) | |||
| x = cp.Variable(nb_cost_mat.shape[1]) | |||
| cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec) | |||
| constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], | |||
| # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||
| np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | |||
| np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | |||
| prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||
| prob.solve() | |||
| edit_costs_new = x.value | |||
| residual = np.sqrt(prob.value) | |||
| # method 4: | |||
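The cvxpy branch above is a non-negative least-squares fit with two extra linear constraints that keep the costs metric-like: a node (respectively edge) substitution may not cost more than a removal plus an insertion. A self-contained sketch of the same formulation on synthetic data; it uses the `@` operator for the matrix product, where the older-cvxpy code above writes `nb_cost_mat * x`:

```python
import numpy as np
import cvxpy as cp

rng = np.random.default_rng(0)
n_pairs, n_costs = 40, 6                       # counts ordered (n_vi, n_vr, n_vs, n_ei, n_er, n_es)
nb_cost_mat = rng.integers(0, 5, size=(n_pairs, n_costs)).astype(float)
true_costs = np.array([3., 3., 1., 3., 3., 1.])
dis_k_vec = nb_cost_mat @ true_costs + rng.normal(0, 0.1, n_pairs)

x = cp.Variable(n_costs)
objective = cp.Minimize(cp.sum_squares(nb_cost_mat @ x - dis_k_vec))
constraints = [
    x >= 1e-4,                                          # strictly positive costs
    np.array([1., 1., -1., 0., 0., 0.]) @ x >= 0.,      # c_vs <= c_vi + c_vr
    np.array([0., 0., 0., 1., 1., -1.]) @ x >= 0.,      # c_es <= c_ei + c_er
]
prob = cp.Problem(objective, constraints)
prob.solve()
edit_costs_new = x.value
residual = np.sqrt(prob.value)
print(np.round(edit_costs_new, 3), residual)
```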
| @@ -13,33 +13,46 @@ import multiprocessing | |||
| from multiprocessing import Pool | |||
| from functools import partial | |||
| from gedlibpy_linlin import librariesImport, gedlibpy | |||
| #from gedlibpy_linlin import librariesImport, gedlibpy | |||
| from libs import * | |||
| def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50): | |||
| """ | |||
| Compute GED for 2 graphs. | |||
| """ | |||
| def convertGraph(G): | |||
| def convertGraph(G, dataset): | |||
| """Convert a graph to the proper NetworkX format that can be | |||
| recognized by the gedlibpy library. | |||
| """ | |||
| G_new = nx.Graph() | |||
| for nd, attrs in G.nodes(data=True): | |||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||
| # G_new.add_node(str(nd), x=str(attrs['attributes'][0]), | |||
| # y=str(attrs['attributes'][1])) | |||
| for nd1, nd2, attrs in G.edges(data=True): | |||
| # G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| G_new.add_edge(str(nd1), str(nd2)) | |||
| if dataset == 'monoterpenoides': | |||
| for nd, attrs in G.nodes(data=True): | |||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||
| for nd1, nd2, attrs in G.edges(data=True): | |||
| G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| elif dataset == 'letter': | |||
| for nd, attrs in G.nodes(data=True): | |||
| G_new.add_node(str(nd), x=str(attrs['attributes'][0]), | |||
| y=str(attrs['attributes'][1])) | |||
| for nd1, nd2, attrs in G.edges(data=True): | |||
| G_new.add_edge(str(nd1), str(nd2)) | |||
| else: | |||
| for nd, attrs in G.nodes(data=True): | |||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||
| for nd1, nd2, attrs in G.edges(data=True): | |||
| G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| # G_new.add_edge(str(nd1), str(nd2)) | |||
| return G_new | |||
| dataset = dataset.lower() | |||
| if lib == 'gedlibpy': | |||
| gedlibpy.restart_env() | |||
| gedlibpy.add_nx_graph(convertGraph(g1), "") | |||
| gedlibpy.add_nx_graph(convertGraph(g2), "") | |||
| gedlibpy.add_nx_graph(convertGraph(g1, dataset), "") | |||
| gedlibpy.add_nx_graph(convertGraph(g2, dataset), "") | |||
| listID = gedlibpy.get_all_graph_ids() | |||
| gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) | |||
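`convertGraph` re-labels every node as a string and keeps only the attributes the gedlibpy cost functions look at: `chem`/`valence` for molecule datasets and string-valued `x`/`y` coordinates for Letter. A small standalone illustration of the Letter branch; the input layout, a coordinate list under the `'attributes'` node attribute, follows the loader used in this project:

```python
import networkx as nx

def convert_letter_graph(G):
    """Relabel nodes as strings and keep only string-valued x/y, as gedlibpy expects."""
    G_new = nx.Graph()
    for nd, attrs in G.nodes(data=True):
        G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
                       y=str(attrs['attributes'][1]))
    for nd1, nd2 in G.edges():
        G_new.add_edge(str(nd1), str(nd2))      # Letter edges carry no label
    return G_new

g = nx.Graph()
g.add_node(0, attributes=[0.8, 1.5])
g.add_node(1, attributes=[2.1, 0.3])
g.add_edge(0, 1)
print(convert_letter_graph(g).nodes(data=True))
# -> [('0', {'x': '0.8', 'y': '1.5'}), ('1', {'x': '2.1', 'y': '0.3'})]
```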
| @@ -310,6 +323,60 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map): | |||
| if map_i == np.inf: | |||
| n_vi += 1 | |||
| # idx_nodes1 = range(0, len(node1)) | |||
| edges1 = [e for e in g1.edges()] | |||
| nb_edges2_cnted = 0 | |||
| for n1, n2 in edges1: | |||
| idx1 = nodes1.index(n1) | |||
| idx2 = nodes1.index(n2) | |||
| # one of the nodes is removed, thus the edge is removed. | |||
| if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf: | |||
| n_er += 1 | |||
| # corresponding edge is in g2. | |||
| elif (forward_map[idx1], forward_map[idx2]) in g2.edges(): | |||
| nb_edges2_cnted += 1 | |||
| # edge labels are different. | |||
| if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \ | |||
| != g1.edges[(n1, n2)]['bond_type']: | |||
| n_es += 1 | |||
| elif (forward_map[idx2], forward_map[idx1]) in g2.edges(): | |||
| nb_edges2_cnted += 1 | |||
| # edge labels are different. | |||
| if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \ | |||
| != g1.edges[(n1, n2)]['bond_type']: | |||
| n_es += 1 | |||
| # both endpoints are mapped into g2, but no corresponding edge exists there, so the edge counts as removed. | |||
| else: | |||
| n_er += 1 | |||
| n_ei = nx.number_of_edges(g2) - nb_edges2_cnted | |||
| return n_vi, n_vr, n_vs, n_ei, n_er, n_es | |||
| def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): | |||
| """Compute the number of each edit operations. | |||
| """ | |||
| n_vi = 0 | |||
| n_vr = 0 | |||
| n_vs = 0 | |||
| sod_vs = 0 | |||
| n_ei = 0 | |||
| n_er = 0 | |||
| nodes1 = [n for n in g1.nodes()] | |||
| for i, map_i in enumerate(forward_map): | |||
| if map_i == np.inf: | |||
| n_vr += 1 | |||
| else: | |||
| n_vs += 1 | |||
| diff_x = float(g1.nodes[i]['x']) - float(g2.nodes[map_i]['x']) | |||
| diff_y = float(g1.nodes[i]['y']) - float(g2.nodes[map_i]['y']) | |||
| sod_vs += np.sqrt(np.square(diff_x) + np.square(diff_y)) | |||
| for map_i in backward_map: | |||
| if map_i == np.inf: | |||
| n_vi += 1 | |||
| # idx_nodes1 = range(0, len(node1)) | |||
| edges1 = [e for e in g1.edges()] | |||
| @@ -329,4 +396,8 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map): | |||
| n_er += 1 | |||
| n_ei = nx.number_of_edges(g2) - nb_edges2_cnted | |||
| return n_vi, n_vr, n_vs, n_ei, n_er, n_es | |||
| return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er | |||
| if __name__ == '__main__': | |||
| print('check test_ged.py') | |||
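The two counters differ in what they return: `get_nb_edit_operations` yields the six integer counts (n_vi, n_vr, n_vs, n_ei, n_er, n_es) matching the CONSTANT cost vector, while `get_nb_edit_operations_letter` returns (n_vi, n_vr, n_vs, sod_vs, n_ei, n_er), where sod_vs is the summed Euclidean distance between matched node coordinates; the LETTER2 fit later keeps columns [0, 1, 3, 4, 5], i.e. it swaps the plain substitution count for sod_vs and has no edge-substitution term since Letter edges are unlabelled. Either way the GED should be recoverable as a dot product of counts with the matching cost vector, which is what the fitting code exploits. A small consistency sketch under that assumption:

```python
import numpy as np

# CONSTANT model: ged = <counts, costs>
counts_constant = np.array([1, 0, 2, 1, 1, 0])        # (n_vi, n_vr, n_vs, n_ei, n_er, n_es)
costs_constant = np.array([3, 3, 1, 3, 3, 1])         # (c_vi, c_vr, c_vs, c_ei, c_er, c_es)
print(counts_constant @ costs_constant)               # 11

# LETTER2 model: the third entry is sod_vs, a real-valued sum of point distances,
# so its "cost" acts as a scale on the geometric substitution term.
counts_letter2 = np.array([1, 0, 0.83, 1, 1])         # (n_vi, n_vr, sod_vs, n_ei, n_er)
costs_letter2 = np.array([0.675, 0.675, 0.75, 0.425, 0.425])
print(counts_letter2 @ costs_letter2)                 # 2.1475
```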
| @@ -436,7 +436,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median | |||
| def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides', | |||
| def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT', | |||
| dataset='monoterpenoides', | |||
| graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'): | |||
| """Compute the iam by c++ implementation (gedlib) through bash. | |||
| """ | |||
| @@ -467,12 +468,12 @@ def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides', | |||
| # graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl' | |||
| command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' | |||
| command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n' | |||
| command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' | |||
| command += 'export LD_LIBRARY_PATH\n' | |||
| command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n' | |||
| command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \ | |||
| + ' \'' + graph_dir + '\' ' | |||
| + ' \'' + graph_dir + '\' ' + ' ' + cost + ' ' | |||
| if edit_cost_constant is None: | |||
| command += 'None' | |||
| else: | |||
| @@ -484,7 +485,7 @@ def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides', | |||
| output = stream.readlines() | |||
| # print(output) | |||
| sod_sm = float(output[0].strip()) | |||
| sod_gm= float(output[1].strip()) | |||
| sod_gm = float(output[1].strip()) | |||
| fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl' | |||
| fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl' | |||
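For reference, the bash path builds a command of the form `./iam_for_python_bash <dataset> <collection-file> '<graph-dir>' <cost> <edit costs | None>` and reads two lines back: the SOD of the set median and of the generalized median. A sketch of that round trip using `subprocess` instead of `os.popen`; the paths and binary name are taken from the patch and are environment-specific, and since the hunk cuts off how the cost list is serialized, space-separated values are an assumption here:

```python
import subprocess

def run_iam_bash_sketch(dataset, fn_collection, graph_dir, cost, edit_costs):
    """Run the gedlib IAM helper and return (sod_set_median, sod_gen_median)."""
    costs_str = 'None' if edit_costs is None else ' '.join(str(c) for c in edit_costs)
    script = (
        "GEDLIB_HOME='/media/ljia/DATA/research-repo/codes/Linlin/gedlib'\n"
        "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n"
        "export LD_LIBRARY_PATH\n"
        "cd '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin'\n"
        f"./iam_for_python_bash {dataset} {fn_collection} '{graph_dir}' {cost} {costs_str}\n"
    )
    out = subprocess.run(script, shell=True, capture_output=True, text=True, check=True)
    lines = out.stdout.splitlines()
    return float(lines[0].strip()), float(lines[1].strip())
```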
| @@ -31,8 +31,9 @@ from fitDistance import fit_GED_to_kernel_distance | |||
| def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method, | |||
| graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/', | |||
| edit_costs=None, group_min=None, dataset='monoterpenoides', | |||
| parallel=True): | |||
| cost='CONSTANT', parallel=True): | |||
| dataset = dataset.lower() | |||
| # # compute distances in kernel space. | |||
| # dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, | |||
| # Kmatrix=None, gkernel=gkernel) | |||
| @@ -50,32 +51,53 @@ def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_metho | |||
| # group_min = (12, 13, 22, 29) # closest w.r.t path kernel | |||
| # group_min = (77, 85, 160, 171) # closest w.r.t ged | |||
| # group_min = (0,1,2,3,4,5,6,7,8,9,10,11) # closest w.r.t treelet kernel | |||
| Gn_median = [Gn[g].copy() for g in group_min] | |||
| # fit edit costs. | |||
| if fit_method == 'random': # random | |||
| edit_cost_constant = random.sample(range(1, 10), 6) | |||
| if cost == 'LETTER': | |||
| edit_cost_constant = random.sample(range(1, 10), 3) | |||
| edit_cost_constant = [item * 0.1 for item in edit_cost_constant] | |||
| elif cost == 'LETTER2': | |||
| random.seed(time.time()) | |||
| edit_cost_constant = random.sample(range(1, 10), 5) | |||
| # edit_cost_constant = [item * 0.1 for item in edit_cost_constant] | |||
| else: | |||
| edit_cost_constant = random.sample(range(1, 10), 6) | |||
| print('edit costs used:', edit_cost_constant) | |||
| elif fit_method == 'expert': # expert | |||
| edit_cost_constant = [3, 3, 1, 3, 3, 1] | |||
| elif fit_method == 'k-graphs': | |||
| itr_max = 6 | |||
| algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', | |||
| 'algo_options': algo_options, 'stabilizer': None} | |||
| if cost == 'LETTER': | |||
| init_costs = [0.9, 1.7, 0.75] | |||
| elif cost == 'LETTER2': | |||
| init_costs = [0.675, 0.675, 0.75, 0.425, 0.425] | |||
| else: | |||
| init_costs = [3, 3, 1, 3, 3, 1] | |||
| algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'lib': 'gedlibpy', 'cost': cost, 'method': 'IPFP', | |||
| 'algo_options': algo_options, 'stabilizer': None} | |||
| # fit on k-graph subset | |||
| edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn_median, | |||
| node_label, edge_label, gkernel, itr_max, params_ged=params_ged, parallel=True) | |||
| node_label, edge_label, gkernel, itr_max, params_ged=params_ged, | |||
| init_costs=init_costs, dataset=dataset, parallel=True) | |||
| elif fit_method == 'whole-dataset': | |||
| itr_max = 6 | |||
| algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', | |||
| if cost == 'LETTER': | |||
| init_costs = [0.9, 1.7, 0.75] | |||
| elif cost == 'LETTER2': | |||
| init_costs = [0.675, 0.675, 0.75, 0.425, 0.425] | |||
| else: | |||
| init_costs = [3, 3, 1, 3, 3, 1] | |||
| algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||
| params_ged = {'lib': 'gedlibpy', 'cost': cost, 'method': 'IPFP', | |||
| 'algo_options': algo_options, 'stabilizer': None} | |||
| # fit on the whole dataset. | |||
| edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn, | |||
| node_label, edge_label, gkernel, itr_max, params_ged=params_ged, parallel=True) | |||
| node_label, edge_label, gkernel, itr_max, params_ged=params_ged, | |||
| init_costs=init_costs, dataset=dataset, parallel=True) | |||
| elif fit_method == 'precomputed': | |||
| edit_cost_constant = edit_costs | |||
| @@ -83,14 +105,17 @@ def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_metho | |||
| # compute set median and gen median using IAM (C++ through bash). | |||
| group_fnames = [Gn[g].graph['filename'] for g in group_min] | |||
| sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant, | |||
| graph_dir=graph_dir, dataset=dataset) | |||
| cost=cost, graph_dir=graph_dir, | |||
| dataset=dataset) | |||
| # compute distances in kernel space. | |||
| Gn_median = [Gn[g].copy() for g in group_min] | |||
| set_median = loadGXL(fname_sm) | |||
| gen_median = loadGXL(fname_gm) | |||
| if dataset == 'Letter': | |||
| # print(gen_median.nodes(data=True)) | |||
| # print(gen_median.edges(data=True)) | |||
| if dataset == 'letter': | |||
| for g in Gn_median: | |||
| reform_attributes(g) | |||
| reform_attributes(set_median) | |||
| @@ -98,16 +123,19 @@ def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_metho | |||
| # compute distance in kernel space for set median. | |||
| Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel, | |||
| None if dataset == 'Letter' else 'chem', | |||
| None if dataset == 'Letter' else 'valence', | |||
| None if dataset == 'letter' else 'chem', | |||
| None if dataset == 'letter' else 'valence', | |||
| False) | |||
| dis_k_sm = dis_gstar(0, range(1, 1+len(Gn_median)), | |||
| [1 / len(Gn_median)] * len(Gn_median), Kmatrix_sm, withterm3=False) | |||
| # print(gen_median.nodes(data=True)) | |||
| # print(gen_median.edges(data=True)) | |||
| # print(set_median.nodes(data=True)) | |||
| # print(set_median.edges(data=True)) | |||
| # compute distance in kernel space for generalized median. | |||
| Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel, | |||
| None if dataset == 'Letter' else 'chem', | |||
| None if dataset == 'Letter' else 'valence', | |||
| None if dataset == 'letter' else 'chem', | |||
| None if dataset == 'letter' else 'valence', | |||
| False) | |||
| dis_k_gm = dis_gstar(0, range(1, 1+len(Gn_median)), | |||
| [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False) | |||
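`dis_k_sm` and `dis_k_gm` measure, in the kernel's feature space, how far the set/generalized median lies from the mean embedding of the selected graphs: with uniform weights 1/n, d^2(g, g_bar) = k(g, g) - (2/n) * sum_i k(g, g_i) + (1/n^2) * sum_{i,j} k(g_i, g_j), and `withterm3=False` drops the last term, which is the same for every candidate g. A NumPy sketch of that computation on a precomputed kernel matrix, assuming the project's `dis_gstar` helper (not shown in this hunk) follows this standard preimage formulation:

```python
import numpy as np

def dis_to_mean_embedding(idx_g, idx_members, K, with_term3=True):
    """Feature-space distance from graph idx_g to the uniform mean of idx_members,
    given a precomputed kernel matrix K."""
    alpha = np.full(len(idx_members), 1.0 / len(idx_members))
    term1 = K[idx_g, idx_g]
    term2 = 2.0 * alpha @ K[idx_g, idx_members]
    term3 = alpha @ K[np.ix_(idx_members, idx_members)] @ alpha if with_term3 else 0.0
    return np.sqrt(max(term1 - term2 + term3, 0.0))

# toy kernel matrix: graph 0 is the median candidate, 1..3 the median set
K = np.array([[1.0, 0.4, 0.3, 0.5],
              [0.4, 1.0, 0.6, 0.5],
              [0.3, 0.6, 1.0, 0.5],
              [0.5, 0.5, 0.5, 1.0]])
print(dis_to_mean_embedding(0, [1, 2, 3], K))
print(dis_to_mean_embedding(0, [1, 2, 3], K, with_term3=False))  # ranking-only variant
```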
| @@ -61,8 +61,8 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose): | |||
| {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'treeletkernel': | |||
| # pkernel = functools.partial(polynomialkernel, d=2, c=1e5) | |||
| pkernel = functools.partial(gaussiankernel, gamma=1e-6) | |||
| pkernel = functools.partial(polynomialkernel, d=2, c=1e5) | |||
| # pkernel = functools.partial(gaussiankernel, gamma=1e-6) | |||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||
| Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, | |||
| sub_kernel=pkernel, | |||
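The swap above changes the treelet sub-kernel from a Gaussian to a polynomial kernel applied to the treelet count vectors. For reference, hedged stand-ins for the two functions; the real ones come from this project's utilities, and only their usual mathematical forms are assumed here: gaussian k(x, y) = exp(-gamma * ||x - y||^2) and polynomial k(x, y) = (<x, y> + c)^d.

```python
import functools
import numpy as np

def gaussiankernel(x, y, gamma=1e-6):
    """k(x, y) = exp(-gamma * ||x - y||^2)  (assumed form of the project's helper)."""
    d = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.exp(-gamma * np.dot(d, d))

def polynomialkernel(x, y, d=2, c=1e5):
    """k(x, y) = (<x, y> + c)^d  (assumed form of the project's helper)."""
    return (np.dot(np.asarray(x, dtype=float), np.asarray(y, dtype=float)) + c) ** d

# the patch fixes d and c via functools.partial, exactly as above:
pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
print(pkernel([1.0, 0.0, 2.0], [0.5, 1.0, 1.0]))   # (2.5 + 1e5) ** 2
```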
| @@ -19,11 +19,13 @@ from preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_at | |||
| from preimage.utils import get_same_item_indices | |||
| from preimage.find_best_k import getRelations | |||
| def xp_letter_h(): | |||
| ds = {'name': 'Letter-high', | |||
| 'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml', | |||
| def xp_letter_h_LETTER2_cost(): | |||
| ds = {'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml', | |||
| 'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) | |||
| for G in Gn: | |||
| reform_attributes(G) | |||
| # ds = {'name': 'Letter-high', | |||
| # 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb | |||
| # Gn, y_all = loadDataset(ds['dataset']) | |||
| @@ -33,32 +35,35 @@ def xp_letter_h(): | |||
| edge_label = None | |||
| ds_name = 'letter-h' | |||
| dir_output = 'results/xp_letter_h/' | |||
| save_results = True | |||
| cost = 'LETTER2' | |||
| repeats = 1 | |||
| # k_list = range(2, 11) | |||
| k_list = [150] | |||
| fit_method = 'precomputed' | |||
| fit_method = 'k-graphs' | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| # create result files. | |||
| fn_output_detail = 'results_detail.' + fit_method + '.csv' | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', 'median set']) | |||
| f_detail.close() | |||
| fn_output_summary = 'results_summary.csv' | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', | |||
| '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', | |||
| 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', | |||
| 'repeats better dis_k gi -> GM']) | |||
| f_summary.close() | |||
| if save_results: | |||
| # create result files. | |||
| fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', 'median set']) | |||
| f_detail.close() | |||
| fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', | |||
| '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', | |||
| 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', | |||
| 'repeats better dis_k gi -> GM']) | |||
| f_summary.close() | |||
| random.seed(1) | |||
| rdn_seed_list = random.sample(range(0, repeats * 100), repeats) | |||
| @@ -82,11 +87,11 @@ def xp_letter_h(): | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
| print('\ny =', y) | |||
| # y = 'I' | |||
| # y = 'F' | |||
| # values = y_idx[y] | |||
| # values = values[0:10] | |||
| # k = len(values) | |||
| # k = kkk | |||
| k = len(values) | |||
| sod_sm_list = [] | |||
| sod_gm_list = [] | |||
| @@ -114,20 +119,21 @@ def xp_letter_h(): | |||
| = median_on_k_closest_graphs(Gn_median, node_label, edge_label, | |||
| gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'], | |||
| edit_costs=None, group_min=median_set_idx_idx, | |||
| dataset='Letter', parallel=False) | |||
| dataset='Letter', cost=cost, parallel=False) | |||
| # write result detail. | |||
| sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) | |||
| dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) | |||
| dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) | |||
| dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, | |||
| y, repeat, | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, | |||
| dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, | |||
| dis_k_gi2gm, median_set_idx]) | |||
| f_detail.close() | |||
| if save_results: | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, | |||
| y, repeat, | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, | |||
| dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, | |||
| dis_k_gi2gm, median_set_idx]) | |||
| f_detail.close() | |||
| # compute result summary. | |||
| sod_sm_list.append(sod_sm) | |||
| @@ -170,14 +176,17 @@ def xp_letter_h(): | |||
| # save median graphs. | |||
| fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl' | |||
| fn_pre_sm_new = dir_output + 'medians/set_median.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| copyfile(fname_sm, fn_pre_sm_new + '.gxl') | |||
| fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl' | |||
| fn_pre_gm_new = dir_output + 'medians/gen_median.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| copyfile(fname_gm, fn_pre_gm_new + '.gxl') | |||
| G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() | |||
| reform_attributes(G_best_kernel) | |||
| fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter') | |||
| # plot median graphs. | |||
| @@ -197,16 +206,17 @@ def xp_letter_h(): | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, | |||
| sod_sm_mean_list[-1], sod_gm_mean_list[-1], | |||
| dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], | |||
| dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, | |||
| nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, | |||
| repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, | |||
| repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) | |||
| f_summary.close() | |||
| if save_results: | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, | |||
| sod_sm_mean_list[-1], sod_gm_mean_list[-1], | |||
| dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], | |||
| dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, | |||
| nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, | |||
| repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, | |||
| repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) | |||
| f_summary.close() | |||
| # write overall result summary across letters. | |||
| @@ -219,13 +229,232 @@ def xp_letter_h(): | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) | |||
| if save_results: | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all', | |||
| sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, | |||
| dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean]) | |||
| f_summary.close() | |||
| print('\ncomplete.') | |||
| def xp_letter_h(): | |||
| ds = {'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml', | |||
| 'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) | |||
| for G in Gn: | |||
| reform_attributes(G) | |||
| # ds = {'name': 'Letter-high', | |||
| # 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb | |||
| # Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| gkernel = 'structuralspkernel' | |||
| node_label = None | |||
| edge_label = None | |||
| ds_name = 'letter-h' | |||
| dir_output = 'results/xp_letter_h/' | |||
| save_results = False | |||
| repeats = 1 | |||
| # k_list = range(2, 11) | |||
| k_list = [150] | |||
| fit_method = 'k-graphs' | |||
| # get indices by classes. | |||
| y_idx = get_same_item_indices(y_all) | |||
| if save_results: | |||
| # create result files. | |||
| fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', 'median set']) | |||
| f_detail.close() | |||
| fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||
| 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||
| 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||
| 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', | |||
| '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', | |||
| 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', | |||
| 'repeats better dis_k gi -> GM']) | |||
| f_summary.close() | |||
| random.seed(1) | |||
| rdn_seed_list = random.sample(range(0, repeats * 100), repeats) | |||
| for k in k_list: | |||
| print('\n--------- k =', k, '----------') | |||
| sod_sm_mean_list = [] | |||
| sod_gm_mean_list = [] | |||
| dis_k_sm_mean_list = [] | |||
| dis_k_gm_mean_list = [] | |||
| dis_k_gi_min_mean_list = [] | |||
| # nb_sod_sm2gm = [0, 0, 0] | |||
| # nb_dis_k_sm2gm = [0, 0, 0] | |||
| # nb_dis_k_gi2sm = [0, 0, 0] | |||
| # nb_dis_k_gi2gm = [0, 0, 0] | |||
| # repeats_better_sod_sm2gm = [] | |||
| # repeats_better_dis_k_sm2gm = [] | |||
| # repeats_better_dis_k_gi2sm = [] | |||
| # repeats_better_dis_k_gi2gm = [] | |||
| for i, (y, values) in enumerate(y_idx.items()): | |||
| print('\ny =', y) | |||
| # y = 'N' | |||
| # values = y_idx[y] | |||
| # values = values[0:10] | |||
| k = len(values) | |||
| sod_sm_list = [] | |||
| sod_gm_list = [] | |||
| dis_k_sm_list = [] | |||
| dis_k_gm_list = [] | |||
| dis_k_gi_min_list = [] | |||
| nb_sod_sm2gm = [0, 0, 0] | |||
| nb_dis_k_sm2gm = [0, 0, 0] | |||
| nb_dis_k_gi2sm = [0, 0, 0] | |||
| nb_dis_k_gi2gm = [0, 0, 0] | |||
| repeats_better_sod_sm2gm = [] | |||
| repeats_better_dis_k_sm2gm = [] | |||
| repeats_better_dis_k_gi2sm = [] | |||
| repeats_better_dis_k_gi2gm = [] | |||
| for repeat in range(repeats): | |||
| print('\nrepeat =', repeat) | |||
| random.seed(rdn_seed_list[repeat]) | |||
| median_set_idx_idx = random.sample(range(0, len(values)), k) | |||
| median_set_idx = [values[idx] for idx in median_set_idx_idx] | |||
| print('median set: ', median_set_idx) | |||
| Gn_median = [Gn[g] for g in values] | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \ | |||
| = median_on_k_closest_graphs(Gn_median, node_label, edge_label, | |||
| gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'], | |||
| edit_costs=None, group_min=median_set_idx_idx, | |||
| dataset='Letter', parallel=False) | |||
| # write result detail. | |||
| sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) | |||
| dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) | |||
| dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) | |||
| dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) | |||
| if save_results: | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, | |||
| y, repeat, | |||
| sod_sm, sod_gm, dis_k_sm, dis_k_gm, | |||
| dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, | |||
| dis_k_gi2gm, median_set_idx]) | |||
| f_detail.close() | |||
| # compute result summary. | |||
| sod_sm_list.append(sod_sm) | |||
| sod_gm_list.append(sod_gm) | |||
| dis_k_sm_list.append(dis_k_sm) | |||
| dis_k_gm_list.append(dis_k_gm) | |||
| dis_k_gi_min_list.append(dis_k_gi_min) | |||
| # # SOD SM -> GM | |||
| if sod_sm > sod_gm: | |||
| nb_sod_sm2gm[0] += 1 | |||
| repeats_better_sod_sm2gm.append(repeat) | |||
| elif sod_sm == sod_gm: | |||
| nb_sod_sm2gm[1] += 1 | |||
| elif sod_sm < sod_gm: | |||
| nb_sod_sm2gm[2] += 1 | |||
| # # dis_k SM -> GM | |||
| if dis_k_sm > dis_k_gm: | |||
| nb_dis_k_sm2gm[0] += 1 | |||
| repeats_better_dis_k_sm2gm.append(repeat) | |||
| elif dis_k_sm == dis_k_gm: | |||
| nb_dis_k_sm2gm[1] += 1 | |||
| elif dis_k_sm < dis_k_gm: | |||
| nb_dis_k_sm2gm[2] += 1 | |||
| # # dis_k gi -> SM | |||
| if dis_k_gi_min > dis_k_sm: | |||
| nb_dis_k_gi2sm[0] += 1 | |||
| repeats_better_dis_k_gi2sm.append(repeat) | |||
| elif dis_k_gi_min == dis_k_sm: | |||
| nb_dis_k_gi2sm[1] += 1 | |||
| elif dis_k_gi_min < dis_k_sm: | |||
| nb_dis_k_gi2sm[2] += 1 | |||
| # # dis_k gi -> GM | |||
| if dis_k_gi_min > dis_k_gm: | |||
| nb_dis_k_gi2gm[0] += 1 | |||
| repeats_better_dis_k_gi2gm.append(repeat) | |||
| elif dis_k_gi_min == dis_k_gm: | |||
| nb_dis_k_gi2gm[1] += 1 | |||
| elif dis_k_gi_min < dis_k_gm: | |||
| nb_dis_k_gi2gm[2] += 1 | |||
| # save median graphs. | |||
| fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl' | |||
| fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| copyfile(fname_sm, fn_pre_sm_new + '.gxl') | |||
| fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl' | |||
| fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| copyfile(fname_gm, fn_pre_gm_new + '.gxl') | |||
| G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() | |||
| reform_attributes(G_best_kernel) | |||
| fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ | |||
| + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||
| saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter') | |||
| # plot median graphs. | |||
| set_median = loadGXL(fn_pre_sm_new + '.gxl') | |||
| gen_median = loadGXL(fn_pre_gm_new + '.gxl') | |||
| draw_Letter_graph(set_median, fn_pre_sm_new) | |||
| draw_Letter_graph(gen_median, fn_pre_gm_new) | |||
| draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel) | |||
| # write result summary for each letter. | |||
| sod_sm_mean_list.append(np.mean(sod_sm_list)) | |||
| sod_gm_mean_list.append(np.mean(sod_gm_list)) | |||
| dis_k_sm_mean_list.append(np.mean(dis_k_sm_list)) | |||
| dis_k_gm_mean_list.append(np.mean(dis_k_gm_list)) | |||
| dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list)) | |||
| sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1])) | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | |||
| if save_results: | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, | |||
| sod_sm_mean_list[-1], sod_gm_mean_list[-1], | |||
| dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], | |||
| dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, | |||
| nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, | |||
| repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, | |||
| repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) | |||
| f_summary.close() | |||
| # write overall result summary across letters. | |||
| sod_sm_mean = np.mean(sod_sm_mean_list) | |||
| sod_gm_mean = np.mean(sod_gm_mean_list) | |||
| dis_k_sm_mean = np.mean(dis_k_sm_mean_list) | |||
| dis_k_gm_mean = np.mean(dis_k_gm_mean_list) | |||
| dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list) | |||
| sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean)) | |||
| dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) | |||
| dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) | |||
| dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) | |||
| if save_results: | |||
| f_summary = open(dir_output + fn_output_summary, 'a') | |||
| csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all', | |||
| sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, | |||
| dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, | |||
| dis_k_gi2sm_mean, dis_k_gi2gm_mean]) | |||
| f_summary.close() | |||
| print('\ncomplete.') | |||
| @@ -243,4 +472,5 @@ def draw_Letter_graph(graph, file_prefix): | |||
| if __name__ == "__main__": | |||
| xp_letter_h() | |||
| # xp_letter_h() | |||
| xp_letter_h_LETTER2_cost() | |||