@@ -15,24 +15,28 @@ import time
 import random
 from scipy import optimize
+from scipy.optimize import minimize
 import cvxpy as cp
 
 import sys
-#sys.path.insert(0, "../")
-from ged import GED, get_nb_edit_operations
-from utils import kernel_distance_matrix
+sys.path.insert(0, "../")
+from preimage.ged import GED, get_nb_edit_operations, get_nb_edit_operations_letter
+from preimage.utils import kernel_distance_matrix
 
 
-def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4,
+def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
                                params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
                                            'method': 'IPFP', 'stabilizer': None},
                                init_costs=[3, 3, 1, 3, 3, 1],
+                               dataset='monoterpenoides',
                                parallel=True):
+    dataset = dataset.lower()
     # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
 #    random.seed(1)
 #    cost_rdm = random.sample(range(1, 10), 6)
 #    init_costs = cost_rdm + [0]
 #    init_costs = cost_rdm
-    init_costs = [3, 3, 1, 3, 3, 1]
+#    init_costs = [3, 3, 1, 3, 3, 1]
 #    init_costs = [i * 0.01 for i in cost_rdm] + [0]
 #    init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
 #    init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
@@ -51,8 +55,10 @@ def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4
     # init ged.
     print('\ninitial:')
     time0 = time.time()
+    params_ged['dataset'] = dataset
     params_ged['edit_cost_constant'] = init_costs
     ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
+                                                            dataset,
                                                             parallel=parallel)
     residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
     time_list = [time.time() - time0]
@@ -67,20 +73,21 @@ def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4
         time0 = time.time()
         # "fit" geds to distances in feature space by tuning edit costs using the
         # Least Squares Method.
-        edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec)
+        edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec,
+                                                dataset=dataset, cost=params_ged['cost'])
         for i in range(len(edit_costs_new)):
+            if -1e-9 <= edit_costs_new[i] <= 1e-9:
+                edit_costs_new[i] = 0
             if edit_costs_new[i] < 0:
-                if edit_costs_new[i] > -1e-9:
-                    edit_costs_new[i] = 0
-                else:
-                    raise ValueError('The edit cost is negative.')
+                raise ValueError('The edit cost is negative.')
 #        for i in range(len(edit_costs_new)):
 #            if edit_costs_new[i] < 0:
 #                edit_costs_new[i] = 0
 
         # compute new GEDs and numbers of edit operations.
-        params_ged['edit_cost_constant'] = edit_costs_new
-        ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
+        params_ged['edit_cost_constant'] = edit_costs_new # np.array([edit_costs_new[0], edit_costs_new[1], 0.75])
+        ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
+                                                           dataset,
                                                            parallel=parallel)
         residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
         time_list.append(time.time() - time0)
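Note: each pass of this loop treats the GED of a pair as linear in the edit costs once the node maps are fixed (ged ≈ nb_cost_mat · costs), so refitting the costs against the kernel distances is a least-squares problem. A minimal standalone sketch of one such refit on synthetic data (not the module's update_costs; names and numbers below are illustrative only):

# One cost-refit step, assuming GED is linear in the edit costs for fixed
# node maps: ged ≈ nb_cost_mat @ costs. All data here is synthetic.
import numpy as np

rng = np.random.default_rng(0)
n_pairs = 20
nb_cost_mat = rng.integers(0, 5, size=(n_pairs, 6)).astype(float)  # edit-op counts per pair
true_costs = np.array([3.0, 3.0, 1.0, 3.0, 3.0, 1.0])
dis_k_vec = nb_cost_mat @ true_costs + rng.normal(0, 0.1, n_pairs)  # target "kernel distances"

# unconstrained least squares, then clip tiny negatives to zero
costs, *_ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, rcond=None)
costs[np.abs(costs) < 1e-9] = 0
residual = np.sqrt(np.sum((nb_cost_mat @ costs - dis_k_vec) ** 2))
print(costs, residual)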
@@ -94,7 +101,8 @@ def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4
            time_list, nb_cost_mat_list
 
 
-def compute_geds(Gn, params_ged, parallel=False):
+def compute_geds(Gn, params_ged, dataset, parallel=False):
+    get_nb_eo = get_nb_edit_operations_letter if dataset == 'letter' else get_nb_edit_operations
     ged_mat = np.zeros((len(Gn), len(Gn)))
     if parallel:
 #        print('parallel')
@@ -112,7 +120,7 @@ def compute_geds(Gn, params_ged, parallel=False):
         def init_worker(gn_toshare):
             global G_gn
             G_gn = gn_toshare
-        do_partial = partial(_wrapper_compute_ged_parallel, params_ged)
+        do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo)
         pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
         iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
                         desc='computing GEDs', file=sys.stdout)
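Note: the parallel branch shares the graph list with worker processes through the Pool initializer (a module-level global set once per worker) and binds the fixed arguments with functools.partial, so imap_unordered only streams the (i, j) index pairs. A standalone sketch of the same pattern with a toy payload (illustrative names, not the module's worker):

# Pool-initializer + functools.partial pattern: share read-only data once per
# worker, bind fixed arguments, stream only the index pairs.
import multiprocessing
from functools import partial
from itertools import combinations

def _init_worker(items_toshare):
    global G_items              # set once in each worker process
    G_items = items_toshare

def _worker(scale, ij):
    i, j = ij
    return i, j, scale * (G_items[i] + G_items[j])

if __name__ == '__main__':
    items = list(range(6))
    do_partial = partial(_worker, 10)      # bind the fixed argument
    with multiprocessing.Pool(processes=2, initializer=_init_worker,
                              initargs=(items,)) as pool:
        for i, j, val in pool.imap_unordered(do_partial,
                                             combinations(range(len(items)), 2)):
            print(i, j, val)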
@@ -138,26 +146,146 @@ def compute_geds(Gn, params_ged, parallel=False):
                 ged_vec.append(dis)
                 ged_mat[i][j] = dis
                 ged_mat[j][i] = dis
-                n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
+                n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward)
                 n_edit_operations.append(n_eo_tmp)
     return ged_vec, ged_mat, n_edit_operations
 
 
-def _wrapper_compute_ged_parallel(params_ged, itr):
+def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr):
     i = itr[0]
     j = itr[1]
-    dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged)
+    dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged, get_nb_eo)
     return i, j, dis, n_eo_tmp
 
 
-def _compute_ged_parallel(g1, g2, params_ged):
+def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo):
     dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
-    n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
+    n_eo_tmp = get_nb_eo(g1, g2, pi_forward, pi_backward) # [0,0,0,0,0,0]
     return dis, n_eo_tmp
 
 
-def update_costs(nb_cost_mat, dis_k_vec):
+def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides',
+                 cost='CONSTANT', rw_constraints='2constraints'):
+    if dataset.lower() == 'letter':
+        if cost == 'LETTER':
+            pass
+#            # method 1: set alpha automatically, just tune c_vir and c_eir by
+#            # LMS using cvxpy.
+#            alpha = 0.5
+#            coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec)
+##            if np.count_nonzero(nb_cost_mat[:,4]) == 0:
+##                alpha = 0.75
+##            else:
+##                alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0])
+##                alpha = alpha * 0.99
+#            param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1])
+#            param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5])
+#            nb_cost_mat_new = np.column_stack((param_vir, param_eir))
+#            dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3]
+#
+#            x = cp.Variable(nb_cost_mat_new.shape[1])
+#            cost = cp.sum_squares(nb_cost_mat_new * x - dis_new)
+#            constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
+#            prob = cp.Problem(cp.Minimize(cost), constraints)
+#            prob.solve()
+#            edit_costs_new = x.value
+#            edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha])
+#            residual = np.sqrt(prob.value)
+
+#            # method 2: tune c_vir, c_eir and alpha by nonlinear programming by
+#            # scipy.optimize.minimize.
+#            w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
+#            w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
+#            w2 = nb_cost_mat[:,3]
+#            w3 = dis_k_vec
+#            func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
+#                                         + w2 * x[2] - w3 * x[3]) ** 2)
+#            bounds = ((0, None), (0., None), (0.5, 0.5), (0, None))
+#            res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds)
+#            edit_costs_new = res.x[0:3]
+#            residual = res.fun
+
+            # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy.
+
+#            # method 4: tune c_vir, c_eir and alpha by QP function
+#            # scipy.optimize.least_squares. An initial guess is required.
+#            w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
+#            w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
+#            w2 = nb_cost_mat[:,3]
+#            w3 = dis_k_vec
+#            func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
+#                              + w2 * x[2] - w3 * x[3]) ** 2
+#            res = optimize.root(func, [0.9, 1.7, 0.75, 100])
+#            edit_costs_new = res.x
+#            residual = None
+
+        elif cost == 'LETTER2':
+#            # 1. if c_vi != c_vr, c_ei != c_er.
+#            nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+#            x = cp.Variable(nb_cost_mat_new.shape[1])
+#            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+##            # 1.1 no constraints.
+##            constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
+#            # 1.2 c_vs <= c_vi + c_vr.
+#            constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+#                           np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+##            # 2. if c_vi == c_vr, c_ei == c_er.
+##            nb_cost_mat_new = nb_cost_mat[:,[0,3,4]]
+##            nb_cost_mat_new[:,0] += nb_cost_mat[:,1]
+##            nb_cost_mat_new[:,2] += nb_cost_mat[:,5]
+##            x = cp.Variable(nb_cost_mat_new.shape[1])
+##            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+##            # 2.1 no constraints.
+##            constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
+###            # 2.2 c_vs <= c_vi + c_vr.
+###            constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+###                           np.array([2.0, -1.0, 0.0]).T@x >= 0.0]
+#
+#            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+#            prob.solve()
+#            edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
+#            edit_costs_new = np.array(edit_costs_new)
+#            residual = np.sqrt(prob.value)
+
+            if rw_constraints == 'inequality':
+                # c_vs <= c_vi + c_vr.
+                nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+                x = cp.Variable(nb_cost_mat_new.shape[1])
+                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+                constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
+                               np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+                prob.solve()
+                edit_costs_new = x.value
+                residual = np.sqrt(prob.value)
+            elif rw_constraints == '2constraints':
+                # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
+                nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+                x = cp.Variable(nb_cost_mat_new.shape[1])
+                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+                constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
+                               np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
+                               np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
+                               np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
+                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+                prob.solve()
+                edit_costs_new = x.value
+                residual = np.sqrt(prob.value)
+#            elif method == 'inequality_modified':
+#                # c_vs <= c_vi + c_vr.
+#                nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+#                x = cp.Variable(nb_cost_mat_new.shape[1])
+#                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+#                constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+#                               np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+#                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+#                prob.solve()
+#                # use same costs for insertion and removal rather than the fitted costs.
+#                edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
+#                edit_costs_new = np.array(edit_costs_new)
+#                residual = np.sqrt(prob.value)
+    else:
 #    # method 1: simple least square method.
 #    edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
 #                                                     rcond=None)
@@ -181,16 +309,16 @@ def update_costs(nb_cost_mat, dis_k_vec):
 #        G = -1 * np.identity(nb_cost_mat.shape[1])
 #        h = np.array([0 for i in range(nb_cost_mat.shape[1])])
-    x = cp.Variable(nb_cost_mat.shape[1])
-    cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
-    constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])],
-#                   np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
-                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
-                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
-    prob = cp.Problem(cp.Minimize(cost), constraints)
-    prob.solve()
-    edit_costs_new = x.value
-    residual = np.sqrt(prob.value)
+        x = cp.Variable(nb_cost_mat.shape[1])
+        cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
+        constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
+#                       np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+                       np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
+                       np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
+        prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+        prob.solve()
+        edit_costs_new = x.value
+        residual = np.sqrt(prob.value)
 
 #    # method 4:
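Note: both branches of update_costs solve the same kind of problem — a least-squares fit of the edit-cost vector to the kernel distances under linear constraints, via cvxpy. A self-contained sketch of the 'inequality' variant on synthetic data, assuming the column order [c_vi, c_vr, c_vs, c_ei, c_er] used by nb_cost_mat[:, [0, 1, 3, 4, 5]] (illustrative data, not the module's function):

# minimize ||A x - d||^2  s.t.  x >= 0.01  and  c_vi + c_vr - c_vs >= 0.
import numpy as np
import cvxpy as cp

rng = np.random.default_rng(1)
A = rng.integers(0, 4, size=(30, 5)).astype(float)   # edit-operation counts per graph pair
d = A @ np.array([0.7, 0.7, 0.8, 0.4, 0.4]) + rng.normal(0, 0.05, 30)

x = cp.Variable(5)
objective = cp.Minimize(cp.sum_squares(A @ x - d))
constraints = [x >= 0.01,
               np.array([1.0, 1.0, -1.0, 0.0, 0.0]) @ x >= 0.0]   # c_vs <= c_vi + c_vr
prob = cp.Problem(objective, constraints)
prob.solve()
print('fitted costs:', x.value, 'residual:', np.sqrt(prob.value))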
@@ -13,33 +13,46 @@ import multiprocessing
 from multiprocessing import Pool
 from functools import partial
-from gedlibpy_linlin import librariesImport, gedlibpy
+#from gedlibpy_linlin import librariesImport, gedlibpy
+from libs import *
 
 
-def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
+def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP',
         edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50):
     """
    Compute GED for 2 graphs.
    """
 
-    def convertGraph(G):
+    def convertGraph(G, dataset):
         """Convert a graph to the proper NetworkX format that can be
        recognized by library gedlibpy.
        """
         G_new = nx.Graph()
-        for nd, attrs in G.nodes(data=True):
-            G_new.add_node(str(nd), chem=attrs['atom'])
-#            G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
-#                           y=str(attrs['attributes'][1]))
-        for nd1, nd2, attrs in G.edges(data=True):
-#            G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
-            G_new.add_edge(str(nd1), str(nd2))
+        if dataset == 'monoterpenoides':
+            for nd, attrs in G.nodes(data=True):
+                G_new.add_node(str(nd), chem=attrs['atom'])
+            for nd1, nd2, attrs in G.edges(data=True):
+                G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+        elif dataset == 'letter':
+            for nd, attrs in G.nodes(data=True):
+                G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
+                               y=str(attrs['attributes'][1]))
+            for nd1, nd2, attrs in G.edges(data=True):
+                G_new.add_edge(str(nd1), str(nd2))
+        else:
+            for nd, attrs in G.nodes(data=True):
+                G_new.add_node(str(nd), chem=attrs['atom'])
+            for nd1, nd2, attrs in G.edges(data=True):
+                G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+#                G_new.add_edge(str(nd1), str(nd2))
 
         return G_new
 
+    dataset = dataset.lower()
     if lib == 'gedlibpy':
         gedlibpy.restart_env()
-        gedlibpy.add_nx_graph(convertGraph(g1), "")
-        gedlibpy.add_nx_graph(convertGraph(g2), "")
+        gedlibpy.add_nx_graph(convertGraph(g1, dataset), "")
+        gedlibpy.add_nx_graph(convertGraph(g2, dataset), "")
         listID = gedlibpy.get_all_graph_ids()
         gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
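Note: convertGraph now rewrites node/edge attributes per dataset before the graphs are handed to gedlibpy — chemical graphs keep chem/valence labels, Letter graphs get string-valued x/y coordinates and unlabeled edges. A standalone illustration of the 'letter' branch (pure NetworkX; gedlibpy is not needed for this part):

# Node ids become strings and the numeric 'attributes' vector becomes
# string-valued x/y labels, as in the 'letter' branch above.
import networkx as nx

G = nx.Graph()
G.add_node(0, attributes=[1.5, 2.0])
G.add_node(1, attributes=[0.0, 1.0])
G.add_edge(0, 1)

G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
    G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
                   y=str(attrs['attributes'][1]))
for nd1, nd2, attrs in G.edges(data=True):
    G_new.add_edge(str(nd1), str(nd2))

print(G_new.nodes(data=True))   # [('0', {'x': '1.5', 'y': '2.0'}), ('1', {'x': '0.0', 'y': '1.0'})]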
@@ -310,6 +323,60 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map):
         if map_i == np.inf:
             n_vi += 1
 
+#    idx_nodes1 = range(0, len(node1))
+
+    edges1 = [e for e in g1.edges()]
+    nb_edges2_cnted = 0
+    for n1, n2 in edges1:
+        idx1 = nodes1.index(n1)
+        idx2 = nodes1.index(n2)
+        # one of the nodes is removed, thus the edge is removed.
+        if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf:
+            n_er += 1
+        # corresponding edge is in g2.
+        elif (forward_map[idx1], forward_map[idx2]) in g2.edges():
+            nb_edges2_cnted += 1
+            # edge labels are different.
+            if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \
+                != g1.edges[(n1, n2)]['bond_type']:
+                n_es += 1
+        elif (forward_map[idx2], forward_map[idx1]) in g2.edges():
+            nb_edges2_cnted += 1
+            # edge labels are different.
+            if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \
+                != g1.edges[(n1, n2)]['bond_type']:
+                n_es += 1
+        # corresponding nodes are in g2, however the edge is removed.
+        else:
+            n_er += 1
+    n_ei = nx.number_of_edges(g2) - nb_edges2_cnted
+
+    return n_vi, n_vr, n_vs, n_ei, n_er, n_es
+
+
+def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map):
+    """Compute the number of each edit operations.
+    """
+    n_vi = 0
+    n_vr = 0
+    n_vs = 0
+    sod_vs = 0
+    n_ei = 0
+    n_er = 0
+
+    nodes1 = [n for n in g1.nodes()]
+    for i, map_i in enumerate(forward_map):
+        if map_i == np.inf:
+            n_vr += 1
+        else:
+            n_vs += 1
+            diff_x = float(g1.nodes[i]['x']) - float(g2.nodes[map_i]['x'])
+            diff_y = float(g1.nodes[i]['y']) - float(g2.nodes[map_i]['y'])
+            sod_vs += np.sqrt(np.square(diff_x) + np.square(diff_y))
+    for map_i in backward_map:
+        if map_i == np.inf:
+            n_vi += 1
+
 #    idx_nodes1 = range(0, len(node1))
 
     edges1 = [e for e in g1.edges()]
@@ -329,4 +396,8 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map):
             n_er += 1
     n_ei = nx.number_of_edges(g2) - nb_edges2_cnted
 
-    return n_vi, n_vr, n_vs, n_ei, n_er, n_es
+    return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er
+
+
+if __name__ == '__main__':
+    print('check test_ged.py')
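Note: for the Letter cost models the new counter returns (n_vi, n_vr, n_vs, sod_vs, n_ei, n_er), where sod_vs accumulates the Euclidean distance between the coordinates of substituted nodes rather than a plain substitution count. A toy worked example of that node-level counting under a given mapping (simplified standalone re-implementation that assumes integer node ids 0..n-1 and ignores edges):

import numpy as np
import networkx as nx

g1 = nx.Graph(); g1.add_node(0, x='0.0', y='0.0'); g1.add_node(1, x='1.0', y='0.0')
g2 = nx.Graph(); g2.add_node(0, x='0.0', y='1.0')

forward_map = [0, np.inf]      # g1 node 0 -> g2 node 0; g1 node 1 removed
backward_map = [0]             # every g2 node has a pre-image, so nothing inserted

n_vi = sum(1 for m in backward_map if m == np.inf)
n_vr = sum(1 for m in forward_map if m == np.inf)
n_vs, sod_vs = 0, 0.0
for i, m in enumerate(forward_map):
    if m != np.inf:
        n_vs += 1
        dx = float(g1.nodes[i]['x']) - float(g2.nodes[m]['x'])
        dy = float(g1.nodes[i]['y']) - float(g2.nodes[m]['y'])
        sod_vs += np.hypot(dx, dy)

print(n_vi, n_vr, n_vs, sod_vs)   # 0 1 1 1.0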
@@ -436,7 +436,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
     return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median
 
 
-def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides',
+def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT',
+             dataset='monoterpenoides',
              graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'):
     """Compute the iam by c++ implementation (gedlib) through bash.
     """
@@ -467,12 +468,12 @@ def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides',
 #    graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl'
 
-    command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
+    command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n'
     command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
     command += 'export LD_LIBRARY_PATH\n'
     command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
     command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \
-        + ' \'' + graph_dir + '\' '
+        + ' \'' + graph_dir + '\' ' + ' ' + cost + ' '
     if edit_cost_constant is None:
         command += 'None'
     else:
@@ -484,7 +485,7 @@ def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides',
     output = stream.readlines()
 #    print(output)
     sod_sm = float(output[0].strip())
-    sod_gm= float(output[1].strip())
+    sod_gm = float(output[1].strip())
 
     fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
     fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
@@ -31,8 +31,9 @@ from fitDistance import fit_GED_to_kernel_distance
 def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method,
                                graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/',
                                edit_costs=None, group_min=None, dataset='monoterpenoides',
-                               parallel=True):
+                               cost='CONSTANT', parallel=True):
+    dataset = dataset.lower()
 
 #    # compute distances in kernel space.
 #    dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
 #                                              Kmatrix=None, gkernel=gkernel)
@@ -50,32 +51,53 @@ def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_metho
 #    group_min = (12, 13, 22, 29) # closest w.r.t path kernel
 #    group_min = (77, 85, 160, 171) # closest w.r.t ged
 #    group_min = (0,1,2,3,4,5,6,7,8,9,10,11) # closest w.r.t treelet kernel
     Gn_median = [Gn[g].copy() for g in group_min]
 
     # fit edit costs.
     if fit_method == 'random': # random
-        edit_cost_constant = random.sample(range(1, 10), 6)
+        if cost == 'LETTER':
+            edit_cost_constant = random.sample(range(1, 10), 3)
+            edit_cost_constant = [item * 0.1 for item in edit_cost_constant]
+        elif cost == 'LETTER2':
+            random.seed(time.time())
+            edit_cost_constant = random.sample(range(1, 10), 5)
+#            edit_cost_constant = [item * 0.1 for item in edit_cost_constant]
+        else:
+            edit_cost_constant = random.sample(range(1, 10), 6)
         print('edit costs used:', edit_cost_constant)
     elif fit_method == 'expert': # expert
         edit_cost_constant = [3, 3, 1, 3, 3, 1]
     elif fit_method == 'k-graphs':
         itr_max = 6
-        algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
-        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
-                      'algo_options': algo_options, 'stabilizer': None}
+        if cost == 'LETTER':
+            init_costs = [0.9, 1.7, 0.75]
+        elif cost == 'LETTER2':
+            init_costs = [0.675, 0.675, 0.75, 0.425, 0.425]
+        else:
+            init_costs = [3, 3, 1, 3, 3, 1]
+        algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
+        params_ged = {'lib': 'gedlibpy', 'cost': cost, 'method': 'IPFP',
+                      'algo_options': algo_options, 'stabilizer': None}
         # fit on k-graph subset
         edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn_median,
-            node_label, edge_label, gkernel, itr_max, params_ged=params_ged, parallel=True)
+            node_label, edge_label, gkernel, itr_max, params_ged=params_ged,
+            init_costs=init_costs, dataset=dataset, parallel=True)
     elif fit_method == 'whole-dataset':
         itr_max = 6
-        algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
-        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
+        if cost == 'LETTER':
+            init_costs = [0.9, 1.7, 0.75]
+        elif cost == 'LETTER2':
+            init_costs = [0.675, 0.675, 0.75, 0.425, 0.425]
+        else:
+            init_costs = [3, 3, 1, 3, 3, 1]
+        algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
+        params_ged = {'lib': 'gedlibpy', 'cost': cost, 'method': 'IPFP',
                       'algo_options': algo_options, 'stabilizer': None}
         # fit on all subset
         edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn,
-            node_label, edge_label, gkernel, itr_max, params_ged=params_ged, parallel=True)
+            node_label, edge_label, gkernel, itr_max, params_ged=params_ged,
+            init_costs=init_costs, dataset=dataset, parallel=True)
     elif fit_method == 'precomputed':
         edit_cost_constant = edit_costs
@@ -83,14 +105,17 @@ def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_metho
     # compute set median and gen median using IAM (C++ through bash).
     group_fnames = [Gn[g].graph['filename'] for g in group_min]
     sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant,
-                                                  graph_dir=graph_dir, dataset=dataset)
+                                                  cost=cost, graph_dir=graph_dir,
+                                                  dataset=dataset)
 
     # compute distances in kernel space.
     Gn_median = [Gn[g].copy() for g in group_min]
     set_median = loadGXL(fname_sm)
     gen_median = loadGXL(fname_gm)
-    if dataset == 'Letter':
+#    print(gen_median.nodes(data=True))
+#    print(gen_median.edges(data=True))
+    if dataset == 'letter':
         for g in Gn_median:
             reform_attributes(g)
         reform_attributes(set_median)
@@ -98,16 +123,19 @@ def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_metho
     # compute distance in kernel space for set median.
     Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel,
-                                None if dataset == 'Letter' else 'chem',
-                                None if dataset == 'Letter' else 'valence',
+                                None if dataset == 'letter' else 'chem',
+                                None if dataset == 'letter' else 'valence',
                                 False)
     dis_k_sm = dis_gstar(0, range(1, 1+len(Gn_median)),
                          [1 / len(Gn_median)] * len(Gn_median), Kmatrix_sm, withterm3=False)
+#    print(gen_median.nodes(data=True))
+#    print(gen_median.edges(data=True))
+#    print(set_median.nodes(data=True))
+#    print(set_median.edges(data=True))
 
     # compute distance in kernel space for generalized median.
     Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel,
-                                None if dataset == 'Letter' else 'chem',
-                                None if dataset == 'Letter' else 'valence',
+                                None if dataset == 'letter' else 'chem',
+                                None if dataset == 'letter' else 'valence',
                                 False)
     dis_k_gm = dis_gstar(0, range(1, 1+len(Gn_median)),
                          [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False)
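Note: dis_gstar measures, in the kernel's feature space, the distance between the candidate median (row 0 of the Gram matrix) and the weighted mean of the k selected graphs; the squared distance is k(g, g) - 2 * sum_i alpha_i k(g, g_i) + sum_ij alpha_i alpha_j k(g_i, g_j), and the third term is constant over candidates, which is what withterm3=False appears to drop. A simplified numpy sketch of that quantity (illustrative, not the module's dis_gstar):

import numpy as np

def dis_to_mean(K, weights, with_term3=True):
    # K: (1+k) x (1+k) Gram matrix; row/column 0 is the candidate median.
    w = np.asarray(weights)
    term1 = K[0, 0]
    term2 = 2 * np.sum(w * K[0, 1:])
    term3 = w @ K[1:, 1:] @ w if with_term3 else 0.0
    return np.sqrt(max(term1 - term2 + term3, 0.0))   # clamp guards rounding error

K = np.array([[1.0, 0.3, 0.2],
              [0.3, 1.0, 0.4],
              [0.2, 0.4, 1.0]])
print(dis_to_mean(K, [0.5, 0.5]))          # full squared-distance formula
print(dis_to_mean(K, [0.5, 0.5], False))   # analogue of withterm3=False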
@@ -61,8 +61,8 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose):
                                  {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
                                  n_jobs=multiprocessing.cpu_count(), verbose=verbose)
     elif graph_kernel == 'treeletkernel':
-#        pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
-        pkernel = functools.partial(gaussiankernel, gamma=1e-6)
+        pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
+#        pkernel = functools.partial(gaussiankernel, gamma=1e-6)
         mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
         Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
                                    sub_kernel=pkernel,
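Note: this swaps the treelet kernel's sub-kernel from a Gaussian to a polynomial kernel k(x, y) = (<x, y> + c)^d with d=2 and c=1e5, bound via functools.partial. A minimal local stand-in for that sub-kernel (not the library's polynomialkernel):

import functools
import numpy as np

def polynomial_kernel(x, y, d=2, c=0.0):
    # k(x, y) = (<x, y> + c) ** d, applied here to treelet count vectors
    return (np.dot(x, y) + c) ** d

pkernel = functools.partial(polynomial_kernel, d=2, c=1e5)
print(pkernel(np.array([1.0, 2.0, 0.0]), np.array([0.0, 1.0, 1.0])))   # (2 + 1e5) ** 2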
@@ -19,11 +19,13 @@ from preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_at
 from preimage.utils import get_same_item_indices
 from preimage.find_best_k import getRelations
 
 
-def xp_letter_h():
-    ds = {'name': 'Letter-high',
-          'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml',
+def xp_letter_h_LETTER2_cost():
+    ds = {'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml',
           'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'} # node/edge symb
     Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
+    for G in Gn:
+        reform_attributes(G)
 #    ds = {'name': 'Letter-high',
 #          'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb
 #    Gn, y_all = loadDataset(ds['dataset'])
@@ -33,32 +35,35 @@ def xp_letter_h():
     edge_label = None
     ds_name = 'letter-h'
     dir_output = 'results/xp_letter_h/'
+    save_results = True
+    cost = 'LETTER2'
 
     repeats = 1
 #    k_list = range(2, 11)
     k_list = [150]
-    fit_method = 'precomputed'
+    fit_method = 'k-graphs'
     # get indices by classes.
     y_idx = get_same_item_indices(y_all)
 
-    # create result files.
-    fn_output_detail = 'results_detail.' + fit_method + '.csv'
-    f_detail = open(dir_output + fn_output_detail, 'a')
-    csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
-        'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
-        'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
-        'dis_k gi -> GM', 'median set'])
-    f_detail.close()
-    fn_output_summary = 'results_summary.csv'
-    f_summary = open(dir_output + fn_output_summary, 'a')
-    csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
-        'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
-        'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
-        'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
-        '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
-        'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
-        'repeats better dis_k gi -> GM'])
-    f_summary.close()
+    if save_results:
+        # create result files.
+        fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
+        f_detail = open(dir_output + fn_output_detail, 'a')
+        csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
+            'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+            'dis_k gi -> GM', 'median set'])
+        f_detail.close()
+        fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
+        f_summary = open(dir_output + fn_output_summary, 'a')
+        csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
+            'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+            'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
+            '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
+            'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
+            'repeats better dis_k gi -> GM'])
+        f_summary.close()
 
     random.seed(1)
     rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
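Note: y_idx = get_same_item_indices(y_all) is assumed to group graph indices by class label so the experiment can loop over one letter class at a time. An illustrative re-implementation of that grouping (not the preimage.utils function):

from collections import defaultdict

def group_indices_by_label(y_all):
    # map each class label to the list of graph indices carrying that label
    y_idx = defaultdict(list)
    for i, y in enumerate(y_all):
        y_idx[y].append(i)
    return dict(y_idx)

print(group_indices_by_label(['A', 'B', 'A', 'N']))   # {'A': [0, 2], 'B': [1], 'N': [3]}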
@@ -82,11 +87,11 @@ def xp_letter_h():
         for i, (y, values) in enumerate(y_idx.items()):
             print('\ny =', y)
-#            y = 'I'
+#            y = 'F'
 #            values = y_idx[y]
-#            values = values[0:10]
-#            k = len(values)
-#            k = kkk
+            k = len(values)
 
             sod_sm_list = []
             sod_gm_list = []
@@ -114,20 +119,21 @@ def xp_letter_h():
                     = median_on_k_closest_graphs(Gn_median, node_label, edge_label,
                         gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
                         edit_costs=None, group_min=median_set_idx_idx,
-                        dataset='Letter', parallel=False)
+                        dataset='Letter', cost=cost, parallel=False)
 
                 # write result detail.
                 sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
                 dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
                 dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
                 dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
-                f_detail = open(dir_output + fn_output_detail, 'a')
-                csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
-                    y, repeat,
-                    sod_sm, sod_gm, dis_k_sm, dis_k_gm,
-                    dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
-                    dis_k_gi2gm, median_set_idx])
-                f_detail.close()
+                if save_results:
+                    f_detail = open(dir_output + fn_output_detail, 'a')
+                    csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
+                        y, repeat,
+                        sod_sm, sod_gm, dis_k_sm, dis_k_gm,
+                        dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
+                        dis_k_gi2gm, median_set_idx])
+                    f_detail.close()
 
                 # compute result summary.
                 sod_sm_list.append(sod_sm)
@@ -170,14 +176,17 @@ def xp_letter_h():
                 # save median graphs.
                 fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
-                fn_pre_sm_new = dir_output + 'medians/set_median.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
+                fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
+                    + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
                 copyfile(fname_sm, fn_pre_sm_new + '.gxl')
                 fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
-                fn_pre_gm_new = dir_output + 'medians/gen_median.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
+                fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
+                    + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
                 copyfile(fname_gm, fn_pre_gm_new + '.gxl')
                 G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
                 reform_attributes(G_best_kernel)
-                fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
+                fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
+                    + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
                 saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')
 
                 # plot median graphs.
@@ -197,16 +206,17 @@ def xp_letter_h():
             dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
             dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
             dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
-            f_summary = open(dir_output + fn_output_summary, 'a')
-            csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
-                sod_sm_mean_list[-1], sod_gm_mean_list[-1],
-                dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
-                dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
-                dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
-                nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
-                repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
-                repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
-            f_summary.close()
+            if save_results:
+                f_summary = open(dir_output + fn_output_summary, 'a')
+                csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
+                    sod_sm_mean_list[-1], sod_gm_mean_list[-1],
+                    dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
+                    dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
+                    dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
+                    nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
+                    repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
+                    repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
+                f_summary.close()
 
         # write result summary for each letter.
@@ -219,13 +229,232 @@ def xp_letter_h():
     dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
     dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
     dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
+    if save_results:
+        f_summary = open(dir_output + fn_output_summary, 'a')
+        csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
+            sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
+            dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
+            dis_k_gi2sm_mean, dis_k_gi2gm_mean])
+        f_summary.close()
+
+    print('\ncomplete.')
+
+
+def xp_letter_h():
+    ds = {'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml',
+          'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'} # node/edge symb
+    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
+    for G in Gn:
+        reform_attributes(G)
+#    ds = {'name': 'Letter-high',
+#          'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb
+#    Gn, y_all = loadDataset(ds['dataset'])
+#    Gn = Gn[0:50]
+    gkernel = 'structuralspkernel'
+    node_label = None
+    edge_label = None
+    ds_name = 'letter-h'
+    dir_output = 'results/xp_letter_h/'
+    save_results = False
+
+    repeats = 1
+#    k_list = range(2, 11)
+    k_list = [150]
+    fit_method = 'k-graphs'
+    # get indices by classes.
+    y_idx = get_same_item_indices(y_all)
+
+    if save_results:
+        # create result files.
+        fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
+        f_detail = open(dir_output + fn_output_detail, 'a')
+        csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
+            'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+            'dis_k gi -> GM', 'median set'])
+        f_detail.close()
+        fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
         f_summary = open(dir_output + fn_output_summary, 'a')
-    csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
-        sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
-        dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
-        dis_k_gi2sm_mean, dis_k_gi2gm_mean])
+        csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
+            'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+            'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
+            '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
+            'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
+            'repeats better dis_k gi -> GM'])
         f_summary.close()
+
+    random.seed(1)
+    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
+
+    for k in k_list:
+        print('\n--------- k =', k, '----------')
+        sod_sm_mean_list = []
+        sod_gm_mean_list = []
+        dis_k_sm_mean_list = []
+        dis_k_gm_mean_list = []
+        dis_k_gi_min_mean_list = []
+#        nb_sod_sm2gm = [0, 0, 0]
+#        nb_dis_k_sm2gm = [0, 0, 0]
+#        nb_dis_k_gi2sm = [0, 0, 0]
+#        nb_dis_k_gi2gm = [0, 0, 0]
+#        repeats_better_sod_sm2gm = []
+#        repeats_better_dis_k_sm2gm = []
+#        repeats_better_dis_k_gi2sm = []
+#        repeats_better_dis_k_gi2gm = []
+
+        for i, (y, values) in enumerate(y_idx.items()):
+            print('\ny =', y)
+#            y = 'N'
+#            values = y_idx[y]
+#            values = values[0:10]
+            k = len(values)
+
+            sod_sm_list = []
+            sod_gm_list = []
+            dis_k_sm_list = []
+            dis_k_gm_list = []
+            dis_k_gi_min_list = []
+            nb_sod_sm2gm = [0, 0, 0]
+            nb_dis_k_sm2gm = [0, 0, 0]
+            nb_dis_k_gi2sm = [0, 0, 0]
+            nb_dis_k_gi2gm = [0, 0, 0]
+            repeats_better_sod_sm2gm = []
+            repeats_better_dis_k_sm2gm = []
+            repeats_better_dis_k_gi2sm = []
+            repeats_better_dis_k_gi2gm = []
+
+            for repeat in range(repeats):
+                print('\nrepeat =', repeat)
+                random.seed(rdn_seed_list[repeat])
+                median_set_idx_idx = random.sample(range(0, len(values)), k)
+                median_set_idx = [values[idx] for idx in median_set_idx_idx]
+                print('median set: ', median_set_idx)
+                Gn_median = [Gn[g] for g in values]
+
+                sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
+                    = median_on_k_closest_graphs(Gn_median, node_label, edge_label,
+                        gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
+                        edit_costs=None, group_min=median_set_idx_idx,
+                        dataset='Letter', parallel=False)
+
+                # write result detail.
+                sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
+                dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
+                dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
+                dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
+                if save_results:
+                    f_detail = open(dir_output + fn_output_detail, 'a')
+                    csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
+                        y, repeat,
+                        sod_sm, sod_gm, dis_k_sm, dis_k_gm,
+                        dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
+                        dis_k_gi2gm, median_set_idx])
+                    f_detail.close()
+
+                # compute result summary.
+                sod_sm_list.append(sod_sm)
+                sod_gm_list.append(sod_gm)
+                dis_k_sm_list.append(dis_k_sm)
+                dis_k_gm_list.append(dis_k_gm)
+                dis_k_gi_min_list.append(dis_k_gi_min)
+                # # SOD SM -> GM
+                if sod_sm > sod_gm:
+                    nb_sod_sm2gm[0] += 1
+                    repeats_better_sod_sm2gm.append(repeat)
+                elif sod_sm == sod_gm:
+                    nb_sod_sm2gm[1] += 1
+                elif sod_sm < sod_gm:
+                    nb_sod_sm2gm[2] += 1
+                # # dis_k SM -> GM
+                if dis_k_sm > dis_k_gm:
+                    nb_dis_k_sm2gm[0] += 1
+                    repeats_better_dis_k_sm2gm.append(repeat)
+                elif dis_k_sm == dis_k_gm:
+                    nb_dis_k_sm2gm[1] += 1
+                elif dis_k_sm < dis_k_gm:
+                    nb_dis_k_sm2gm[2] += 1
+                # # dis_k gi -> SM
+                if dis_k_gi_min > dis_k_sm:
+                    nb_dis_k_gi2sm[0] += 1
+                    repeats_better_dis_k_gi2sm.append(repeat)
+                elif dis_k_gi_min == dis_k_sm:
+                    nb_dis_k_gi2sm[1] += 1
+                elif dis_k_gi_min < dis_k_sm:
+                    nb_dis_k_gi2sm[2] += 1
+                # # dis_k gi -> GM
+                if dis_k_gi_min > dis_k_gm:
+                    nb_dis_k_gi2gm[0] += 1
+                    repeats_better_dis_k_gi2gm.append(repeat)
+                elif dis_k_gi_min == dis_k_gm:
+                    nb_dis_k_gi2gm[1] += 1
+                elif dis_k_gi_min < dis_k_gm:
+                    nb_dis_k_gi2gm[2] += 1
+
+                # save median graphs.
+                fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
+                fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
+                    + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
+                copyfile(fname_sm, fn_pre_sm_new + '.gxl')
+                fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
+                fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
+                    + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
+                copyfile(fname_gm, fn_pre_gm_new + '.gxl')
+                G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
+                reform_attributes(G_best_kernel)
+                fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
+                    + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
+                saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')
+
+                # plot median graphs.
+                set_median = loadGXL(fn_pre_sm_new + '.gxl')
+                gen_median = loadGXL(fn_pre_gm_new + '.gxl')
+                draw_Letter_graph(set_median, fn_pre_sm_new)
+                draw_Letter_graph(gen_median, fn_pre_gm_new)
+                draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
+
+            # write result summary for each letter.
+            sod_sm_mean_list.append(np.mean(sod_sm_list))
+            sod_gm_mean_list.append(np.mean(sod_gm_list))
+            dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
+            dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
+            dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
+            sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
+            dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
+            dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
+            dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
+            if save_results:
+                f_summary = open(dir_output + fn_output_summary, 'a')
+                csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
+                    sod_sm_mean_list[-1], sod_gm_mean_list[-1],
+                    dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
+                    dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
+                    dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
+                    nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
+                    repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
+                    repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
+                f_summary.close()
+
+    # write result summary for each letter.
+    sod_sm_mean = np.mean(sod_sm_mean_list)
+    sod_gm_mean = np.mean(sod_gm_mean_list)
+    dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
+    dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
+    dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
+    sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
+    dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
+    dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
+    dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
+    if save_results:
+        f_summary = open(dir_output + fn_output_summary, 'a')
+        csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
+            sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
+            dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
+            dis_k_gi2sm_mean, dis_k_gi2gm_mean])
+        f_summary.close()
 
     print('\ncomplete.')
@@ -243,4 +472,5 @@ def draw_Letter_graph(graph, file_prefix):
 
 
 if __name__ == "__main__":
-    xp_letter_h()
+#    xp_letter_h()
+    xp_letter_h_LETTER2_cost()