@@ -0,0 +1,11 @@
# Fit Distances

## Run the experiments

```
python3 -m pip install graphkit-learn
python3 run_xp.py
```

## Run the experiments (deprecated)

```
export PYTHONPATH="/path/to/gedlibpy:/path/to/py-graph"
python optim_costs.py dataset output_file
```
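
Run without arguments, `run_xp.py` sweeps a predefined grid of tasks (dataset, edit-cost method, target distance) and writes one pickle per task under `outputs/`. It also exposes a small CLI (see `run_from_args`); a typical invocation might look like this (the output path is illustrative):

```
python3 run_xp.py Acyclic outputs/results.Acyclic.pkl -m reg -y euclidean
```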
@@ -0,0 +1,43 @@
import numpy as np


def sum_squares(a, b):
    """
    Return the sum of squared differences between a and b
    (i.e., the squared Euclidean distance between the two vectors).
    """
    return np.sum((np.asarray(a) - np.asarray(b)) ** 2)


def euclid_d(x, y):
    """
    1D Euclidean distance.
    """
    return np.sqrt((x - y) ** 2)


def man_d(x, y):
    """
    1D Manhattan distance.
    """
    return np.abs(x - y)


def classif_d(x, y):
    """
    0/1 distance for classification targets: 0 if the labels match, 1 otherwise.
    """
    return np.array(0 if x == y else 1)


def rmse(pred, ground_truth):
    return np.sqrt(sum_squares(pred, ground_truth) / len(ground_truth))


def accuracy(pred, ground_truth):
    return np.mean([a == b for a, b in zip(pred, ground_truth)])


def rbf_k(D, sigma=1):
    # Note: sigma is used directly as the denominator, i.e. exp(-D^2 / sigma),
    # not the usual exp(-D^2 / (2 * sigma^2)) parameterization.
    return np.exp(-(D ** 2) / sigma)
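

# Minimal sanity check (a sketch on toy data; run this module directly to try it).
if __name__ == '__main__':
    y1, y2 = [1.0, 2.0], [1.5, 2.5]
    assert np.isclose(sum_squares(y1, y2), 0.5)
    assert np.isclose(rmse(y1, y2), 0.5)
    assert np.isclose(accuracy([0, 1, 1], [0, 1, 0]), 2 / 3)
    print(rbf_k(np.array([[0.0, 2.0], [2.0, 0.0]]), sigma=4))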
@@ -0,0 +1,85 @@
import sys

import numpy as np

from distances import euclid_d
from gklearn.ged.util import pairwise_ged, get_nb_edit_operations
from gklearn.utils import get_iters


def compute_ged(Gi, Gj, edit_cost, method='BIPARTITE', **kwargs):
    """
    Compute the GED between two graphs according to the given edit costs.
    """
    ged_options = {'edit_cost': 'CONSTANT',
                   'method': method,
                   'edit_cost_constants': edit_cost}
    node_labels = kwargs.get('node_labels', [])
    edge_labels = kwargs.get('edge_labels', [])
    dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10)
    n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward,
                                      edit_cost='CONSTANT',
                                      node_labels=node_labels,
                                      edge_labels=edge_labels)
    return dis, n_eo_tmp


def compute_ged_all_dataset(Gn, edit_cost, ed_method, **kwargs):
    N = len(Gn)
    G_pairs = []
    for i in range(N):
        for j in range(i, N):
            G_pairs.append([i, j])
    return compute_geds(G_pairs, Gn, edit_cost, ed_method, **kwargs)


def compute_geds(G_pairs, Gn, edit_cost, ed_method, **kwargs):
    """
    Compute the GED for every pair of indices in G_pairs, given edit_cost.
    :return: ged_vec: the list of computed distances;
             n_edit_operations: the list of edit-operation counts per pair.
    """
    ged_vec = []
    n_edit_operations = []
    for k in get_iters(range(len(G_pairs)), desc='Computing GED',
                       file=sys.stdout, length=len(G_pairs)):
        [i, j] = G_pairs[k]
        dis, n_eo_tmp = compute_ged(
            Gn[i], Gn[j], edit_cost=edit_cost, method=ed_method, **kwargs)
        ged_vec.append(dis)
        n_edit_operations.append(n_eo_tmp)
    return ged_vec, n_edit_operations


def compute_D(G_app, edit_cost, G_test=None, ed_method='BIPARTITE', **kwargs):
    N = len(G_app)
    D_app = np.zeros((N, N))
    # Pairwise distances on the train set (symmetric, zero diagonal).
    for i, G1 in get_iters(enumerate(G_app), desc='Computing D - app',
                           file=sys.stdout, length=N):
        for j, G2 in enumerate(G_app[i + 1:], i + 1):
            D_app[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs)
            D_app[j, i] = D_app[i, j]
    if G_test is None:
        return D_app, edit_cost
    else:
        # Distances from each test graph to each train graph.
        D_test = np.zeros((len(G_test), N))
        for i, G1 in get_iters(enumerate(G_test), desc='Computing D - test',
                               file=sys.stdout, length=len(G_test)):
            for j, G2 in enumerate(G_app):
                D_test[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs)
        return D_app, D_test, edit_cost


def compute_D_random(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
    edit_costs = np.random.rand(6)
    return compute_D(G_app, edit_costs, G_test, ed_method=ed_method, **kwargs)


def compute_D_expert(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
    edit_cost = [3, 3, 1, 3, 3, 1]
    return compute_D(G_app, edit_cost, G_test, ed_method=ed_method, **kwargs)


def compute_D_fitted(G_app, y_app, G_test=None, y_distance=euclid_d,
                     mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs):
    from optim_costs import compute_optimal_costs
    costs_optim = compute_optimal_costs(
        G_app, y_app, y_distance=y_distance,
        mode=mode, unlabeled=unlabeled, ed_method=ed_method, **kwargs)
    return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs)
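

# Usage sketch (hypothetical names; Gn is a list of graphs and y their targets,
# e.g. as loaded by run_xp.py):
#   D_app, D_test, costs = compute_D_fitted(Gn[:80], y[:80], G_test=Gn[80:],
#                                           mode='reg', ed_method='BIPARTITE')
# D_app / D_test can then be fed to scikit-learn estimators with
# metric='precomputed', as done in learning.py.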
@@ -0,0 +1,391 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 31 10:42:55 2020

@author: ljia
"""
import os
import pickle

import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec


def rounder(x, decimals):
    """
    Round x half-up to `decimals` decimal places and return it as a string;
    a value without a decimal part is returned unchanged.
    """
    from decimal import Decimal, ROUND_HALF_UP
    x_str = str(x)
    if '.' not in x_str:
        return x_str
    exponent = Decimal('1.' + '0' * decimals)
    return str(Decimal(x_str).quantize(exponent, rounding=ROUND_HALF_UP))
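
# Examples: rounder(1.005, 2) -> '1.01'; rounder(1.5, 3) -> '1.500';
# rounder(12, 2) -> '12'.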

def df_to_latex_table(df, replace_header=True, end_mid_line=7):
    ltx = df.to_latex(index=True, escape=False, multirow=True)

    # Modify the middle lines.
    end_mid_line = str(end_mid_line)
    ltx = ltx.replace('\\cline{1-' + end_mid_line + '}\n\\cline{2-' + end_mid_line + '}', '\\toprule')
    ltx = ltx.replace('\\cline{2-' + end_mid_line + '}', '\\cmidrule(l){2-' + end_mid_line + '}')

    # Reset dataset names.
    ltx = ltx.replace('Alkane_unlabeled', 'Alkane')
    ltx = ltx.replace('Vitamin_D', 'Vitamin\\_D')

    # Modify the header.
    if replace_header:
        i_start = ltx.find('\\begin{tabular}')
        i_end = ltx.find('\\\\\n\\midrule\n')
        replace = r"""\begin{tabular}{lll@{~~}c@{~~}c@{~~}c@{~~}c}
\toprule
\multirow{2}[2]{*}{\textbf{Dataset}} & \multirow{2}[2]{*}{\textbf{Distance}} & \multirow{2}[2]{*}{\textbf{Method}} & \multicolumn{2}{c}{\textbf{BIPARTITE}} & \multicolumn{2}{c}{\textbf{IPFP}} \\
\cmidrule(lr){4-5}\cmidrule(lr){6-7}
 & & & \textbf{Train errors} & \textbf{Test errors} & \textbf{Train errors} & \textbf{Test errors} \\
\midrule
"""
        ltx = ltx.replace(ltx[i_start:i_end + 12], replace, 1)

    # # add row numbers.
    # ltx = ltx.replace('lllllllll', 'lllllllll|@{\\makebox[2em][r]{\\textit{\\rownumber\\space}}}', 1)
    # ltx = replace_nth(ltx, '\\\\\n', '\\gdef\\rownumber{\\stepcounter{magicrownumbers}\\arabic{magicrownumbers}} \\\\\n', 1)

    return ltx

def beautify_df(df):
    # Bold the best (lowest mean) results per dataset and distance.
    for ds in df.index.get_level_values('Dataset').unique():
        for gk in df.loc[ds].index.get_level_values('Distance').unique():
            for label, col in df.loc[(ds, gk)].items():
                min_val = np.inf
                min_indices = []
                min_labels = []
                for index, row in col.items():
                    value = row
                    if value != '-':
                        mean, interval = value.split('$\\pm$')
                        # Remove any trailing marker characters (e.g. a
                        # '/same' flag) before parsing the mean.
                        mean = float(mean.strip('/same'))
                        if mean < min_val:
                            min_val = mean
                            min_indices = [index]
                            min_labels = [label]
                        elif mean == min_val:
                            min_indices.append(index)
                            min_labels.append(label)
                for idx, index in enumerate(min_indices):
                    df.loc[(ds, gk, index), min_labels[idx]] = '\\textbf{' + df.loc[(ds, gk, index), min_labels[idx]] + '}'
    return df

def params_to_latex_table(results):
    import pandas as pd

    # Create the table.
    row_indices = pd.MultiIndex.from_product([Dataset_list, Edit_Cost_List, Dis_List], names=['Dataset', 'Edit cost', 'Distance'])
    df = pd.DataFrame(columns=['$c_{ni}$', '$c_{nr}$', '$c_{ns}$', '$c_{ei}$', '$c_{er}$', '$c_{es}$'], index=row_indices)

    # Set data. Results are keyed by (dataset, distance, edit cost).
    for idx_r, row in df.iterrows():
        for idx, (idx_c, col) in enumerate(row.items()):
            key = (idx_r[0], idx_r[2], idx_r[1])
            if key in results and results[key] is not None:
                df.loc[idx_r, idx_c] = results[key][idx]
            else:
                df.loc[idx_r, idx_c] = '-'

    ltx = df_to_latex_table(df, replace_header=False, end_mid_line=9)
    return ltx

def results_to_latex_table(results):
    import pandas as pd

    # Create the table.
    col_indices = pd.MultiIndex.from_product([Edit_Cost_List, ['Train errors', 'Test errors']])
    row_indices = pd.MultiIndex.from_product([Dataset_list, Dis_List, ['random', 'expert', 'fitted']], names=['Dataset', 'Distance', 'Method'])
    df = pd.DataFrame(columns=col_indices, index=row_indices)

    # Set data. Results are keyed by (dataset, distance, edit cost).
    for idx_r, row in df.iterrows():
        for idx_c, col in row.items():
            key = (idx_r[0], idx_r[1], idx_c[0])
            if key in results and results[key] is not None:
                mean = results[key][idx_r[2]]['mean']
                mean = mean[0] if idx_c[1] == 'Train errors' else mean[1]
                interval = results[key][idx_r[2]]['interval']
                interval = interval[0] if idx_c[1] == 'Train errors' else interval[1]
                df.loc[idx_r, idx_c] = rounder(mean, 2) + '$\\pm$' + rounder(interval, 2)
            else:
                df.loc[idx_r, idx_c] = '-'

    df = beautify_df(df)
    ltx = df_to_latex_table(df)
    return ltx

def get_params(results):
    # Collect the fitted edit costs over all splits and report their means
    # and confidence intervals.
    edit_costs = [[] for i in range(6)]
    for result in results['results']:
        ed = result['fitted']['edit_costs']
        for i, e in enumerate(ed):
            edit_costs[i].append(e)
    for i, ed in enumerate(edit_costs):
        mean, interval = mean_confidence_interval(ed)
        if mean == 0:
            edit_costs[i] = '-'
        else:
            edit_costs[i] = rounder(mean, 2) + '$\\pm$' + rounder(interval, 2)
    return edit_costs

def print_bars(ax, p, title, y_label='RMSE', export_filename=None):
    palette = plt.get_cmap('Set1')  # ['red', 'blue', 'green']
    # Width of the bars and gap between the train and test groups.
    barWidth = 0.1
    gap = 0.2
    # The x positions of the two bar groups.
    r = [0, gap + barWidth * 3]
    for i, xp in enumerate(p.keys()):
        bars = p[xp]['mean']
        y_err = p[xp]['interval']
        # One bar per cost setting inside each group.
        r_cur = [x + barWidth * (i - 1) * 1.03 for x in r]
        ax.bar(r_cur,
               bars, width=barWidth, color=palette(i),
               edgecolor='black', linewidth=0.2,
               yerr=y_err, error_kw=dict(lw=0.5, capsize=3, capthick=0.5),
               label=xp)
    # General layout.
    ax.set_xticks(r)
    ax.set_xticklabels(['train', 'test'])  # ['train errors', 'test errors']
    ax.xaxis.set_ticks_position('none')
    ax.set_ylabel(y_label)
    # ax.legend()
    ax.set_title(title)
    if export_filename is not None:
        print(export_filename)
        plt.savefig(export_filename)

def print_table_results(results_by_xp):
    from tabulate import tabulate
    tab = []
    tab.append(["Method", "App", "Test"])
    for setup in results_by_xp.keys():
        p = results_by_xp[setup]
        current_line = [setup]
        current_line.append(f"{p['mean'][0]:.2f} +- {p['interval'][0]:.2f}")
        current_line.append(f"{p['mean'][1]:.2f} +- {p['interval'][1]:.2f}")
        tab.append(current_line)
    print(tabulate(tab, headers="firstrow"))

def mean_confidence_interval(data, confidence=0.95):
    # Return the sample mean and the half-width of its confidence interval:
    # h = t_{(1 + confidence) / 2, n - 1} * SE(mean).
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h


def compute_perf(results, app_or_test):
    return mean_confidence_interval(results[app_or_test])

def compute_displayable_results(results_by_xp):
    p = {}
    for xp in results_by_xp.keys():
        p[xp] = {}
        p[xp]["mean"] = [0] * 2
        p[xp]["interval"] = [0] * 2
        p[xp]["mean"][0], p[xp]["interval"][0] = compute_perf(results_by_xp[xp], 'app')
        p[xp]["mean"][1], p[xp]["interval"][1] = compute_perf(results_by_xp[xp], 'test')
    return p


def organize_results_by_cost_settings(results, xps):
    all_results = results["results"]
    results_by_xp = {}
    for xp in xps:
        results_xp = {
            'app': [],
            'test': []
        }
        for split_res in all_results:
            results_xp['app'].append(split_res[xp]['perf_app'])
            results_xp['test'].append(split_res[xp]['perf_test'])
        results_by_xp[xp] = results_xp
    return results_by_xp

def plot_a_task(ax, ds_name, edit_cost, distance, title, y_label):
    # Load data.
    root_dir = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/thesis/ged/fit_distances/outputs/'
    fn = root_dir + 'results.' + '.'.join([ds_name, edit_cost, distance]) + '.pkl'
    if os.path.isfile(fn):
        with open(fn, 'rb') as file:
            results = pickle.load(file)
    else:
        return None, None

    # ### Experiment schema:
    # results['results'] is a list holding the test and train/valid results
    # over 10 random splits. For each split i, results['results'][i] is a
    # dict with one entry per cost setting run on that split, e.g.:
    #   results['results'][i]['random']['perf_test']
    #   results['results'][i]['expert']['perf_test']
    #   results['results'][i]['fitted']['perf_test']

    # Compute data.
    xps = ["random", "expert", "fitted"]
    results_by_xp = organize_results_by_cost_settings(results, xps)
    p = compute_displayable_results(results_by_xp)
    print_bars(ax, p, title, y_label=y_label, export_filename=None)
    c = get_params(results)
    return p, c

def set_figure(nb_rows):
    # plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
    # plt.rc('axes', titlesize=15)     # fontsize of the axes title
    # plt.rc('axes', labelsize=15)     # fontsize of the x and y labels
    # plt.rc('xtick', labelsize=15)    # fontsize of the tick labels
    # plt.rc('ytick', labelsize=15)    # fontsize of the tick labels
    # plt.rc('legend', fontsize=15)    # legend fontsize
    # plt.rc('figure', titlesize=15)   # fontsize of the figure title

    fig = plt.figure(figsize=(11, 2.12 * nb_rows + 0.56))
    ax = fig.add_subplot(111)  # The big subplot for common labels.

    # Turn off the axis lines and ticks of the big subplot.
    ax.spines['top'].set_color('none')
    ax.spines['bottom'].set_color('none')
    ax.spines['left'].set_color('none')
    ax.spines['right'].set_color('none')
    ax.tick_params(labelcolor='w', top=False, bottom=False, left=False, right=False)
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')

    # Set common labels.
    ax.set_ylabel('RMSE')
    ax.yaxis.set_label_coords(-0.07, 0.5)
    return fig

if __name__ == '__main__':
    # Get the task grid.
    Edit_Cost_List = ['BIPARTITE', 'IPFP']
    Dataset_list = ['Alkane_unlabeled', 'Acyclic', 'Chiral', 'Vitamin_D',
                    'Steroid'][0:2]
    Dis_List = ['euclidean', 'manhattan']

    # Show by edit costs first, then by distances.
    row_grid_list = []
    for i in Edit_Cost_List:
        for j in Dis_List:
            row_grid_list.append({'edit_cost': i, 'distance': j})

    # Compute and plot.
    fig = set_figure(len(Dataset_list))
    gs = gridspec.GridSpec(len(Dataset_list), len(row_grid_list))
    gs.update(hspace=0.3)
    results = {}
    params = {}
    for row, ds_name in enumerate(Dataset_list):
        for col, contents in enumerate(row_grid_list):
            ax = fig.add_subplot(gs[row, col])
            y_label = (ds_name[:-10] if ds_name.endswith('_unlabeled') else ds_name) if col == 0 else ''
            title = contents['edit_cost'] + ', ' + contents['distance'] if row == 0 else ''
            p, c = plot_a_task(ax, ds_name, contents['edit_cost'], contents['distance'], title, y_label)
            results[(ds_name, contents['distance'], contents['edit_cost'])] = p
            params[(ds_name, contents['distance'], contents['edit_cost'])] = c
            if col == 0 and row == 0:
                handles, labels = ax.get_legend_handles_labels()

    # Show the graphic.
    size = fig.get_size_inches()
    fig.subplots_adjust(bottom=0.56 / size[1])
    fig.legend(handles, labels, loc='lower center', ncol=3, frameon=False)
    plt.savefig('ged_fit_distance_results.eps', format='eps', dpi=300, transparent=True,
                bbox_inches='tight')
    plt.show()

    # Convert results to LaTeX tables.
    ltable_perf = results_to_latex_table(results)
    ltable_params = params_to_latex_table(params)
    print(ltable_perf)
    print(ltable_params)
@@ -0,0 +1,108 @@
from distances import euclid_d


def split_data(D, y, train_index, test_index):
    D_app = [D[i] for i in train_index]
    D_test = [D[i] for i in test_index]
    y_app = [y[i] for i in train_index]
    y_test = [y[i] for i in test_index]
    return D_app, D_test, y_app, y_test


def evaluate_D(D_app, y_app, D_test, y_test, mode='reg'):
    from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
    from sklearn.model_selection import GridSearchCV
    from distances import rmse, accuracy
    if mode == 'reg':
        knn = KNeighborsRegressor(metric='precomputed')
        scoring = 'neg_root_mean_squared_error'
        perf_eval = rmse
    else:
        knn = KNeighborsClassifier(metric='precomputed')
        scoring = 'accuracy'
        perf_eval = accuracy
    grid_params = {
        'n_neighbors': [3, 5, 7, 9, 11]
    }
    clf = GridSearchCV(knn, param_grid=grid_params,
                       scoring=scoring,
                       cv=5, return_train_score=True, refit=True)
    clf.fit(D_app, y_app)
    y_pred_app = clf.predict(D_app)
    y_pred_test = clf.predict(D_test)
    return perf_eval(y_pred_app, y_app), perf_eval(y_pred_test, y_test), clf
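

# Sketch: evaluate_D expects precomputed distance matrices, e.g. produced by
# the helpers in ged.py (hypothetical split):
#   D_app, D_test, _ = compute_D_expert(G_app, G_test)
#   perf_app, perf_test, clf = evaluate_D(D_app, y_app, D_test, y_test, mode='reg')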


def xp_knn(Gn, y_all, y_distance=euclid_d,
           mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs):
    '''
    Run a k-NN predictor (regressor or classifier) on the given dataset,
    over 10 random train/test splits and for each edit-cost setting.
    '''
    from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
    from ged import compute_D_random, compute_D_expert, compute_D_fitted

    # Stratify the splits for classification problems.
    stratified = (mode == 'classif')
    if stratified:
        rs = StratifiedShuffleSplit(n_splits=10, test_size=.1)
        split_scheme = rs.split(Gn, y_all)
    else:
        rs = ShuffleSplit(n_splits=10, test_size=.1)
        split_scheme = rs.split(Gn)

    results = []
    i = 1
    for train_index, test_index in split_scheme:
        print()
        print("Split {0}/{1}".format(i, 10))
        i = i + 1
        cur_results = {}
        # Split the data.
        G_app, G_test, y_app, y_test = split_data(Gn, y_all,
                                                  train_index, test_index)
        cur_results['y_app'] = y_app
        cur_results['y_test'] = y_test

        # Compute the distances with each of the cost settings to compare.
        distances = {}
        distances['random'] = compute_D_random(G_app, G_test, ed_method, **kwargs)
        distances['expert'] = compute_D_expert(G_app, G_test, ed_method, **kwargs)
        distances['fitted'] = compute_D_fitted(
            G_app, y_app, G_test,
            y_distance=y_distance,
            mode=mode, unlabeled=unlabeled, ed_method=ed_method,
            **kwargs)

        for setup in distances.keys():
            print("{0} Mode".format(setup))
            setup_results = {}
            D_app, D_test, edit_costs = distances[setup]
            setup_results['D_app'] = D_app
            setup_results['D_test'] = D_test
            setup_results['edit_costs'] = edit_costs
            print(edit_costs)
            perf_app, perf_test, clf = evaluate_D(
                D_app, y_app, D_test, y_test, mode)
            setup_results['perf_app'] = perf_app
            setup_results['perf_test'] = perf_test
            setup_results['clf'] = clf
            print(
                "Learning performance with {1} costs: {0:.2f}".format(
                    perf_app, setup))
            print(
                "Test performance with {1} costs: {0:.2f}".format(
                    perf_test, setup))
            cur_results[setup] = setup_results
        results.append(cur_results)
    return results
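

# Sketch: with graphs Gn and targets y_all (e.g. loaded through gklearn's
# Dataset, as in run_xp.py),
#   results = xp_knn(Gn, y_all, mode='reg', ed_method='BIPARTITE')
# returns, for each of the 10 splits, the distance matrices, the edit costs
# and the train/test performance of each cost setting.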
@@ -0,0 +1,66 @@
import numpy as np


def loglik(X, y, w):
    # Negative log-likelihood of the logistic model (the quantity minimized).
    return np.sum(-y * (X @ w) + np.log(1 + np.exp(X @ w)))


def reg_log(X, y, ite_max=100, lbd=1e-12, pos_constraint=False):
    """
    Ridge-regularized logistic regression fitted by Newton steps
    (see Section 4.4 of ESL II), with an optional projection of the weights
    onto the positive orthant. y must be in {0, 1}.
    """
    def proj_on_pos(w):
        # Project w onto the positive orthant.
        return np.array([x if x > 0 else 0 for x in w])

    tol = 1e-4
    N, d = X.shape
    y = np.array(y)
    w = np.zeros(d)
    weights = [w]
    J = [loglik(X, y, w)]
    old_J = J[0] + 1
    conv = False
    i = 0
    while not conv:
        i = i + 1
        # Newton-like direction: solve (X^T W X + lbd*I) descent = X^T (y - p).
        # Note: textbook IRLS uses W = diag(p * (1 - p)) (ESL II, Sec. 4.4);
        # diag(p) still yields a descent direction, safeguarded by the
        # backtracking line search below.
        Xw = X @ w
        p = np.exp(Xw) / (1 + np.exp(Xw))
        W = np.diag(p)
        regul = lbd * np.identity(d)
        descent = np.linalg.solve(X.T @ W @ X + regul, X.T @ (y - p))
        # Backtracking line search on the objective.
        step = 1
        update = 0.1
        cur_w = w + step * descent
        if pos_constraint:
            cur_w = proj_on_pos(cur_w)
        while loglik(X, y, cur_w) > J[-1]:
            step = step * update
            cur_w = w + step * descent
            if pos_constraint:
                cur_w = proj_on_pos(cur_w)
        w = cur_w
        J.append(loglik(X, y, w))
        weights.append(w)
        if i > ite_max:
            conv = True
        if (old_J - J[-1]) < tol:
            conv = True
        else:
            old_J = J[-1]
    return w, J, weights
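

# Usage sketch on a toy problem (assumed shapes: X is (N, d), y in {0, 1}):
#   X = np.array([[0.0, 1.0], [0.0, 2.0], [1.0, 0.0], [2.0, 0.0]])
#   y = [0, 0, 1, 1]
#   w, J, weights = reg_log(X, y, pos_constraint=True)
# J traces the negative log-likelihood over the iterations.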
@@ -0,0 +1,136 @@
import numpy as np

from ged import compute_geds
from distances import sum_squares, euclid_d


def optimize_costs_unlabeled(nb_cost_mat, dis_k_vec):
    """
    Optimize edit costs to fit dis_k_vec, given the numbers of edit
    operations in nb_cost_mat. Substitution costs are fixed to 0 since the
    graphs are unlabeled.
    Note: nb_cost_mat must not contain all-zero rows.
    :param nb_cost_mat: N x 6 matrix of natural numbers, counting each edit
        operation for each pair of graphs
    :param dis_k_vec: the N target distances to fit
    """
    import cvxpy as cp
    MAX_SAMPLE = 1000
    # Keep only the insertion/removal columns (substitutions cost 0).
    nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]] for x in nb_cost_mat])
    dis_k_vec = np.array(dis_k_vec)
    N = nb_cost_mat_m.shape[0]
    # Subsample at most MAX_SAMPLE pairs to keep the problem small.
    sub_sample = np.random.permutation(np.arange(N))
    sub_sample = sub_sample[:MAX_SAMPLE]
    x = cp.Variable(nb_cost_mat_m.shape[1])
    cost = cp.sum_squares((nb_cost_mat_m[sub_sample, :] @ x) - dis_k_vec[sub_sample])
    prob = cp.Problem(cp.Minimize(cost), [x >= 0])
    prob.solve()
    edit_costs_new = [x.value[0], x.value[1], 0, x.value[2], x.value[3], 0]
    edit_costs_new = [xi if xi > 0 else 0 for xi in edit_costs_new]
    residual = prob.value
    return edit_costs_new, residual

def optimize_costs_classif_unlabeled(nb_cost_mat, Y):
    """
    Optimize edit costs to fit the class-disagreement vector Y, given the
    numbers of edit operations in nb_cost_mat. Substitution costs are fixed
    to 0 since the graphs are unlabeled.
    Note: nb_cost_mat must not contain all-zero rows.
    :param nb_cost_mat: N x 6 matrix of natural numbers, counting each edit
        operation for each pair of graphs
    :param Y: {0, 1}^N vector (0 if the pair shares the same class, 1 otherwise)
    """
    from ml import reg_log
    nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]]
                              for x in nb_cost_mat])
    w, J, _ = reg_log(nb_cost_mat_m, Y, pos_constraint=True)
    edit_costs_new = [w[0], w[1], 0, w[2], w[3], 0]
    residual = J[-1]
    return edit_costs_new, residual

def optimize_costs_classif(nb_cost_mat, Y):
    """
    Optimize edit costs to fit the class-disagreement vector Y, given the
    numbers of edit operations in nb_cost_mat.
    Note: nb_cost_mat must not contain all-zero rows.
    :param nb_cost_mat: N x 6 matrix of natural numbers, counting each edit
        operation for each pair of graphs
    :param Y: {0, 1}^N vector (0 if the pair shares the same class, 1 otherwise)
    """
    from ml import reg_log
    w, J, _ = reg_log(nb_cost_mat, Y, pos_constraint=True)
    return w, J[-1]

def optimize_costs(nb_cost_mat, dis_k_vec):
    """
    Optimize edit costs to fit dis_k_vec, given the numbers of edit
    operations in nb_cost_mat.
    Note: nb_cost_mat must not contain all-zero rows.
    :param nb_cost_mat: N x 6 matrix of natural numbers, counting each edit
        operation for each pair of graphs
    :param dis_k_vec: the N target distances to fit
    """
    import cvxpy as cp
    x = cp.Variable(nb_cost_mat.shape[1])
    cost = cp.sum_squares((nb_cost_mat @ x) - dis_k_vec)
    # With x = [c_ni, c_nr, c_ns, c_ei, c_er, c_es], the linear constraints
    # are triangle-like inequalities: substituting a node (edge) must not
    # cost more than removing and inserting one, i.e. c_ni + c_nr >= c_ns
    # and c_ei + c_er >= c_es.
    constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T @ x >= 0.0,
                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T @ x >= 0.0]
    prob = cp.Problem(cp.Minimize(cost), constraints)
    prob.solve()
    edit_costs_new = x.value
    residual = prob.value
    return edit_costs_new, residual

def compute_optimal_costs(G, y, init_costs=[3, 3, 1, 3, 3, 1],
                          y_distance=euclid_d,
                          mode='reg', unlabeled=False,
                          ed_method='BIPARTITE',
                          **kwargs):
    N = len(y)
    G_pairs = []
    distances_vec = []
    for i in range(N):
        for j in range(i + 1, N):
            G_pairs.append([i, j])
            distances_vec.append(y_distance(y[i], y[j]))
    ged_vec_init, n_edit_operations = compute_geds(G_pairs, G, init_costs, ed_method, **kwargs)
    residual_list = [sum_squares(ged_vec_init, distances_vec)]

    # Pick the optimizer matching the task and the labeling of the graphs.
    if mode == 'reg':
        method_optim = optimize_costs_unlabeled if unlabeled else optimize_costs
    elif mode == 'classif':
        method_optim = optimize_costs_classif_unlabeled if unlabeled else optimize_costs_classif

    # Alternate between fitting the costs and recomputing the GEDs (and the
    # numbers of edit operations) under the new costs.
    ite_max = 5
    for i in range(ite_max):
        print('ite', i + 1, '/', ite_max, ':')
        edit_costs_new, residual = method_optim(
            np.array(n_edit_operations), distances_vec)
        ged_vec, n_edit_operations = compute_geds(G_pairs, G, edit_costs_new, ed_method, **kwargs)
        residual_list.append(sum_squares(ged_vec, distances_vec))
    return edit_costs_new
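

# At each iteration, the scheme above solves a constrained least-squares fit
#   min_x  sum_k (n_k . x - d_k)^2   s.t.  x >= 0 (plus triangle constraints),
# where n_k counts the edit operations for pair k under the current costs and
# d_k = y_distance(y_i, y_j); the GEDs are then recomputed with the new costs,
# which changes the optimal edit paths and hence the counts n_k.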
@@ -0,0 +1,100 @@
import os
import pickle
import sys

from distances import euclid_d, man_d, classif_d

y_distances = {
    'euclidean': euclid_d,
    'manhattan': man_d,
    'classif': classif_d
}


def run_xp(ds_name, output_file, unlabeled, mode, y_distance, ed_method='BIPARTITE'):
    from gklearn.dataset import Dataset
    from gklearn.experiments import DATASET_ROOT
    from learning import xp_knn

    ds = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
    ds.remove_labels(node_attrs=ds.node_attrs, edge_attrs=ds.edge_attrs)  # @todo: ged can not deal with sym and unsym labels.
    Gn = ds.graphs
    y_all = ds.targets

    resu = {}
    resu['y_distance'] = y_distance
    resu['dataset'] = ds_name
    # Override the flag based on the actual labels of the dataset.
    unlabeled = (len(ds.node_labels) == 0 and len(ds.edge_labels) == 0)
    results = xp_knn(Gn, y_all, y_distance=y_distances[y_distance],
                     mode=mode,
                     unlabeled=unlabeled, ed_method=ed_method,
                     node_labels=ds.node_labels, edge_labels=ds.edge_labels)
    resu['results'] = results
    resu['unlabeled'] = unlabeled
    resu['mode'] = mode
    resu['ed_method'] = ed_method
    pickle.dump(resu, open(output_file, 'wb'))
    return output_file

def run_from_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", help="path to / name of the dataset to predict")
    parser.add_argument(
        "output_file", help="path to the file which will contain the results")
    parser.add_argument("-u", "--unlabeled", help="specify that the dataset contains unlabeled graphs",
                        action="store_true")
    parser.add_argument("-m", "--mode", type=str, choices=['reg', 'classif'],
                        help="specify whether the dataset is a classification or a regression problem")
    parser.add_argument("-y", "--y_distance", type=str, choices=['euclidean', 'manhattan', 'classif'],
                        default='euclidean',
                        help="the distance on y used to fit the costs")
    args = parser.parse_args()

    dataset = args.dataset
    output_result = args.output_file
    unlabeled = args.unlabeled
    mode = args.mode
    print(args)
    run_xp(dataset, output_result, unlabeled, mode, args.y_distance)
    print("Done.")
| if __name__ == "__main__": | |||
| import pickle | |||
| import os | |||
| from distances import euclid_d, man_d, classif_d | |||
| y_distances = { | |||
| 'euclidean': euclid_d, | |||
| 'manhattan': man_d, | |||
| 'classif': classif_d | |||
| } | |||
| # Read arguments. | |||
| if len(sys.argv) > 1: | |||
| run_from_args() | |||
| else: | |||
| from sklearn.model_selection import ParameterGrid | |||
| # Get task grid. | |||
| Edit_Cost_List = ['BIPARTITE', 'IPFP'] | |||
| Dataset_list = ['Alkane_unlabeled', 'Acyclic', 'Chiral', 'Vitamin_D', | |||
| 'Steroid'] | |||
| Dis_List = ['euclidean', 'manhattan'] | |||
| task_grid = ParameterGrid({'edit_cost': Edit_Cost_List[0:1], | |||
| 'dataset': Dataset_list[1:2], | |||
| 'distance': Dis_List[:]}) | |||
| unlabeled = False # @todo: Not actually used. | |||
| mode = 'reg' | |||
| # Run. | |||
| for task in list(task_grid): | |||
| print() | |||
| print(task) | |||
| output_result = 'outputs/results.' + '.'.join([task['dataset'], task['edit_cost'], task['distance']]) + '.pkl' | |||
| if not os.path.isfile(output_result): | |||
| run_xp(task['dataset'], output_result, unlabeled, mode, task['distance'], task['edit_cost']) | |||
@@ -0,0 +1,15 @@
import numpy as np


def vec2sym_mat(v):
    """
    Convert a vector encoding the upper triangle of a symmetric matrix
    (packed row by row) back into the full matrix.
    See Golub and Van Loan, Matrix Computations, 3rd edition, p. 21.
    """
    # Solve n * (n + 1) / 2 = len(v) for n (positive root of the quadratic).
    n = int((-1 + np.sqrt(1 + 8 * len(v))) / 2)
    M = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            # Golub and Van Loan, Matrix Computations, Eq. 1.2.2, p. 21.
            M[i, j] = M[j, i] = v[i * n - (i + 1) * i // 2 + j]
    return M
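

# Example: a length-6 vector packs the upper triangle of a 3 x 3 matrix:
#   vec2sym_mat(np.array([1., 2., 3., 4., 5., 6.]))
#   -> [[1., 2., 3.],
#       [2., 4., 5.],
#       [3., 5., 6.]]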