2. Modify model_selection_precomputed so that all results are written into memory and then to a file in the last section of the code, in case I/O takes too much time on systems with separated CPU and disk. 3. Correct the utils.floyd_warshall_numpy function. DO NOT use the last version. tags/v0.1
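For reference on point 3, a minimal sketch of what a correct Floyd-Warshall in numpy looks like. This is an illustration of the algorithm only, not the repository's `utils.floyd_warshall_numpy`; the function body and the example matrix are assumptions.

```python
import numpy as np

def floyd_warshall_sketch(A):
    """All-pairs shortest paths; A holds edge weights, np.inf where no edge."""
    D = A.copy().astype(float)
    np.fill_diagonal(D, 0)
    for k in range(D.shape[0]):
        # relax every pair (i, j) through intermediate node k, vectorized
        D = np.minimum(D, D[:, [k]] + D[[k], :])
    return D

A = np.array([[0.0, 1.0, np.inf],
              [1.0, 0.0, 2.0],
              [np.inf, 2.0, 0.0]])
print(floyd_warshall_sketch(A))  # D[0][2] becomes 3.0 via node 1
```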
| @@ -3,12 +3,15 @@ A python package for graph kernels. | |||
| ## Requirements | |||
| * numpy - 1.13.3 | |||
| * scipy - 1.0.0 | |||
| * matplotlib - 2.1.0 | |||
| * networkx - 2.0 | |||
| * sklearn - 0.19.1 | |||
| * tabulate - 0.8.2 | |||
| numpy==1.14.5 | |||
| scipy==1.1.0 | |||
| matplotlib==2.2.2 | |||
| networkx==2.1 | |||
| scikit-learn==0.19.1 | |||
| tabulate==0.8.2 | |||
| tqdm==4.23.4 | |||
| control==0.7.0 (for generalized random walk kernels only) | |||
| slycot===0.3.2.dev-5263ada (for generalized random walk kernels only; requires a Fortran compiler, e.g. gfortran) | |||
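The exact pins matter for reproducibility; a quick way to compare an installed environment against the list above (a sketch, assuming all packages are importable):

```python
# Print installed versions to compare against the pinned requirements.
import numpy, scipy, matplotlib, networkx, sklearn, tabulate, tqdm

for m in (numpy, scipy, matplotlib, networkx, sklearn, tabulate, tqdm):
    print(m.__name__, m.__version__)
```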
| ## Results with minimal test RMSE for each kernel on dataset Acyclic | |||
| @@ -28,7 +31,7 @@ For prediction we randomly divide the data into train and test subsets, where 90\% | |||
| | WL shortest path | 28.74±0.60 | 38.20±0.62 | 39.02±6.09 | 'height': 10.0, 'alpha': '1.00' | 146.83"/80.63"±45.04" | | |||
| | WL edge | 30.21±0.64 | 36.53±1.02 | 38.42±6.42 | 'height': 5.0, 'alpha': '6.31e-01' | 5.24"/5.15"±2.83" | | |||
| | Treelet | 7.33±0.64 | 13.86±0.80 | 15.38±3.56 | 'alpha': '1.12e+01' | 0.48" | | |||
| | Path up to d | 5.76±0.27 | 9.89±0.87 | 10.21±4.16 | 'depth': 2.0, 'k_func': 'MinMax', 'alpha': '0.1' | 0.56"/1.16"±0.75" | | |||
| | Path up to d | 5.76±0.27 | 9.89±0.87 | 10.21±4.16 | 'depth': 2.0, 'k_func': 'MinMax', 'alpha ': '0.1' | 0.56"/1.16"±0.75" | | |||
| | Cyclic pattern | | | | | | | |||
| | Walk up to n | 20.88±0.74 | 23.34±1.11 | 24.46±6.57 | 'n': 2.0, 'alpha': '1.00e-03' | 0.56"/331.70"±753.44" | | |||
| @@ -3,106 +3,66 @@ dslist = [ | |||
| 'name': 'Acyclic', | |||
| 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| 'task': 'regression' | |||
| }, # node_labeled | |||
| { | |||
| 'name': 'COIL-DEL', | |||
| 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt' | |||
| }, # edge_labeled | |||
| }, # node symb | |||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| { | |||
| 'name': 'PAH', | |||
| 'dataset': '../datasets/PAH/dataset.ds', | |||
| }, # unlabeled | |||
| { | |||
| 'name': 'Mutagenicity', | |||
| 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt' | |||
| }, # fully_labeled | |||
| { | |||
| 'name': 'MAO', | |||
| 'dataset': '../datasets/MAO/dataset.ds', | |||
| }, | |||
| }, # node/edge symb | |||
| { | |||
| 'name': 'MUTAG', | |||
| 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| 'extra_params': { | |||
| 'am_sp_al_nl_el': [0, 0, 3, 1, 2] | |||
| } | |||
| }, | |||
| }, # node/edge symb | |||
| { | |||
| 'name': 'Alkane', | |||
| 'dataset': '../datasets/Alkane/dataset.ds', | |||
| 'task': 'regression', | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', | |||
| }, | |||
| { | |||
| 'name': 'BZR', | |||
| 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt' | |||
| }, | |||
| }, # contains single node graph, node symb | |||
| # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||
| { | |||
| 'name': 'COX2', | |||
| 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt' | |||
| }, | |||
| 'name': 'Mutagenicity', | |||
| 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt' | |||
| }, # node/edge symb | |||
| { | |||
| 'name': 'ENZYMES', | |||
| 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' | |||
| }, | |||
| { | |||
| 'name': 'DHFR', | |||
| 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt' | |||
| }, | |||
| { | |||
| 'name': 'SYNTHETIC', | |||
| 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt' | |||
| }, | |||
| { | |||
| 'name': 'MSRC9', | |||
| 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt' | |||
| }, | |||
| { | |||
| 'name': 'MSRC21', | |||
| 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt' | |||
| }, | |||
| }, # node symb/nsymb | |||
| # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||
| { | |||
| 'name': 'FIRSTMM_DB', | |||
| 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt' | |||
| }, | |||
| { | |||
| 'name': 'PROTEINS', | |||
| 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt' | |||
| }, | |||
| { | |||
| 'name': 'PROTEINS_full', | |||
| 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt' | |||
| 'name': 'Letter-med', | |||
| 'dataset': '../datasets/Letter-med/Letter-med_A.txt' | |||
| }, | |||
| # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||
| # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||
| # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||
| # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||
| # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||
| # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||
| # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||
| { | |||
| 'name': 'D&D', | |||
| 'dataset': '../datasets/D&D/DD.mat', | |||
| 'extra_params': { | |||
| 'am_sp_al_nl_el': [0, 1, 2, 1, -1] | |||
| } | |||
| }, | |||
| { | |||
| 'name': 'AIDS', | |||
| 'dataset': '../datasets/AIDS/AIDS_A.txt' | |||
| }, | |||
| { | |||
| 'name': 'NCI1', | |||
| 'dataset': '../datasets/NCI1/NCI1.mat', | |||
| 'extra_params': { | |||
| 'am_sp_al_nl_el': [1, 1, 2, 0, -1] | |||
| } | |||
| }, | |||
| { | |||
| 'name': 'NCI109', | |||
| 'dataset': '../datasets/NCI109/NCI109.mat', | |||
| 'extra_params': { | |||
| 'am_sp_al_nl_el': [1, 1, 2, 0, -1] | |||
| } | |||
| }, | |||
| { | |||
| 'name': 'NCI-HIV', | |||
| 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
| 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt', | |||
| }, | |||
| }, # node symb | |||
| # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||
| # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
| # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
| # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
| # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||
| # # not working below | |||
| # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
| @@ -110,3 +70,116 @@ dslist = [ | |||
| # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||
| # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
| ] | |||
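Each dslist entry shares one schema; a minimal annotated example assembled from the entries above (the default noted in the comment reflects how the runner below consumes a missing 'task' key):

```python
ds = {
    'name': 'Acyclic',                                # display name
    'dataset': '../datasets/acyclic/dataset_bps.ds',  # graph data file
    'task': 'regression',                             # runner falls back to 'classification' if absent
    # optional keys used by some entries:
    # 'dataset_y': path to a separate targets file
    # 'extra_params': loader hints, e.g. {'am_sp_al_nl_el': [0, 0, 3, 1, 2]} for .mat files
}
```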
| # dslist = [ | |||
| # { | |||
| # 'name': 'Acyclic', | |||
| # 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| # 'task': 'regression' | |||
| # }, # node_labeled | |||
| # { | |||
| # 'name': 'COIL-DEL', | |||
| # 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt' | |||
| # }, # edge_labeled | |||
| # { | |||
| # 'name': 'PAH', | |||
| # 'dataset': '../datasets/PAH/dataset.ds', | |||
| # }, # unlabeled | |||
| # { | |||
| # 'name': 'Mutagenicity', | |||
| # 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt' | |||
| # }, # fully_labeled | |||
| # { | |||
| # 'name': 'MAO', | |||
| # 'dataset': '../datasets/MAO/dataset.ds', | |||
| # }, | |||
| # { | |||
| # 'name': 'MUTAG', | |||
| # 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| # 'extra_params': { | |||
| # 'am_sp_al_nl_el': [0, 0, 3, 1, 2] | |||
| # } | |||
| # }, | |||
| # { | |||
| # 'name': 'Alkane', | |||
| # 'dataset': '../datasets/Alkane/dataset.ds', | |||
| # 'task': 'regression', | |||
| # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', | |||
| # }, | |||
| # { | |||
| # 'name': 'BZR', | |||
| # 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'COX2', | |||
| # 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'ENZYMES', | |||
| # 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'DHFR', | |||
| # 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'SYNTHETIC', | |||
| # 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'MSRC9', | |||
| # 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'MSRC21', | |||
| # 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'FIRSTMM_DB', | |||
| # 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'PROTEINS', | |||
| # 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'PROTEINS_full', | |||
| # 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'D&D', | |||
| # 'dataset': '../datasets/D&D/DD.mat', | |||
| # 'extra_params': { | |||
| # 'am_sp_al_nl_el': [0, 1, 2, 1, -1] | |||
| # } | |||
| # }, | |||
| # { | |||
| # 'name': 'AIDS', | |||
| # 'dataset': '../datasets/AIDS/AIDS_A.txt' | |||
| # }, | |||
| # { | |||
| # 'name': 'NCI1', | |||
| # 'dataset': '../datasets/NCI1/NCI1.mat', | |||
| # 'extra_params': { | |||
| # 'am_sp_al_nl_el': [1, 1, 2, 0, -1] | |||
| # } | |||
| # }, | |||
| # { | |||
| # 'name': 'NCI109', | |||
| # 'dataset': '../datasets/NCI109/NCI109.mat', | |||
| # 'extra_params': { | |||
| # 'am_sp_al_nl_el': [1, 1, 2, 0, -1] | |||
| # } | |||
| # }, | |||
| # { | |||
| # 'name': 'NCI-HIV', | |||
| # 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
| # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt', | |||
| # }, | |||
| # # # not working below | |||
| # # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
| # # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||
| # # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||
| # # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
| # ] | |||
| @@ -1,56 +1,157 @@ | |||
| import functools | |||
| from libs import * | |||
| from pygraph.kernels.spKernel import spkernel | |||
| from pygraph.utils.kernels import deltakernel, kernelsum | |||
| from sklearn.metrics.pairwise import rbf_kernel | |||
| dslist = [ | |||
| # {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node_labeled | |||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge_labeled | |||
| # dslist = [ | |||
| # {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb | |||
| # # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled | |||
| {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # fully_labeled | |||
| # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, | |||
| # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb | |||
| # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, | |||
| # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, | |||
| # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, | |||
| # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, | |||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, | |||
| # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, | |||
| # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, | |||
| # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, | |||
| # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, | |||
| # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, | |||
| # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
| # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb | |||
| # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb | |||
| # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||
| # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||
| # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||
| # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||
| # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||
| # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, | |||
| # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, | |||
| # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, | |||
| # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, | |||
| # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
| # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, | |||
| # # not working below | |||
| # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
| # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||
| # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||
| # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
| ] | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
| # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||
| # # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', | |||
| # # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
| # # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', | |||
| # # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
| # # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
| # # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||
| # # # not working below | |||
| # # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
| # # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||
| # # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||
| # # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
| # ] | |||
| import ast | |||
| ds = ast.literal_eval(sys.argv[1]) | |||
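With this change the script takes one dslist entry per invocation instead of looping over the whole list. A hedged sketch of such a call (the script name and the fallback literal are assumptions, not something the repository documents):

```python
import ast
import sys

# Hypothetical invocation:
#   python run_spkernel.py "{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}"
arg = sys.argv[1] if len(sys.argv) > 1 else "{'name': 'Acyclic', 'task': 'regression'}"
ds = ast.literal_eval(arg)  # parses the dict literal without executing code
print(ds['name'], ds.get('task', 'classification'))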
| estimator = spkernel | |||
| param_grid_precomputed = {} | |||
| param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)}, | |||
| {'alpha': np.logspace(-10, 10, num = 41, base = 10)}] | |||
| for ds in dslist: | |||
| print() | |||
| print(ds['name']) | |||
| model_selection_for_precomputed_kernel( | |||
| ds['dataset'], estimator, param_grid_precomputed, | |||
| (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), | |||
| (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, | |||
| datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||
| extra_params=(ds['extra_params'] if 'extra_params' in ds else None)) | |||
| print() | |||
| mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel) | |||
| param_grid_precomputed = { | |||
| 'node_kernels': [{ | |||
| 'symb': deltakernel, | |||
| 'nsymb': rbf_kernel, | |||
| 'mix': mixkernel | |||
| }] | |||
| } | |||
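The node_kernels dict supplies one base kernel per label type: deltakernel for symbolic labels, rbf_kernel for numeric attributes, and their combination for mixed labels. A toy sketch of the same shape; delta_sketch and kernelsum_sketch below are illustrative stand-ins, not the pygraph.utils.kernels definitions:

```python
import functools
from sklearn.metrics.pairwise import rbf_kernel

def delta_sketch(x, y):
    # stand-in for deltakernel: 1 when symbolic labels match, else 0
    return 1.0 if x == y else 0.0

def kernelsum_sketch(k1, k2, a, b, va, vb):
    # stand-in for kernelsum: combine a symbolic and a numeric kernel
    return k1(a, b) + k2(va, vb)

mix_sketch = functools.partial(kernelsum_sketch, delta_sketch, rbf_kernel)
print(mix_sketch('C', 'C', [[0.1, 0.2]], [[0.1, 0.3]]))  # delta part + RBF part
```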
| param_grid = [{ | |||
| 'C': np.logspace(-10, 10, num=41, base=10) | |||
| }, { | |||
| 'alpha': np.logspace(-10, 10, num=41, base=10) | |||
| }] | |||
| print() | |||
| print(ds['name']) | |||
| model_selection_for_precomputed_kernel( | |||
| ds['dataset'], | |||
| estimator, | |||
| param_grid_precomputed, | |||
| (param_grid[1] | |||
| if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), | |||
| (ds['task'] if 'task' in ds else 'classification'), | |||
| NUM_TRIALS=30, | |||
| datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||
| extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||
| ds_name=ds['name']) | |||
| # %lprun -f spkernel \ | |||
| # model_selection_for_precomputed_kernel( \ | |||
| # ds['dataset'], estimator, param_grid_precomputed, \ | |||
| # (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \ | |||
| # (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \ | |||
| # datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \ | |||
| # extra_params=(ds['extra_params'] if 'extra_params' in ds else None)) | |||
| print() | |||
| # import functools | |||
| # from libs import * | |||
| # from pygraph.kernels.spKernel import spkernel | |||
| # from pygraph.utils.kernels import deltakernel, kernelsum | |||
| # from sklearn.metrics.pairwise import rbf_kernel | |||
| # dslist = [ | |||
| # {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb | |||
| # # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| # # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled | |||
| # # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb | |||
| # # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| # # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
| # # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| # # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb | |||
| # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb | |||
| # # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||
| # # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||
| # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||
| # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||
| # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
| # # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
| # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||
| # # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', | |||
| # # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
| # # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', | |||
| # # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
| # # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
| # # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||
| # # # not working below | |||
| # # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
| # # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||
| # # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||
| # # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
| # ] | |||
| # estimator = spkernel | |||
| # mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel) | |||
| # param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]} | |||
| # param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)}, | |||
| # {'alpha': np.logspace(-10, 10, num = 41, base = 10)}] | |||
| # for ds in dslist: | |||
| # print() | |||
| # print(ds['name']) | |||
| # model_selection_for_precomputed_kernel( | |||
| # ds['dataset'], estimator, param_grid_precomputed, | |||
| # (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), | |||
| # (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, | |||
| # datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||
| # extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||
| # ds_name=ds['name']) | |||
| # # %lprun -f spkernel \ | |||
| # # model_selection_for_precomputed_kernel( \ | |||
| # # ds['dataset'], estimator, param_grid_precomputed, \ | |||
| # # (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \ | |||
| # # (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \ | |||
| # # datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \ | |||
| # # extra_params=(ds['extra_params'] if 'extra_params' in ds else None)) | |||
| # print() | |||
| @@ -1 +0,0 @@ | |||
| ljia@ljia-Precision-7520.4716:1530265749 | |||
| @@ -9,6 +9,9 @@ sys.path.insert(0, "../") | |||
| from tqdm import tqdm | |||
| import time | |||
| from itertools import combinations_with_replacement, product | |||
| from functools import partial | |||
| from joblib import Parallel, delayed | |||
| from multiprocessing import Pool | |||
| import networkx as nx | |||
| import numpy as np | |||
| @@ -17,7 +20,11 @@ from pygraph.utils.utils import getSPGraph | |||
| from pygraph.utils.graphdataset import get_dataset_attributes | |||
| def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None): | |||
| def spkernel(*args, | |||
| node_label='atom', | |||
| edge_weight=None, | |||
| node_kernels=None, | |||
| n_jobs=None): | |||
| """Calculate shortest-path kernels between graphs. | |||
| Parameters | |||
| @@ -70,180 +77,344 @@ def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None): | |||
| if len(Gn) != len_gn: | |||
| print('\n %d graphs are removed as they don\'t contain edges.\n' % | |||
| (len_gn - len(Gn))) | |||
| start_time = time.time() | |||
| pool = Pool(n_jobs) | |||
| # get shortest path graphs of Gn | |||
| Gn = [ | |||
| getSPGraph(G, edge_weight=edge_weight) | |||
| for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout) | |||
| ] | |||
| getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight) | |||
| result_sp = pool.map(getsp_partial, range(0, len(Gn))) | |||
| for i in result_sp: | |||
| Gn[i[0]] = i[1] | |||
| # Gn = [ | |||
| # getSPGraph(G, edge_weight=edge_weight) | |||
| # for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout) | |||
| # ] | |||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| pbar = tqdm( | |||
| total=((len(Gn) + 1) * len(Gn) / 2), | |||
| desc='calculating kernels', | |||
| file=sys.stdout) | |||
| do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) | |||
| itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
| # chunksize = 2000 # int(len(list(itr)) / n_jobs) | |||
| # for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize)): | |||
| # Kmatrix[i][j] = kernel | |||
| # Kmatrix[j][i] = kernel | |||
| result_perf = pool.map(do_partial, itr) | |||
| pool.close() | |||
| pool.join() | |||
| # result_perf = Parallel( | |||
| # n_jobs=n_jobs, verbose=10)( | |||
| # delayed(do_partial)(ij) | |||
| # for ij in combinations_with_replacement(range(0, len(Gn)), 2)) | |||
| # result_perf = [ | |||
| # do_partial(ij) | |||
| # for ij in combinations_with_replacement(range(0, len(Gn)), 2) | |||
| # ] | |||
| for i in result_perf: | |||
| Kmatrix[i[0]][i[1]] = i[2] | |||
| Kmatrix[i[1]][i[0]] = i[2] | |||
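The pattern above (Pool.map a partially-applied worker over all index pairs, then scatter the (i, j, kernel) results into the symmetric matrix) in a self-contained toy form; pair_kernel below is a stand-in, not spkernel_do:

```python
from functools import partial
from itertools import combinations_with_replacement
from multiprocessing import Pool

import numpy as np

def pair_kernel(data, ij):
    # stand-in worker: returns (i, j, value) like spkernel_do
    i, j = ij
    return i, j, float(data[i] * data[j])

if __name__ == '__main__':
    data = [1.0, 2.0, 3.0]
    K = np.zeros((len(data), len(data)))
    with Pool(2) as pool:
        pairs = combinations_with_replacement(range(len(data)), 2)
        for i, j, k in pool.map(partial(pair_kernel, data), pairs):
            K[i][j] = K[j][i] = k
    print(K)
```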
| # pbar = tqdm( | |||
| # total=((len(Gn) + 1) * len(Gn) / 2), | |||
| # desc='calculating kernels', | |||
| # file=sys.stdout) | |||
| # if ds_attrs['node_labeled']: | |||
| # # node symb and non-symb labeled | |||
| # if ds_attrs['node_attr_dim'] > 0: | |||
| # if ds_attrs['is_directed']: | |||
| # for i, j in combinations_with_replacement( | |||
| # range(0, len(Gn)), 2): | |||
| # for e1, e2 in product( | |||
| # Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| # if e1[2]['cost'] == e2[2]['cost']: | |||
| # kn = node_kernels['mix'] | |||
| # try: | |||
| # n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| # i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| # j].nodes[e2[1]] | |||
| # kn1 = kn(n11[node_label], n21[node_label], [ | |||
| # n11['attributes'] | |||
| # ], [n21['attributes']]) * kn( | |||
| # n12[node_label], n22[node_label], | |||
| # [n12['attributes']], [n22['attributes']]) | |||
| # Kmatrix[i][j] += kn1 | |||
| # except KeyError: # missing labels or attributes | |||
| # pass | |||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||
| # pbar.update(1) | |||
| # else: | |||
| # for i, j in combinations_with_replacement( | |||
| # range(0, len(Gn)), 2): | |||
| # for e1, e2 in product( | |||
| # Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| # if e1[2]['cost'] == e2[2]['cost']: | |||
| # kn = node_kernels['mix'] | |||
| # try: | |||
| # # each edge walk is counted twice, starting from both its extreme nodes. | |||
| # n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| # i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| # j].nodes[e2[1]] | |||
| # kn1 = kn(n11[node_label], n21[node_label], [ | |||
| # n11['attributes'] | |||
| # ], [n21['attributes']]) * kn( | |||
| # n12[node_label], n22[node_label], | |||
| # [n12['attributes']], [n22['attributes']]) | |||
| # kn2 = kn(n11[node_label], n22[node_label], [ | |||
| # n11['attributes'] | |||
| # ], [n22['attributes']]) * kn( | |||
| # n12[node_label], n21[node_label], | |||
| # [n12['attributes']], [n21['attributes']]) | |||
| # Kmatrix[i][j] += kn1 + kn2 | |||
| # except KeyError: # missing labels or attributes | |||
| # pass | |||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||
| # pbar.update(1) | |||
| # # node symb labeled | |||
| # else: | |||
| # if ds_attrs['is_directed']: | |||
| # for i, j in combinations_with_replacement( | |||
| # range(0, len(Gn)), 2): | |||
| # for e1, e2 in product( | |||
| # Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| # if e1[2]['cost'] == e2[2]['cost']: | |||
| # kn = node_kernels['symb'] | |||
| # try: | |||
| # n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| # i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| # j].nodes[e2[1]] | |||
| # kn1 = kn(n11[node_label], | |||
| # n21[node_label]) * kn( | |||
| # n12[node_label], n22[node_label]) | |||
| # Kmatrix[i][j] += kn1 | |||
| # except KeyError: # missing labels | |||
| # pass | |||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||
| # pbar.update(1) | |||
| # else: | |||
| # for i, j in combinations_with_replacement( | |||
| # range(0, len(Gn)), 2): | |||
| # for e1, e2 in product( | |||
| # Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| # if e1[2]['cost'] == e2[2]['cost']: | |||
| # kn = node_kernels['symb'] | |||
| # try: | |||
| # # each edge walk is counted twice, starting from both its extreme nodes. | |||
| # n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| # i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| # j].nodes[e2[1]] | |||
| # kn1 = kn(n11[node_label], | |||
| # n21[node_label]) * kn( | |||
| # n12[node_label], n22[node_label]) | |||
| # kn2 = kn(n11[node_label], | |||
| # n22[node_label]) * kn( | |||
| # n12[node_label], n21[node_label]) | |||
| # Kmatrix[i][j] += kn1 + kn2 | |||
| # except KeyError: # missing labels | |||
| # pass | |||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||
| # pbar.update(1) | |||
| # else: | |||
| # # node non-symb labeled | |||
| # if ds_attrs['node_attr_dim'] > 0: | |||
| # if ds_attrs['is_directed']: | |||
| # for i, j in combinations_with_replacement( | |||
| # range(0, len(Gn)), 2): | |||
| # for e1, e2 in product( | |||
| # Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| # if e1[2]['cost'] == e2[2]['cost']: | |||
| # kn = node_kernels['nsymb'] | |||
| # try: | |||
| # # each edge walk is counted twice, starting from both its extreme nodes. | |||
| # n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| # i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| # j].nodes[e2[1]] | |||
| # kn1 = kn([n11['attributes']], | |||
| # [n21['attributes']]) * kn( | |||
| # [n12['attributes']], | |||
| # [n22['attributes']]) | |||
| # Kmatrix[i][j] += kn1 | |||
| # except KeyError: # missing attributes | |||
| # pass | |||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||
| # pbar.update(1) | |||
| # else: | |||
| # for i, j in combinations_with_replacement( | |||
| # range(0, len(Gn)), 2): | |||
| # for e1, e2 in product( | |||
| # Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| # if e1[2]['cost'] == e2[2]['cost']: | |||
| # kn = node_kernels['nsymb'] | |||
| # try: | |||
| # # each edge walk is counted twice, starting from both its extreme nodes. | |||
| # n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| # i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| # j].nodes[e2[1]] | |||
| # kn1 = kn([n11['attributes']], | |||
| # [n21['attributes']]) * kn( | |||
| # [n12['attributes']], | |||
| # [n22['attributes']]) | |||
| # kn2 = kn([n11['attributes']], | |||
| # [n22['attributes']]) * kn( | |||
| # [n12['attributes']], | |||
| # [n21['attributes']]) | |||
| # Kmatrix[i][j] += kn1 + kn2 | |||
| # except KeyError: # missing attributes | |||
| # pass | |||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||
| # pbar.update(1) | |||
| # # node unlabeled | |||
| # else: | |||
| # for i, j in combinations_with_replacement(range(0, len(Gn)), 2): | |||
| # for e1, e2 in product( | |||
| # Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| # if e1[2]['cost'] == e2[2]['cost']: | |||
| # Kmatrix[i][j] += 1 | |||
| # Kmatrix[j][i] = Kmatrix[i][j] | |||
| # pbar.update(1) | |||
| run_time = time.time() - start_time | |||
| print( | |||
| "\n --- shortest path kernel matrix of size %d built in %s seconds ---" | |||
| % (len(Gn), run_time)) | |||
| return Kmatrix, run_time, idx | |||
| def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||
| i = ij[0] | |||
| j = ij[1] | |||
| Kmatrix = 0 | |||
| if ds_attrs['node_labeled']: | |||
| # node symb and non-symb labeled | |||
| if ds_attrs['node_attr_dim'] > 0: | |||
| if ds_attrs['is_directed']: | |||
| for i, j in combinations_with_replacement( | |||
| range(0, len(Gn)), 2): | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['mix'] | |||
| try: | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn(n11[node_label], n21[node_label], [ | |||
| n11['attributes'] | |||
| ], [n21['attributes']]) * kn( | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['mix'] | |||
| try: | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn( | |||
| n11[node_label], n21[node_label], | |||
| [n11['attributes']], [n21['attributes']]) * kn( | |||
| n12[node_label], n22[node_label], | |||
| [n12['attributes']], [n22['attributes']]) | |||
| Kmatrix[i][j] += kn1 | |||
| except KeyError: # missing labels or attributes | |||
| pass | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| pbar.update(1) | |||
| Kmatrix += kn1 | |||
| except KeyError: # missing labels or attributes | |||
| pass | |||
| else: | |||
| for i, j in combinations_with_replacement( | |||
| range(0, len(Gn)), 2): | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['mix'] | |||
| try: | |||
| # each edge walk is counted twice, starting from both its extreme nodes. | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn(n11[node_label], n21[node_label], [ | |||
| n11['attributes'] | |||
| ], [n21['attributes']]) * kn( | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['mix'] | |||
| try: | |||
| # each edge walk is counted twice, starting from both its extreme nodes. | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn( | |||
| n11[node_label], n21[node_label], | |||
| [n11['attributes']], [n21['attributes']]) * kn( | |||
| n12[node_label], n22[node_label], | |||
| [n12['attributes']], [n22['attributes']]) | |||
| kn2 = kn(n11[node_label], n22[node_label], [ | |||
| n11['attributes'] | |||
| ], [n22['attributes']]) * kn( | |||
| kn2 = kn( | |||
| n11[node_label], n22[node_label], | |||
| [n11['attributes']], [n22['attributes']]) * kn( | |||
| n12[node_label], n21[node_label], | |||
| [n12['attributes']], [n21['attributes']]) | |||
| Kmatrix[i][j] += kn1 + kn2 | |||
| except KeyError: # missing labels or attributes | |||
| pass | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| pbar.update(1) | |||
| Kmatrix += kn1 + kn2 | |||
| except KeyError: # missing labels or attributes | |||
| pass | |||
| # node symb labeled | |||
| else: | |||
| if ds_attrs['is_directed']: | |||
| for i, j in combinations_with_replacement( | |||
| range(0, len(Gn)), 2): | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['symb'] | |||
| try: | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn(n11[node_label], | |||
| n21[node_label]) * kn( | |||
| n12[node_label], n22[node_label]) | |||
| Kmatrix[i][j] += kn1 | |||
| except KeyError: # missing labels | |||
| pass | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| pbar.update(1) | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['symb'] | |||
| try: | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn(n11[node_label], n21[node_label]) * kn( | |||
| n12[node_label], n22[node_label]) | |||
| Kmatrix += kn1 | |||
| except KeyError: # missing labels | |||
| pass | |||
| else: | |||
| for i, j in combinations_with_replacement( | |||
| range(0, len(Gn)), 2): | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['symb'] | |||
| try: | |||
| # each edge walk is counted twice, starting from both its extreme nodes. | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn(n11[node_label], | |||
| n21[node_label]) * kn( | |||
| n12[node_label], n22[node_label]) | |||
| kn2 = kn(n11[node_label], | |||
| n22[node_label]) * kn( | |||
| n12[node_label], n21[node_label]) | |||
| Kmatrix[i][j] += kn1 + kn2 | |||
| except KeyError: # missing labels | |||
| pass | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| pbar.update(1) | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['symb'] | |||
| try: | |||
| # each edge walk is counted twice, starting from both its extreme nodes. | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn(n11[node_label], n21[node_label]) * kn( | |||
| n12[node_label], n22[node_label]) | |||
| kn2 = kn(n11[node_label], n22[node_label]) * kn( | |||
| n12[node_label], n21[node_label]) | |||
| Kmatrix += kn1 + kn2 | |||
| except KeyError: # missing labels | |||
| pass | |||
| else: | |||
| # node non-symb labeled | |||
| if ds_attrs['node_attr_dim'] > 0: | |||
| if ds_attrs['is_directed']: | |||
| for i, j in combinations_with_replacement( | |||
| range(0, len(Gn)), 2): | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['nsymb'] | |||
| try: | |||
| # each edge walk is counted twice, starting from both its extreme nodes. | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn([n11['attributes']], | |||
| [n21['attributes']]) * kn( | |||
| [n12['attributes']], | |||
| [n22['attributes']]) | |||
| Kmatrix[i][j] += kn1 | |||
| except KeyError: # missing attributes | |||
| pass | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| pbar.update(1) | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['nsymb'] | |||
| try: | |||
| # each edge walk is counted twice, starting from both its extreme nodes. | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn( | |||
| [n11['attributes']], [n21['attributes']]) * kn( | |||
| [n12['attributes']], [n22['attributes']]) | |||
| Kmatrix += kn1 | |||
| except KeyError: # missing attributes | |||
| pass | |||
| else: | |||
| for i, j in combinations_with_replacement( | |||
| range(0, len(Gn)), 2): | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| kn = node_kernels['nsymb'] | |||
| try: | |||
| # each edge walk is counted twice, starting from both its extreme nodes. | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn([n11['attributes']], | |||
| [n21['attributes']]) * kn( | |||
| [n12['attributes']], | |||
| [n22['attributes']]) | |||
| kn2 = kn([n11['attributes']], | |||
| [n22['attributes']]) * kn( | |||
| [n12['attributes']], | |||
| [n21['attributes']]) | |||
| Kmatrix[i][j] += kn1 + kn2 | |||
| except KeyError: # missing attributes | |||
| pass | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| pbar.update(1) | |||
| # node unlabeled | |||
| else: | |||
| for i, j in combinations_with_replacement(range(0, len(Gn)), 2): | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| Kmatrix[i][j] += 1 | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| pbar.update(1) | |||
| kn = node_kernels['nsymb'] | |||
| try: | |||
| # each edge walk is counted twice, starting from both its extreme nodes. | |||
| n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[ | |||
| i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[ | |||
| j].nodes[e2[1]] | |||
| kn1 = kn( | |||
| [n11['attributes']], [n21['attributes']]) * kn( | |||
| [n12['attributes']], [n22['attributes']]) | |||
| kn2 = kn( | |||
| [n11['attributes']], [n22['attributes']]) * kn( | |||
| [n12['attributes']], [n21['attributes']]) | |||
| Kmatrix += kn1 + kn2 | |||
| except KeyError: # missing attributes | |||
| pass | |||
| # node unlabeled | |||
| else: | |||
| for e1, e2 in product( | |||
| Gn[i].edges(data=True), Gn[j].edges(data=True)): | |||
| if e1[2]['cost'] == e2[2]['cost']: | |||
| Kmatrix += 1 | |||
| run_time = time.time() - start_time | |||
| print( | |||
| "\n --- shortest path kernel matrix of size %d built in %s seconds ---" | |||
| % (len(Gn), run_time)) | |||
| return i, j, Kmatrix | |||
| return Kmatrix, run_time, idx | |||
| def wrap_getSPGraph(Gn, weight, i): | |||
| return i, getSPGraph(Gn[i], edge_weight=weight) | |||
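The module-level wrapper exists because multiprocessing.Pool pickles the callable it maps: lambdas and local closures fail to pickle, while a top-level function bound with functools.partial works. A toy illustration (wrap_square is a hypothetical stand-in for wrap_getSPGraph):

```python
from functools import partial
from multiprocessing import Pool

def wrap_square(xs, i):
    # top-level, so it can be pickled and shipped to worker processes
    return i, xs[i] * xs[i]

if __name__ == '__main__':
    xs = [1, 2, 3]
    with Pool(2) as pool:
        print(sorted(pool.map(partial(wrap_square, xs), range(len(xs)))))
    # pool.map(lambda i: xs[i] ** 2, range(3)) would fail to pickle the lambda
```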
| @@ -1,11 +1,32 @@ | |||
| import numpy as np | |||
| from matplotlib import pyplot as plt | |||
| from sklearn.kernel_ridge import KernelRidge | |||
| from sklearn.svm import SVC | |||
| from sklearn.metrics import accuracy_score, mean_squared_error | |||
| from sklearn.model_selection import KFold, train_test_split, ParameterGrid | |||
| from joblib import Parallel, delayed | |||
| from multiprocessing import Pool | |||
| from functools import partial | |||
| import sys | |||
| sys.path.insert(0, "../") | |||
| import os | |||
| import time | |||
| from os.path import basename, splitext | |||
| from pygraph.utils.graphfiles import loadDataset | |||
| from tqdm import tqdm | |||
| def model_selection_for_precomputed_kernel(datafile, estimator, | |||
| param_grid_precomputed, param_grid, | |||
| model_type, NUM_TRIALS=30, | |||
| def model_selection_for_precomputed_kernel(datafile, | |||
| estimator, | |||
| param_grid_precomputed, | |||
| param_grid, | |||
| model_type, | |||
| NUM_TRIALS=30, | |||
| datafile_y=None, | |||
| extra_params=None, | |||
| ds_name='ds-unknown'): | |||
| ds_name='ds-unknown', | |||
| n_jobs=1): | |||
| """Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results. | |||
| Parameters | |||
| @@ -40,94 +61,101 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||
| >>> | |||
| >>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression') | |||
| """ | |||
| import numpy as np | |||
| from matplotlib import pyplot as plt | |||
| from sklearn.kernel_ridge import KernelRidge | |||
| from sklearn.svm import SVC | |||
| from sklearn.metrics import accuracy_score, mean_squared_error | |||
| from sklearn.model_selection import KFold, train_test_split, ParameterGrid | |||
| import sys | |||
| sys.path.insert(0, "../") | |||
| import os | |||
| from os.path import basename, splitext | |||
| from pygraph.utils.graphfiles import loadDataset | |||
| from tqdm import tqdm | |||
| tqdm.monitor_interval = 0 | |||
| results_dir = '../notebooks/results/' + estimator.__name__ | |||
| if not os.path.exists(results_dir): | |||
| os.makedirs(results_dir) | |||
| # open file to save all results for this dataset. | |||
| with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults: | |||
| fresults.write('# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n') | |||
| # setup the model type | |||
| model_type = model_type.lower() | |||
| if model_type != 'regression' and model_type != 'classification': | |||
| raise Exception( | |||
| 'The model type is incorrect! Please choose from regression or classification.') | |||
| print() | |||
| print('--- This is a %s problem ---' % model_type) | |||
| fresults.write('This is a %s problem.\n\n' % model_type) | |||
| # a string to save all the results. | |||
| str_fw = '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' | |||
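This is the change the commit message describes: instead of writing each fragment through the open file handle (the fresults.write calls being removed above), everything is appended to str_fw in memory and flushed to disk once at the end, avoiding repeated small writes on systems where CPU and disk are separated. The essence of the pattern, as a sketch (the path below is hypothetical):

```python
report = ''                                  # in-memory buffer, like str_fw above
report += 'This is a %s problem.\n\n' % 'regression'
report += '\nII. Performance.\n\n'
with open('/tmp/results.txt', 'w') as f:     # single write at the very end
    f.write(report)
```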
| # Load the dataset | |||
| print() | |||
| print('\nI. Loading dataset from file...') | |||
| dataset, y = loadDataset(datafile, filename_y=datafile_y, extra_params=extra_params) | |||
| # setup the model type | |||
| model_type = model_type.lower() | |||
| if model_type != 'regression' and model_type != 'classification': | |||
| raise Exception( | |||
| 'The model type is incorrect! Please choose from regression or classification.' | |||
| ) | |||
| print() | |||
| print('--- This is a %s problem ---' % model_type) | |||
| str_fw += 'This is a %s problem.\n\n' % model_type | |||
| # Load the dataset | |||
| print() | |||
| print('\nI. Loading dataset from file...') | |||
| dataset, y = loadDataset( | |||
| datafile, filename_y=datafile_y, extra_params=extra_params) | |||
| # import matplotlib.pyplot as plt | |||
| # import matplotlib.pyplot as plt | |||
| # import networkx as nx | |||
| # nx.draw_networkx(dataset[30]) | |||
| # plt.show() | |||
| # Grid of parameters with a discrete number of values for each. | |||
| param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) | |||
| param_list = list(ParameterGrid(param_grid)) | |||
| # np.savetxt(results_name_pre + 'param_grid_precomputed.dt', | |||
| # [[key, value] for key, value in sorted(param_grid_precomputed)]) | |||
| # np.savetxt(results_name_pre + 'param_grid.dt', | |||
| # [[key, value] for key, value in sorted(param_grid)]) | |||
| # Grid of parameters with a discrete number of values for each. | |||
| param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) | |||
| param_list = list(ParameterGrid(param_grid)) | |||
| # np.savetxt(results_name_pre + 'param_grid_precomputed.dt', | |||
| # [[key, value] for key, value in sorted(param_grid_precomputed)]) | |||
| # np.savetxt(results_name_pre + 'param_grid.dt', | |||
| # [[key, value] for key, value in sorted(param_grid)]) | |||
| gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed | |||
| gram_matrix_time = [] # a list to store time to calculate gram matrices | |||
| param_list_pre_revised = [] # list to store param grids precomputed ignoring the useless ones | |||
| gram_matrices = [ | |||
| ] # a list to store gram matrices for all param_grid_precomputed | |||
| gram_matrix_time = [ | |||
| ] # a list to store time to calculate gram matrices | |||
| param_list_pre_revised = [ | |||
| ] # list to store param grids precomputed ignoring the useless ones | |||
| # calculate all gram matrices | |||
| print() | |||
| print('2. Calculating gram matrices. This could take a while...') | |||
| str_fw += '\nI. Gram matrices.\n\n' | |||
| tts = time.time() # start training time | |||
| nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN) | |||
| for idx, params_out in enumerate(param_list_precomputed): | |||
| params_out['n_jobs'] = n_jobs | |||
| rtn_data = estimator(dataset, **params_out) | |||
| Kmatrix = rtn_data[0] | |||
| current_run_time = rtn_data[1] | |||
| if len(rtn_data) == 3: | |||
| idx_trim = rtn_data[2] # the index of trimmed graph list | |||
| y = [y[idx] for idx in idx_trim] | |||
| Kmatrix_diag = Kmatrix.diagonal().copy() | |||
| # remove graphs whose kernels with themselves are zeros | |||
| nb_g_ignore = 0 | |||
| for idx, diag in enumerate(Kmatrix_diag): | |||
| if diag == 0: | |||
| Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0) | |||
| Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1) | |||
| nb_g_ignore += 1 | |||
| # normalization | |||
| for i in range(len(Kmatrix)): | |||
| for j in range(i, len(Kmatrix)): | |||
| Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
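This double loop performs cosine normalization, K[i][j] / sqrt(K[i][i] * K[j][j]), so the Gram matrix ends up with a unit diagonal. A standalone vectorized equivalent (assuming, as the zero-diagonal removal above guarantees, that all self-kernels are non-zero; the toy matrix is an assumption):

```python
import numpy as np

K = np.array([[4.0, 2.0],
              [2.0, 9.0]])
d = np.sqrt(K.diagonal().copy())   # copy the diagonal before K changes, as above
K_norm = K / np.outer(d, d)        # cosine normalization; unit diagonal
print(K_norm)                      # [[1., 0.333...], [0.333..., 1.]]
```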
| # calculate all gram matrices | |||
| print() | |||
| print('2. Calculating gram matrices. This could take a while...') | |||
| fresults.write('\nI. Gram matrices.\n\n') | |||
| nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN) | |||
| for idx, params_out in enumerate(param_list_precomputed): | |||
| rtn_data = estimator(dataset, **params_out) | |||
| Kmatrix = rtn_data[0] | |||
| current_run_time = rtn_data[1] | |||
| if len(rtn_data) == 3: | |||
| idx_trim = rtn_data[2] # the index of trimmed graph list | |||
| y = [y[idx] for idx in idx_trim] | |||
| Kmatrix_diag = Kmatrix.diagonal().copy() | |||
| for i in range(len(Kmatrix)): | |||
| for j in range(i, len(Kmatrix)): | |||
| # if Kmatrix_diag[i] != 0 and Kmatrix_diag[j] != 0: | |||
| Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| print() | |||
| if params_out == {}: | |||
| print('the gram matrix is: ') | |||
| fresults.write('the gram matrix is:\n\n') | |||
| else: | |||
| print('the gram matrix with parameters', params_out, 'is: ') | |||
| fresults.write('the gram matrix with parameters %s is:\n\n' % params_out) | |||
| if np.isnan(Kmatrix).any(): # if the matrix contains elements that are not numbers | |||
| if params_out == {}: | |||
| print('the gram matrix is: ') | |||
| str_fw += 'the gram matrix is:\n\n' | |||
| else: | |||
| print('the gram matrix with parameters', params_out, 'is: ') | |||
| str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out | |||
| if len(Kmatrix) < 2: | |||
| nb_gm_ignore += 1 | |||
| print('ignored, as at most one of its diagonal values is non-zero.') | |||
| str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n' | |||
| else: | |||
| if np.isnan(Kmatrix).any( | |||
| ): # if the matrix contains elements that are not numbers | |||
| nb_gm_ignore += 1 | |||
| print('ignored, as it contains elements that are not numbers.') | |||
| fresults.write('ignored, as it contains elements that are not numbers.\n\n') | |||
| str_fw += 'ignored, as it contains elements that are not numbers.\n\n' | |||
| else: | |||
| print(Kmatrix) | |||
| fresults.write(np.array2string(Kmatrix, separator=',', threshold=np.inf, floatmode='unique') + '\n\n') | |||
| str_fw += np.array2string( | |||
| Kmatrix, | |||
| separator=',', | |||
| threshold=np.inf, | |||
| floatmode='unique') + '\n\n' | |||
| plt.matshow(Kmatrix) | |||
| plt.colorbar() | |||
| fig_file_name = results_dir + '/GM[ds]' + ds_name | |||
| @@ -138,115 +166,52 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||
| gram_matrices.append(Kmatrix) | |||
| gram_matrix_time.append(current_run_time) | |||
| param_list_pre_revised.append(params_out) | |||
| print() | |||
| print('{} gram matrices are calculated, {} of which are ignored.'.format(len(param_list_precomputed), nb_gm_ignore)) | |||
| fresults.write('{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)) | |||
| fresults.write('serial numbers of gram matrix figure and their corresponding parameters settings:\n\n') | |||
| fresults.write(''.join(['{}: {}\n'.format(idx, params_out) | |||
| for idx, params_out in enumerate(param_list_precomputed)])) | |||
| if nb_g_ignore > 0: | |||
| print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) | |||
| str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore | |||
| print() | |||
| print( | |||
| '{} gram matrices are calculated, {} of which are ignored.'.format( | |||
| len(param_list_precomputed), nb_gm_ignore)) | |||
| str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore) | |||
| str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' | |||
| str_fw += ''.join([ | |||
| '{}: {}\n'.format(idx, params_out) | |||
| for idx, params_out in enumerate(param_list_precomputed) | |||
| ]) | |||
| print() | |||
| print('3. Fitting and predicting using nested cross validation. This could really take a while...') | |||
| # Arrays to store scores | |||
| train_pref = np.zeros( | |||
| (NUM_TRIALS, len(param_list_pre_revised), len(param_list))) | |||
| val_pref = np.zeros( | |||
| (NUM_TRIALS, len(param_list_pre_revised), len(param_list))) | |||
| test_pref = np.zeros( | |||
| (NUM_TRIALS, len(param_list_pre_revised), len(param_list))) | |||
| # Loop for each trial | |||
| pbar = tqdm(total=NUM_TRIALS * len(param_list_pre_revised) * len(param_list), | |||
| desc='calculate performance', file=sys.stdout) | |||
| for trial in range(NUM_TRIALS): # Test set level | |||
| # loop for each outer param tuple | |||
| for index_out, params_out in enumerate(param_list_pre_revised): | |||
| # split gram matrix and y to app and test sets. | |||
| X_app, X_test, y_app, y_test = train_test_split( | |||
| gram_matrices[index_out], y, test_size=0.1) | |||
| split_index_app = [y.index(y_i) for y_i in y_app if y_i in y] | |||
| # split_index_test = [y.index(y_i) for y_i in y_test if y_i in y] | |||
| X_app = X_app[:, split_index_app] | |||
| X_test = X_test[:, split_index_app] | |||
| y_app = np.array(y_app) | |||
| y_test = np.array(y_test) | |||
| # loop for each inner param tuple | |||
| for index_in, params_in in enumerate(param_list): | |||
| inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial) | |||
| current_train_perf = [] | |||
| current_valid_perf = [] | |||
| current_test_perf = [] | |||
| # For regression use the Kernel Ridge method | |||
| try: | |||
| if model_type == 'regression': | |||
| KR = KernelRidge(kernel='precomputed', **params_in) | |||
| # loop for each split on validation set level | |||
| # validation set level | |||
| for train_index, valid_index in inner_cv.split(X_app): | |||
| KR.fit(X_app[train_index, :] | |||
| [:, train_index], y_app[train_index]) | |||
| # predict on the train, validation and test set | |||
| y_pred_train = KR.predict( | |||
| X_app[train_index, :][:, train_index]) | |||
| y_pred_valid = KR.predict( | |||
| X_app[valid_index, :][:, train_index]) | |||
| y_pred_test = KR.predict(X_test[:, train_index]) | |||
| # root mean squared errors | |||
| current_train_perf.append( | |||
| np.sqrt(mean_squared_error(y_app[train_index], y_pred_train))) | |||
| current_valid_perf.append( | |||
| np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid))) | |||
| current_test_perf.append( | |||
| np.sqrt(mean_squared_error(y_test, y_pred_test))) | |||
| # For classification use SVM | |||
| else: | |||
| KR = SVC(kernel='precomputed', **params_in) | |||
| # loop for each split on validation set level | |||
| # validation set level | |||
| for train_index, valid_index in inner_cv.split(X_app): | |||
| KR.fit(X_app[train_index, :] | |||
| [:, train_index], y_app[train_index]) | |||
| # predict on the train, validation and test set | |||
| y_pred_train = KR.predict( | |||
| X_app[train_index, :][:, train_index]) | |||
| y_pred_valid = KR.predict( | |||
| X_app[valid_index, :][:, train_index]) | |||
| y_pred_test = KR.predict( | |||
| X_test[:, train_index]) | |||
| # accuracy scores | |||
| current_train_perf.append(accuracy_score( | |||
| y_app[train_index], y_pred_train)) | |||
| current_valid_perf.append(accuracy_score( | |||
| y_app[valid_index], y_pred_valid)) | |||
| current_test_perf.append( | |||
| accuracy_score(y_test, y_pred_test)) | |||
| except ValueError: | |||
| print(sys.exc_info()[0]) | |||
| print(params_out, params_in) | |||
| # average performance on inner splits | |||
| train_pref[trial][index_out][index_in] = np.mean( | |||
| current_train_perf) | |||
| val_pref[trial][index_out][index_in] = np.mean( | |||
| current_valid_perf) | |||
| test_pref[trial][index_out][index_in] = np.mean( | |||
| current_test_perf) | |||
| pbar.update(1) | |||
| pbar.clear() | |||
| print() | |||
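For readers unfamiliar with precomputed kernels, the slicing in the loop above is the crux: the model is fit on the train-vs-train block of the gram matrix and predicts from rows that pair evaluation samples with training samples. A minimal sketch with toy data (all names here are illustrative, not the function's real variables):

```python
import numpy as np
from sklearn.kernel_ridge import KernelRidge

X = np.random.rand(10, 3)
K = X @ X.T                              # a valid (linear) kernel gram matrix
y = np.random.rand(10)
train, test = np.arange(8), np.arange(8, 10)

kr = KernelRidge(kernel='precomputed', alpha=1e-3)
kr.fit(K[train][:, train], y[train])     # fit on the train-vs-train block
y_pred = kr.predict(K[test][:, train])   # rows: test samples, cols: train samples
```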
| if len(gram_matrices) == 0: | |||
| print('all gram matrices are ignored, no results obtained.') | |||
| str_fw += '\nall gram matrices are ignored, no results obtained.\n\n' | |||
| else: | |||
| print( | |||
| '3. Fitting and predicting using nested cross validation. This could really take a while...' | |||
| ) | |||
| pool = Pool(n_jobs) | |||
| trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||
| result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||
| train_pref = [item[0] for item in result_perf] | |||
| val_pref = [item[1] for item in result_perf] | |||
| test_pref = [item[2] for item in result_perf] | |||
| pool.close() | |||
| pool.join() | |||
| # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||
| # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS)) | |||
| # train_pref = [item[0] for item in result_perf] | |||
| # val_pref = [item[1] for item in result_perf] | |||
| # test_pref = [item[2] for item in result_perf] | |||
| # pbar.clear() | |||
| # np.save(results_name_pre + 'train_pref.dt', train_pref) | |||
| # np.save(results_name_pre + 'val_pref.dt', val_pref) | |||
| # np.save(results_name_pre + 'test_pref.dt', test_pref) | |||
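The parallelization above follows the standard `Pool` + `partial` pattern: every shared argument is frozen into the partial, so `pool.map` only varies the trial index across workers. A toy sketch of the same pattern (stub function and data, not the real `trial_do`):

```python
from functools import partial
from multiprocessing import Pool

def trial_stub(shared_data, trial):
    # every worker receives the same shared_data; only `trial` varies
    return trial * sum(shared_data)

if __name__ == '__main__':
    with Pool(2) as pool:
        results = pool.map(partial(trial_stub, [1, 2, 3]), range(4))
    print(results)  # [0, 6, 12, 18]
```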
| print() | |||
| print('4. Getting final performance...') | |||
| fresults.write('\nII. Performance.\n\n') | |||
| str_fw += '\nII. Performance.\n\n' | |||
| # averages and confidences of performances on outer trials for each combination of parameters | |||
| average_train_scores = np.mean(train_pref, axis=0) | |||
| average_val_scores = np.mean(val_pref, axis=0) | |||
| @@ -255,53 +220,78 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||
| std_train_scores = np.std(train_pref, axis=0, ddof=1) | |||
| std_val_scores = np.std(val_pref, axis=0, ddof=1) | |||
| std_perf_scores = np.std(test_pref, axis=0, ddof=1) | |||
| if model_type == 'regression': | |||
| best_val_perf = np.amin(average_val_scores) | |||
| else: | |||
| best_val_perf = np.amax(average_val_scores) | |||
| best_params_index = np.where(average_val_scores == best_val_perf) | |||
| # find smallest val std with best val perf. | |||
| best_val_stds = [std_val_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])] | |||
| best_val_stds = [ | |||
| std_val_scores[value][best_params_index[1][idx]] | |||
| for idx, value in enumerate(best_params_index[0]) | |||
| ] | |||
| min_val_std = np.amin(best_val_stds) | |||
| # restrict the lookup to positions that also achieve the best val perf, | |||
| # otherwise any parameter pair sharing the same std could be picked. | |||
| best_params_index = np.where(np.logical_and(average_val_scores == best_val_perf, std_val_scores == min_val_std)) | |||
| best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]] | |||
| best_params_out = [ | |||
| param_list_pre_revised[i] for i in best_params_index[0] | |||
| ] | |||
| best_params_in = [param_list[i] for i in best_params_index[1]] | |||
| print('best_params_out: ', best_params_out) | |||
| print('best_params_in: ', best_params_in) | |||
| print() | |||
| print('best_val_perf: ', best_val_perf) | |||
| print('best_val_std: ', min_val_std) | |||
| fresults.write('best settings of hyper-params to build gram matrix: %s\n' % best_params_out) | |||
| fresults.write('best settings of other hyper-params: %s\n\n' % best_params_in) | |||
| fresults.write('best_val_perf: %s\n' % best_val_perf) | |||
| fresults.write('best_val_std: %s\n' % min_val_std) | |||
| str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out | |||
| str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in | |||
| str_fw += 'best_val_perf: %s\n' % best_val_perf | |||
| str_fw += 'best_val_std: %s\n' % min_val_std | |||
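The selection rule, in short: take all parameter pairs tied on the best mean validation score, then break the tie with the smallest validation std; restricting the final lookup to the tied positions avoids picking a pair that merely shares the same std. A toy numpy sketch (regression case, lower is better):

```python
import numpy as np

avg = np.array([[10.0, 8.0],
                [8.0, 9.0]])        # mean val score per (outer, inner) pair
std = np.array([[1.0, 0.5],
                [0.2, 0.3]])        # std of val score per pair
best = np.amin(avg)                  # 8.0, achieved at (0, 1) and (1, 0)
ties = np.where(avg == best)
stds = [std[i][j] for i, j in zip(*ties)]
winner = np.where(np.logical_and(avg == best, std == np.amin(stds)))
print(winner)                        # (array([1]), array([0]))
```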
| final_performance = [average_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])] | |||
| final_confidence = [std_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])] | |||
| final_performance = [ | |||
| average_perf_scores[value][best_params_index[1][idx]] | |||
| for idx, value in enumerate(best_params_index[0]) | |||
| ] | |||
| final_confidence = [ | |||
| std_perf_scores[value][best_params_index[1][idx]] | |||
| for idx, value in enumerate(best_params_index[0]) | |||
| ] | |||
| print('final_performance: ', final_performance) | |||
| print('final_confidence: ', final_confidence) | |||
| fresults.write('final_performance: %s\n' % final_performance) | |||
| fresults.write('final_confidence: %s\n' % final_confidence) | |||
| train_performance = [average_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])] | |||
| train_std = [std_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])] | |||
| str_fw += 'final_performance: %s\n' % final_performance | |||
| str_fw += 'final_confidence: %s\n' % final_confidence | |||
| train_performance = [ | |||
| average_train_scores[value][best_params_index[1][idx]] | |||
| for idx, value in enumerate(best_params_index[0]) | |||
| ] | |||
| train_std = [ | |||
| std_train_scores[value][best_params_index[1][idx]] | |||
| for idx, value in enumerate(best_params_index[0]) | |||
| ] | |||
| print('train_performance: %s' % train_performance) | |||
| print('train_std: ', train_std) | |||
| fresults.write('train_performance: %s\n' % train_performance) | |||
| fresults.write('train_std: %s\n\n' % train_std) | |||
| str_fw += 'train_performance: %s\n' % train_performance | |||
| str_fw += 'train_std: %s\n\n' % train_std | |||
| print() | |||
| tt_total = time.time() - tts # training time for all hyper-parameters | |||
| average_gram_matrix_time = np.mean(gram_matrix_time) | |||
| std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) | |||
| best_gram_matrix_time = [gram_matrix_time[i] for i in best_params_index[0]] | |||
| best_gram_matrix_time = [ | |||
| gram_matrix_time[i] for i in best_params_index[0] | |||
| ] | |||
| ave_bgmt = np.mean(best_gram_matrix_time) | |||
| std_bgmt = np.std(best_gram_matrix_time, ddof=1) | |||
| print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' | |||
| .format(average_gram_matrix_time, std_gram_matrix_time)) | |||
| print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(ave_bgmt, std_bgmt)) | |||
| fresults.write('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n' | |||
| .format(average_gram_matrix_time, std_gram_matrix_time)) | |||
| fresults.write('time to calculate best gram matrix: {:.2f}±{:.2f}s\n\n'.format(ave_bgmt, std_bgmt)) | |||
| print( | |||
| 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' | |||
| .format(average_gram_matrix_time, std_gram_matrix_time)) | |||
| print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( | |||
| ave_bgmt, std_bgmt)) | |||
| print( | |||
| 'total training time with all hyper-param choices: {:.2f}s'.format( | |||
| tt_total)) | |||
| str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) | |||
| str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) | |||
| str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) | |||
| # # save results to file | |||
| # np.savetxt(results_name_pre + 'average_train_scores.dt', | |||
| @@ -312,7 +302,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||
| # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) | |||
| # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) | |||
| # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) | |||
| # np.save(results_name_pre + 'best_params_index', best_params_index) | |||
| # np.save(results_name_pre + 'best_params_pre.dt', best_params_out) | |||
| # np.save(results_name_pre + 'best_params_in.dt', best_params_in) | |||
| @@ -322,7 +312,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||
| # np.save(results_name_pre + 'final_confidence.dt', final_confidence) | |||
| # np.save(results_name_pre + 'train_performance.dt', train_performance) | |||
| # np.save(results_name_pre + 'train_std.dt', train_std) | |||
| # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) | |||
| # np.save(results_name_pre + 'average_gram_matrix_time.dt', | |||
| # average_gram_matrix_time) | |||
| @@ -330,7 +320,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||
| # std_gram_matrix_time) | |||
| # np.save(results_name_pre + 'best_gram_matrix_time.dt', | |||
| # best_gram_matrix_time) | |||
| # print out as table. | |||
| from collections import OrderedDict | |||
| from tabulate import tabulate | |||
| @@ -343,20 +333,150 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||
| param_in['C'] = '{:.2e}'.format(param_in['C']) | |||
| table_dict['params'] = [{**param_out, **param_in} | |||
| for param_in in param_list for param_out in param_list_pre_revised] | |||
| table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out]) | |||
| for param_in in param_list for index_out, _ in enumerate(param_list_pre_revised)] | |||
| table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in]) | |||
| for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)] | |||
| table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in]) | |||
| for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)] | |||
| table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in]) | |||
| for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)] | |||
| keyorder = ['params', 'train_perf', 'valid_perf', | |||
| 'test_perf', 'gram_matrix_time'] | |||
| table_dict['gram_matrix_time'] = [ | |||
| '{:.2f}'.format(gram_matrix_time[index_out]) | |||
| for param_in in param_list | |||
| for index_out, _ in enumerate(param_list_pre_revised) | |||
| ] | |||
| table_dict['valid_perf'] = [ | |||
| '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], | |||
| std_val_scores[index_out][index_in]) | |||
| for index_in, _ in enumerate(param_list) | |||
| for index_out, _ in enumerate(param_list_pre_revised) | |||
| ] | |||
| table_dict['test_perf'] = [ | |||
| '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], | |||
| std_perf_scores[index_out][index_in]) | |||
| for index_in, _ in enumerate(param_list) | |||
| for index_out, _ in enumerate(param_list_pre_revised) | |||
| ] | |||
| table_dict['train_perf'] = [ | |||
| '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], | |||
| std_train_scores[index_out][index_in]) | |||
| for index_in, _ in enumerate(param_list) | |||
| for index_out, _ in enumerate(param_list_pre_revised) | |||
| ] | |||
| keyorder = [ | |||
| 'params', 'train_perf', 'valid_perf', 'test_perf', | |||
| 'gram_matrix_time' | |||
| ] | |||
| print() | |||
| tb_print = tabulate(OrderedDict(sorted(table_dict.items(), | |||
| key=lambda i: keyorder.index(i[0]))), headers='keys') | |||
| tb_print = tabulate( | |||
| OrderedDict( | |||
| sorted(table_dict.items(), | |||
| key=lambda i: keyorder.index(i[0]))), | |||
| headers='keys') | |||
| print(tb_print) | |||
| fresults.write('table of performance vs. hyper-params:\n\n%s\n\n' % tb_print) | |||
| str_fw += 'table of performance vs. hyper-params:\n\n%s\n\n' % tb_print | |||
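`tabulate` accepts a dict of columns when `headers='keys'`, and sorting the items by a fixed key order controls the column order of the printed table. A minimal sketch of the trick used above (toy values):

```python
from collections import OrderedDict
from tabulate import tabulate

table = {'test_perf': ['0.90'], 'params': ['n=2'], 'train_perf': ['0.99']}
keyorder = ['params', 'train_perf', 'test_perf']
print(tabulate(OrderedDict(sorted(table.items(),
                                  key=lambda i: keyorder.index(i[0]))),
               headers='keys'))
```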
| # open file to save all results for this dataset. | |||
| if not os.path.exists(results_dir): | |||
| os.makedirs(results_dir) | |||
| with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults: | |||
| fresults.write(str_fw) | |||
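Accumulating everything into `str_fw` and writing it in one go, as above, keeps the hot loops free of disk IO; a single write at the end replaces many small writes scattered through the run. The pattern reduces to this sketch (hypothetical paths and content):

```python
import os

str_out = ''                              # build the whole report in memory
str_out += '\nII. Performance.\n\n'
str_out += 'best_val_perf: 0.42\n'

results_dir = 'results'                   # hypothetical output directory
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
with open(os.path.join(results_dir, 'demo.txt'), 'w') as f:
    f.write(str_out)                      # one disk write at the very end
```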
| def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level | |||
| # Arrays to store scores | |||
| train_pref = np.zeros((len(param_list_pre_revised), | |||
| len(param_list))) | |||
| val_pref = np.zeros((len(param_list_pre_revised), | |||
| len(param_list))) | |||
| test_pref = np.zeros((len(param_list_pre_revised), | |||
| len(param_list))) | |||
| # loop for each outer param tuple | |||
| for index_out, params_out in enumerate(param_list_pre_revised): | |||
| # split gram matrix and y to app and test sets. | |||
| X_app, X_test, y_app, y_test = train_test_split( | |||
| gram_matrices[index_out], y, test_size=0.1) | |||
| split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]  # note: assumes target values in y are unique | |||
| # split_index_test = [y.index(y_i) for y_i in y_test if y_i in y] | |||
| X_app = X_app[:, split_index_app] | |||
| X_test = X_test[:, split_index_app] | |||
| y_app = np.array(y_app) | |||
| y_test = np.array(y_test) | |||
| # loop for each inner param tuple | |||
| for index_in, params_in in enumerate(param_list): | |||
| inner_cv = KFold( | |||
| n_splits=10, shuffle=True, random_state=trial) | |||
| current_train_perf = [] | |||
| current_valid_perf = [] | |||
| current_test_perf = [] | |||
| # For regression use the Kernel Ridge method | |||
| try: | |||
| if model_type == 'regression': | |||
| KR = KernelRidge(kernel='precomputed', **params_in) | |||
| # loop for each split on validation set level | |||
| for train_index, valid_index in inner_cv.split( | |||
| X_app): | |||
| KR.fit(X_app[train_index, :][:, train_index], | |||
| y_app[train_index]) | |||
| # predict on the train, validation and test set | |||
| y_pred_train = KR.predict( | |||
| X_app[train_index, :][:, train_index]) | |||
| y_pred_valid = KR.predict( | |||
| X_app[valid_index, :][:, train_index]) | |||
| y_pred_test = KR.predict( | |||
| X_test[:, train_index]) | |||
| # root mean squared errors | |||
| current_train_perf.append( | |||
| np.sqrt( | |||
| mean_squared_error( | |||
| y_app[train_index], y_pred_train))) | |||
| current_valid_perf.append( | |||
| np.sqrt( | |||
| mean_squared_error( | |||
| y_app[valid_index], y_pred_valid))) | |||
| current_test_perf.append( | |||
| np.sqrt( | |||
| mean_squared_error( | |||
| y_test, y_pred_test))) | |||
| # For classification use SVM | |||
| else: | |||
| KR = SVC(kernel='precomputed', **params_in) | |||
| # loop for each split on validation set level | |||
| for train_index, valid_index in inner_cv.split( | |||
| X_app): | |||
| KR.fit(X_app[train_index, :][:, train_index], | |||
| y_app[train_index]) | |||
| # predict on the train, validation and test set | |||
| y_pred_train = KR.predict( | |||
| X_app[train_index, :][:, train_index]) | |||
| y_pred_valid = KR.predict( | |||
| X_app[valid_index, :][:, train_index]) | |||
| y_pred_test = KR.predict( | |||
| X_test[:, train_index]) | |||
| # classification accuracies | |||
| current_train_perf.append( | |||
| accuracy_score(y_app[train_index], | |||
| y_pred_train)) | |||
| current_valid_perf.append( | |||
| accuracy_score(y_app[valid_index], | |||
| y_pred_valid)) | |||
| current_test_perf.append( | |||
| accuracy_score(y_test, y_pred_test)) | |||
| except ValueError: | |||
| print(sys.exc_info()[0]) | |||
| print(params_out, params_in) | |||
| # average performance on inner splits | |||
| train_pref[index_out][index_in] = np.mean( | |||
| current_train_perf) | |||
| val_pref[index_out][index_in] = np.mean( | |||
| current_valid_perf) | |||
| test_pref[index_out][index_in] = np.mean( | |||
| current_test_perf) | |||
| return train_pref, val_pref, test_pref | |||
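Seeding the inner `KFold` with the trial index means every (outer, inner) parameter pair inside one trial is scored on identical folds, which is what makes their validation scores comparable. A quick check of that property on toy data:

```python
import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)
a = list(KFold(n_splits=5, shuffle=True, random_state=3).split(X))
b = list(KFold(n_splits=5, shuffle=True, random_state=3).split(X))
assert all(np.array_equal(s[0], t[0]) and np.array_equal(s[1], t[1])
           for s, t in zip(a, b))   # same seed, same folds
```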
| @@ -61,10 +61,11 @@ def floydTransformation(G, edge_weight=None): | |||
| spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight) | |||
| S = nx.Graph() | |||
| S.add_nodes_from(G.nodes(data=True)) | |||
| ns = list(G.nodes()) | |||
| for i in range(0, G.number_of_nodes()): | |||
| for j in range(i + 1, G.number_of_nodes()): | |||
| if spMatrix[i, j] != np.inf: | |||
| S.add_edge(i, j, cost=spMatrix[i, j]) | |||
| S.add_edge(ns[i], ns[j], cost=spMatrix[i, j]) | |||
| return S | |||
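The `ns[i]`, `ns[j]` indirection matters whenever node labels are not 0..n-1: `nx.floyd_warshall_numpy` orders rows and columns by `list(G.nodes())`, so adding edges by raw matrix index would silently create new integer nodes instead of connecting the originals. A small illustration:

```python
import networkx as nx

G = nx.Graph()
G.add_edge('a', 'b')
G.add_edge('b', 'c')
sp = nx.floyd_warshall_numpy(G)      # rows/cols follow list(G.nodes()) order
ns = list(G.nodes())
# S.add_edge(0, 1, ...) would create brand-new integer nodes 0 and 1;
# S.add_edge(ns[0], ns[1], ...) connects the original 'a' and 'b'.
print(ns, sp[0, 2])                  # ['a', 'b', 'c'] 2.0
```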