| @@ -12,7 +12,7 @@ import matplotlib.pyplot as plt | |||||
| from numpy.linalg import eig | from numpy.linalg import eig | ||||
| # read gram matrices from file. | # read gram matrices from file. | ||||
| results_dir = 'results/structuralspkernel/' | |||||
| results_dir = 'results/untilhpathkernel/myria' | |||||
| ds_name = 'Letter-med' | ds_name = 'Letter-med' | ||||
| gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz') | gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz') | ||||
| #print('gm time: ', gmfile['gmtime']) | #print('gm time: ', gmfile['gmtime']) | ||||
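A minimal, hypothetical sketch of how the .gm.npz archive read here can be inspected; only the 'gmtime' key is confirmed by the commented print above, and the 'gms' key (plus the eigenvalue check, one plausible use of the eig import) is an assumption.

import numpy as np
from numpy.linalg import eig

results_dir = 'results/untilhpathkernel/myria'
gmfile = np.load(results_dir + '/Letter-med.gm.npz')
print('arrays stored in the archive:', gmfile.files)   # discover the stored keys
if 'gms' in gmfile.files:                               # 'gms' is an assumed key name
    gram = gmfile['gms'][0]
    eigvals, _ = eig(gram)
    # a valid Gram (kernel) matrix should have no significantly negative eigenvalues
    print('smallest eigenvalue:', np.real(eigvals).min())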
| @@ -6,94 +6,116 @@ | |||||
| "metadata": { | "metadata": { | ||||
| "scrolled": false | "scrolled": false | ||||
| }, | }, | ||||
| "outputs": [], | |||||
| "outputs": [ | |||||
| { | |||||
| "name": "stdout", | |||||
| "output_type": "stream", | |||||
| "text": [ | |||||
| "\n", | |||||
| "MAO\n", | |||||
| "\n", | |||||
| "--- This is a classification problem ---\n", | |||||
| "\n", | |||||
| "\n", | |||||
| "1. Loading dataset from file...\n", | |||||
| "\n", | |||||
| "2. Calculating gram matrices. This could take a while...\n", | |||||
| "\n", | |||||
| " None edge weight specified. Set all weight to 1.\n", | |||||
| "\n", | |||||
| "getting sp graphs: 68it [00:00, 692.11it/s]\n", | |||||
| "calculating kernels: 2346it [00:05, 399.28it/s]\n", | |||||
| "\n", | |||||
| " --- shortest path kernel matrix of size 68 built in 6.345669507980347 seconds ---\n", | |||||
| "\n", | |||||
| "the gram matrix with parameters {'node_kernels': {'symb': <function deltakernel at 0x7fe240afd620>, 'nsymb': <function gaussiankernel at 0x7fe240afd9d8>, 'mix': functools.partial(<function kernelproduct at 0x7fe240aaf0d0>, <function deltakernel at 0x7fe240afd620>, <function gaussiankernel at 0x7fe240afd9d8>)}, 'n_jobs': 8} is: \n", | |||||
| "\n", | |||||
| "1 gram matrices are calculated, 0 of which are ignored.\n", | |||||
| "\n", | |||||
| "3. Fitting and predicting using nested cross validation. This could really take a while...\n", | |||||
| "cross validation: 7it [00:09, 4.67s/it]" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "source": [ | "source": [ | ||||
| "%load_ext line_profiler\n", | |||||
| "%matplotlib inline\n", | |||||
| "import functools\n", | "import functools\n", | ||||
| "from libs import *\n", | "from libs import *\n", | ||||
| "import multiprocessing\n", | "import multiprocessing\n", | ||||
| "from sklearn.metrics.pairwise import rbf_kernel\n", | |||||
| "\n", | |||||
| "from pygraph.kernels.spKernel import spkernel, spkernel_do\n", | |||||
| "from pygraph.utils.kernels import deltakernel, kernelsum\n", | |||||
| "from pygraph.utils.model_selection_precomputed import trial_do\n", | |||||
| "\n", | |||||
| "dslist = [ \n", | |||||
| " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb\n", | |||||
| "# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', \n", | |||||
| "# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb\n", | |||||
| "# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb\n", | |||||
| "# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled\n", | |||||
| "# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||||
| "# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||||
| " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||||
| " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| " {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb\n", | |||||
| " {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||||
| " 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||||
| "\n", | "\n", | ||||
| "# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||||
| "# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| "# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| "# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n", | |||||
| "# \n", | |||||
| "# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| "# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| "# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n", | |||||
| "# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n", | |||||
| "# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n", | |||||
| "\n", | |||||
| "# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| "# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| "# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n", | |||||
| "# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n", | |||||
| "# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||||
| "# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n", | |||||
| "# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||||
| "# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n", | |||||
| "# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n", | |||||
| " \n", | |||||
| "# # not working below\n", | |||||
| "# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n", | |||||
| "# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n", | |||||
| "# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n", | |||||
| "# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n", | |||||
| "from pygraph.kernels.spKernel import spkernel\n", | |||||
| "from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct\n", | |||||
| "#from pygraph.utils.model_selection_precomputed import trial_do\n", | |||||
| "\n", | |||||
| "dslist = [\n", | |||||
| "# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||||
| "# 'task': 'regression'}, # node symb\n", | |||||
| "# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||||
| "# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n", | |||||
| "# # contains single node graph, node symb\n", | |||||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n", | |||||
| "# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n", | |||||
| "# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||||
| "# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||||
| "# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||||
| "# # node nsymb\n", | |||||
| "# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||||
| "# # node symb/nsymb\n", | |||||
| "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||||
| " # node/edge symb\n", | |||||
| "# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||||
| "# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||||
| "\n", | |||||
| " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||||
| " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| " # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| " # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n", | |||||
| " #\n", | |||||
| " # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| " # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| " # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n", | |||||
| " # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n", | |||||
| " # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n", | |||||
| "\n", | |||||
| " # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| " # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n", | |||||
| " # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n", | |||||
| " # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n", | |||||
| " # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||||
| " # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n", | |||||
| " # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||||
| " # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n", | |||||
| " # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n", | |||||
| "\n", | |||||
| " # # not working below\n", | |||||
| " # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n", | |||||
| " # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n", | |||||
| " # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n", | |||||
| " # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n", | |||||
| "]\n", | "]\n", | ||||
| "estimator = spkernel\n", | "estimator = spkernel\n", | ||||
| "mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)\n", | |||||
| "param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}\n", | |||||
| "param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)}, \n", | |||||
| " {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]\n", | |||||
| "mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)\n", | |||||
| "param_grid_precomputed = {'node_kernels': [\n", | |||||
| " {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}\n", | |||||
| "param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},\n", | |||||
| " {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n", | |||||
| "\n", | "\n", | ||||
| "for ds in dslist:\n", | "for ds in dslist:\n", | ||||
| " print()\n", | " print()\n", | ||||
| " print(ds['name'])\n", | " print(ds['name'])\n", | ||||
| " model_selection_for_precomputed_kernel(\n", | " model_selection_for_precomputed_kernel(\n", | ||||
| " ds['dataset'], \n", | |||||
| " estimator, \n", | |||||
| " param_grid_precomputed, \n", | |||||
| " (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \n", | |||||
| " (ds['task'] if 'task' in ds else 'classification'), \n", | |||||
| " ds['dataset'],\n", | |||||
| " estimator,\n", | |||||
| " param_grid_precomputed,\n", | |||||
| " (param_grid[1] if ('task' in ds and ds['task']\n", | |||||
| " == 'regression') else param_grid[0]),\n", | |||||
| " (ds['task'] if 'task' in ds else 'classification'),\n", | |||||
| " NUM_TRIALS=30,\n", | " NUM_TRIALS=30,\n", | ||||
| " datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n", | " datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n", | ||||
| " extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n", | " extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n", | ||||
| " ds_name=ds['name'],\n", | " ds_name=ds['name'],\n", | ||||
| " n_jobs=multiprocessing.cpu_count())\n", | |||||
| " \n", | |||||
| "# %lprun -f trial_do -f spkernel -f spkernel_do -f model_selection_for_precomputed_kernel \\\n", | |||||
| "# model_selection_for_precomputed_kernel( \\\n", | |||||
| "# ds['dataset'], \\\n", | |||||
| "# estimator, \\\n", | |||||
| "# param_grid_precomputed, \\\n", | |||||
| "# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \\\n", | |||||
| "# (ds['task'] if 'task' in ds else 'classification'), \\\n", | |||||
| "# NUM_TRIALS=30, \\\n", | |||||
| "# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \\\n", | |||||
| "# extra_params=(ds['extra_params'] if 'extra_params' in ds else None), \\\n", | |||||
| "# ds_name=ds['name'], \\\n", | |||||
| "# n_jobs=multiprocessing.cpu_count()) \n", | |||||
| " print()" | |||||
| " n_jobs=multiprocessing.cpu_count(),\n", | |||||
| " read_gm_from_file=False)\n", | |||||
| " print()\n" | |||||
| ] | ] | ||||
| }, | }, | ||||
| { | { | ||||
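The node_kernels dictionary configured in the cell above supplies one kernel per label type: a Dirac (delta) kernel for symbolic labels, a Gaussian kernel for numeric attributes, and their product for graphs carrying both. A toy, self-contained sketch of what those three entries compute; the real implementations in pygraph.utils.kernels may differ in signature and details.

import functools
import numpy as np

def deltakernel(x, y):
    # Dirac kernel on symbolic labels: 1 if the labels match, else 0.
    return 1 if x == y else 0

def gaussiankernel(x, y, gamma=1.0):
    # RBF kernel on numeric (non-symbolic) attribute vectors.
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    return np.exp(-gamma * np.sum((x - y) ** 2))

def kernelproduct(k1, k2, d1, d2, a1, a2):
    # 'mix' entry: multiply the symbolic and non-symbolic kernels.
    return k1(d1, d2) * k2(a1, a2)

mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
print(node_kernels['mix']('C', 'C', [0.1, 0.2], [0.1, 0.3]))  # 1 * exp(-0.01)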
| @@ -713,8 +735,8 @@ | |||||
| ], | ], | ||||
| "metadata": { | "metadata": { | ||||
| "kernelspec": { | "kernelspec": { | ||||
| "display_name": "Python 3 (Spyder)", | |||||
| "language": "python3", | |||||
| "display_name": "Python 3", | |||||
| "language": "python", | |||||
| "name": "python3" | "name": "python3" | ||||
| }, | }, | ||||
| "language_info": { | "language_info": { | ||||
| @@ -727,7 +749,7 @@ | |||||
| "name": "python", | "name": "python", | ||||
| "nbconvert_exporter": "python", | "nbconvert_exporter": "python", | ||||
| "pygments_lexer": "ipython3", | "pygments_lexer": "ipython3", | ||||
| "version": "3.5.2" | |||||
| "version": "3.6.6" | |||||
| } | } | ||||
| }, | }, | ||||
| "nbformat": 4, | "nbformat": 4, | ||||
| @@ -7,21 +7,21 @@ from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct | |||||
| #from pygraph.utils.model_selection_precomputed import trial_do | #from pygraph.utils.model_selection_precomputed import trial_do | ||||
| dslist = [ | dslist = [ | ||||
| # {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||||
| # 'task': 'regression'}, # node symb | |||||
| # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||||
| # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||||
| # # contains single node graph, node symb | |||||
| # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||||
| # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||||
| # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||||
| # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||||
| {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||||
| 'task': 'regression'}, # node symb | |||||
| {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||||
| # contains single node graph, node symb | |||||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||||
| {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | ||||
| # node nsymb | # node nsymb | ||||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | ||||
| # node symb/nsymb | # node symb/nsymb | ||||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
| # # node/edge symb | |||||
| # node/edge symb | |||||
| # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | ||||
| # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | ||||
| @@ -56,7 +56,7 @@ estimator = spkernel | |||||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | ||||
| param_grid_precomputed = {'node_kernels': [ | param_grid_precomputed = {'node_kernels': [ | ||||
| {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]} | {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]} | ||||
| param_grid = [{'C': np.logspace(-10, 3, num=27, base=10)}, | |||||
| param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | {'alpha': np.logspace(-10, 10, num=41, base=10)}] | ||||
| for ds in dslist: | for ds in dslist: | ||||
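The grid for C is widened here from 27 points over [1e-10, 1e3] to the same 41-point grid already used for alpha, i.e. 41 values evenly spaced on a log scale:

import numpy as np
C_grid = np.logspace(-10, 10, num=41, base=10)
print(C_grid[0], C_grid[20], C_grid[-1])  # 1e-10, 1.0, 1e10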
| @@ -23,10 +23,10 @@ dslist = [ | |||||
| # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | ||||
| # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | ||||
| # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | ||||
| {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||||
| # node nsymb | |||||
| # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||||
| # # node symb/nsymb | |||||
| # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||||
| # # node nsymb | |||||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||||
| # node symb/nsymb | |||||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
| # # node/edge symb | # # node/edge symb | ||||
| # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | ||||
| @@ -39,8 +39,8 @@ dslist = [ | |||||
| # | # | ||||
| # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | ||||
| # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | ||||
| # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values | |||||
| # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values | |||||
| # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values | |||||
| # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values | |||||
| # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | ||||
| # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | ||||
| @@ -53,8 +53,8 @@ dslist = [ | |||||
| # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | ||||
| # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | ||||
| # # not working below | |||||
| # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||||
| # # not working below | |||||
| # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||||
| # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | ||||
| # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | ||||
| # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | ||||
| @@ -62,7 +62,7 @@ dslist = [ | |||||
| ] | ] | ||||
| estimator = untilhpathkernel | estimator = untilhpathkernel | ||||
| mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel) | mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel) | ||||
| param_grid_precomputed = {'depth': np.linspace(7, 10, 10), | |||||
| param_grid_precomputed = {'depth': np.linspace(1, 10, 10), | |||||
| 'k_func': ['tanimoto', 'MinMax']} | 'k_func': ['tanimoto', 'MinMax']} | ||||
| param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | ||||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | {'alpha': np.logspace(-10, 10, num=41, base=10)}] | ||||
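The new depth grid covers every path length from 1 to 10 rather than only 7 to 10; np.linspace(1, 10, 10) yields exactly the integers 1 through 10 (as floats), crossed with the two feature-map kernels:

import numpy as np
depths = np.linspace(1, 10, 10)        # [1., 2., ..., 10.]
k_funcs = ['tanimoto', 'MinMax']
print(len(depths) * len(k_funcs), 'parameter combinations')  # 20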
| @@ -1,77 +0,0 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Fri Sep 28 16:37:29 2018 | |||||
| @author: ljia | |||||
| """ | |||||
| import functools | |||||
| from libs import * | |||||
| import multiprocessing | |||||
| from sklearn.metrics.pairwise import rbf_kernel | |||||
| from pygraph.kernels.structuralspKernel import structuralspkernel | |||||
| from pygraph.utils.kernels import deltakernel, kernelproduct | |||||
| dslist = [ | |||||
| # {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||||
| # 'task': 'regression'}, # node symb | |||||
| # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||||
| # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb | |||||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||||
| # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||||
| # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||||
| # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||||
| # | |||||
| # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||||
| # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||||
| # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||||
| # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||||
| # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||||
| # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||||
| # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||||
| # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||||
| # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', | |||||
| # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||||
| # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', | |||||
| # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||||
| # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||||
| # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||||
| # # not working below | |||||
| # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||||
| # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||||
| # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||||
| # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||||
| ] | |||||
| estimator = structuralspkernel | |||||
| mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel) | |||||
| param_grid_precomputed = {'node_kernels': | |||||
| [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}], | |||||
| 'edge_kernels': | |||||
| [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]} | |||||
| param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||||
| for ds in dslist: | |||||
| print() | |||||
| print(ds['name']) | |||||
| model_selection_for_precomputed_kernel( | |||||
| ds['dataset'], | |||||
| estimator, | |||||
| param_grid_precomputed, | |||||
| (param_grid[1] if ('task' in ds and ds['task'] | |||||
| == 'regression') else param_grid[0]), | |||||
| (ds['task'] if 'task' in ds else 'classification'), | |||||
| NUM_TRIALS=30, | |||||
| datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||||
| extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||||
| ds_name=ds['name'], | |||||
| n_jobs=multiprocessing.cpu_count(), | |||||
| read_gm_from_file=False) | |||||
| print() | |||||
| @@ -85,21 +85,20 @@ def commonwalkkernel(*args, | |||||
| # ---- use pool.imap_unordered to parallel and track progress. ---- | # ---- use pool.imap_unordered to parallel and track progress. ---- | ||||
| pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
| itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| itr = zip(combinations_with_replacement(Gn, 2), | |||||
| combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
| len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | ||||
| if len_itr < 1000 * n_jobs: | if len_itr < 1000 * n_jobs: | ||||
| chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
| else: | else: | ||||
| chunksize = 100 | |||||
| chunksize = 1000 | |||||
| # direct product graph method - exponential | # direct product graph method - exponential | ||||
| if compute_method == 'exp': | if compute_method == 'exp': | ||||
| do_partial = partial(_commonwalkkernel_exp, Gn, node_label, edge_label, | |||||
| weight) | |||||
| do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) | |||||
| # direct product graph method - geometric | # direct product graph method - geometric | ||||
| elif compute_method == 'geo': | elif compute_method == 'geo': | ||||
| do_partial = partial(_commonwalkkernel_geo, Gn, node_label, edge_label, | |||||
| weight) | |||||
| do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) | |||||
| for i, j, kernel in tqdm( | for i, j, kernel in tqdm( | ||||
| pool.imap_unordered(do_partial, itr, chunksize), | pool.imap_unordered(do_partial, itr, chunksize), | ||||
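The change above replaces the iterator of index pairs with one that yields each graph pair together with its index pair, so a small wrapper can unpack both and each task carries only the two graphs it needs instead of binding the whole list Gn into the partial; the same pattern is applied to the other kernels below. A minimal, self-contained sketch of the pattern, with pairwise_kernel standing in for _commonwalkkernel_exp / _commonwalkkernel_geo and plain lists standing in for graphs:

from itertools import combinations_with_replacement
from functools import partial
from multiprocessing import Pool

def pairwise_kernel(g1, g2, weight):
    # placeholder for _commonwalkkernel_exp / _commonwalkkernel_geo / spkernel_do, ...
    return weight * len(g1) * len(g2)

def wrapper(weight, itr_item):
    (g1, g2), (i, j) = itr_item            # graph pair and its index pair
    return i, j, pairwise_kernel(g1, g2, weight)

def compute_matrix(Gn, weight, n_jobs=2):
    itr = zip(combinations_with_replacement(Gn, 2),
              combinations_with_replacement(range(len(Gn)), 2))
    len_itr = len(Gn) * (len(Gn) + 1) // 2
    chunksize = len_itr // n_jobs + 1 if len_itr < 1000 * n_jobs else 1000
    K = [[0.0] * len(Gn) for _ in range(len(Gn))]
    with Pool(n_jobs) as pool:
        for i, j, k in pool.imap_unordered(partial(wrapper, weight), itr, chunksize):
            K[i][j] = K[j][i] = k          # results arrive in arbitrary order
    return K

if __name__ == '__main__':                 # guard required by multiprocessing
    print(compute_matrix([[0, 1], [0, 1, 2], [0]], weight=1.0))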
| @@ -153,7 +152,7 @@ def commonwalkkernel(*args, | |||||
| return Kmatrix, run_time | return Kmatrix, run_time | ||||
| def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij): | |||||
| def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta): | |||||
| """Calculate walk graph kernels up to n between 2 graphs using exponential | """Calculate walk graph kernels up to n between 2 graphs using exponential | ||||
| series. | series. | ||||
| @@ -175,10 +174,6 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij): | |||||
| kernel : float | kernel : float | ||||
| The common walk Kernel between 2 graphs. | The common walk Kernel between 2 graphs. | ||||
| """ | """ | ||||
| iglobal = ij[0] | |||||
| jglobal = ij[1] | |||||
| g1 = Gn[iglobal] | |||||
| g2 = Gn[jglobal] | |||||
| # get tensor product / direct product | # get tensor product / direct product | ||||
| gp = direct_product(g1, g2, node_label, edge_label) | gp = direct_product(g1, g2, node_label, edge_label) | ||||
| @@ -219,10 +214,18 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij): | |||||
| # print(np.exp(weight * A)) | # print(np.exp(weight * A)) | ||||
| # print('-------') | # print('-------') | ||||
| return iglobal, jglobal, exp_D.sum() | |||||
| return exp_D.sum() | |||||
| def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij): | |||||
| def wrapper_cw_exp(node_label, edge_label, beta, itr_item): | |||||
| g1 = itr_item[0][0] | |||||
| g2 = itr_item[0][1] | |||||
| i = itr_item[1][0] | |||||
| j = itr_item[1][1] | |||||
| return i, j, _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta) | |||||
| def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma): | |||||
| """Calculate common walk graph kernels up to n between 2 graphs using | """Calculate common walk graph kernels up to n between 2 graphs using | ||||
| geometric series. | geometric series. | ||||
| @@ -244,19 +247,22 @@ def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij): | |||||
| kernel : float | kernel : float | ||||
| The common walk Kernel between 2 graphs. | The common walk Kernel between 2 graphs. | ||||
| """ | """ | ||||
| iglobal = ij[0] | |||||
| jglobal = ij[1] | |||||
| g1 = Gn[iglobal] | |||||
| g2 = Gn[jglobal] | |||||
| # get tensor product / direct product | # get tensor product / direct product | ||||
| gp = direct_product(g1, g2, node_label, edge_label) | gp = direct_product(g1, g2, node_label, edge_label) | ||||
| A = nx.adjacency_matrix(gp).todense() | A = nx.adjacency_matrix(gp).todense() | ||||
| mat = np.identity(len(A)) - gamma * A | mat = np.identity(len(A)) - gamma * A | ||||
| try: | try: | ||||
| return iglobal, jglobal, mat.I.sum() | |||||
| return mat.I.sum() | |||||
| except np.linalg.LinAlgError: | except np.linalg.LinAlgError: | ||||
| return iglobal, jglobal, np.nan | |||||
| return np.nan | |||||
| def wrapper_cw_geo(node_label, edge_label, gamma, itr_item): | |||||
| g1 = itr_item[0][0] | |||||
| g2 = itr_item[0][1] | |||||
| i = itr_item[1][0] | |||||
| j = itr_item[1][1] | |||||
| return i, j, _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma) | |||||
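The geometric variant above sums the entries of (I - gamma*A)^-1 on the direct-product graph, which equals the sum of the series sum_k gamma^k * A^k whenever gamma is small enough for it to converge; summing all entries then counts common walks of every length, weighted by gamma^length. A small numerical check:

import numpy as np

A = np.array([[0, 1, 0],
              [1, 0, 1],
              [0, 1, 0]], dtype=float)     # toy direct-product adjacency matrix
gamma = 0.2
closed_form = np.linalg.inv(np.identity(3) - gamma * A).sum()
series = sum((gamma ** k) * np.linalg.matrix_power(A, k) for k in range(50)).sum()
print(closed_form, series)                 # the two agree up to truncation error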
| def _commonwalkkernel_brute(walks1, | def _commonwalkkernel_brute(walks1, | ||||
| @@ -8,7 +8,6 @@ import sys | |||||
| import time | import time | ||||
| from itertools import combinations_with_replacement, product | from itertools import combinations_with_replacement, product | ||||
| from functools import partial | from functools import partial | ||||
| from joblib import Parallel, delayed | |||||
| from multiprocessing import Pool | from multiprocessing import Pool | ||||
| from tqdm import tqdm | from tqdm import tqdm | ||||
| @@ -89,7 +88,8 @@ def spkernel(*args, | |||||
| pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
| # get shortest path graphs of Gn | # get shortest path graphs of Gn | ||||
| getsp_partial = partial(wrap_getSPGraph, Gn, weight) | |||||
| getsp_partial = partial(wrapper_getSPGraph, weight) | |||||
| itr = zip(Gn, range(0, len(Gn))) | |||||
| if len(Gn) < 1000 * n_jobs: | if len(Gn) < 1000 * n_jobs: | ||||
| # # use default chunksize as pool.map when iterable is less than 100 | # # use default chunksize as pool.map when iterable is less than 100 | ||||
| # chunksize, extra = divmod(len(Gn), n_jobs * 4) | # chunksize, extra = divmod(len(Gn), n_jobs * 4) | ||||
| @@ -98,9 +98,8 @@ def spkernel(*args, | |||||
| chunksize = int(len(Gn) / n_jobs) + 1 | chunksize = int(len(Gn) / n_jobs) + 1 | ||||
| else: | else: | ||||
| chunksize = 1000 | chunksize = 1000 | ||||
| # chunksize = 300 # int(len(list(itr)) / n_jobs) | |||||
| for i, g in tqdm( | for i, g in tqdm( | ||||
| pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize), | |||||
| pool.imap_unordered(getsp_partial, itr, chunksize), | |||||
| desc='getting sp graphs', file=sys.stdout): | desc='getting sp graphs', file=sys.stdout): | ||||
| Gn[i] = g | Gn[i] = g | ||||
| pool.close() | pool.close() | ||||
| @@ -144,8 +143,9 @@ def spkernel(*args, | |||||
| # ---- use pool.imap_unordered to parallel and track progress. ---- | # ---- use pool.imap_unordered to parallel and track progress. ---- | ||||
| pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
| do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) | |||||
| itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels) | |||||
| itr = zip(combinations_with_replacement(Gn, 2), | |||||
| combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
| len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | ||||
| if len_itr < 1000 * n_jobs: | if len_itr < 1000 * n_jobs: | ||||
| chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
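Concretely, for the MAO run shown in the notebook output earlier (68 graphs, n_jobs=8) the chunksize heuristic gives:

n_graphs, n_jobs = 68, 8
len_itr = n_graphs * (n_graphs + 1) // 2   # 2346 graph pairs, as in the tqdm bar above
if len_itr < 1000 * n_jobs:                # 2346 < 8000
    chunksize = len_itr // n_jobs + 1      # 294 pairs handed to a worker at a time
else:
    chunksize = 1000
print(len_itr, chunksize)                  # 2346 294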
| @@ -200,15 +200,10 @@ def spkernel(*args, | |||||
| return Kmatrix, run_time, idx | return Kmatrix, run_time, idx | ||||
| def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||||
| i = ij[0] | |||||
| j = ij[1] | |||||
| g1 = Gn[i] | |||||
| g2 = Gn[j] | |||||
| def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels): | |||||
| kernel = 0 | kernel = 0 | ||||
| # try: | |||||
| # compute shortest path matrices first, method borrowed from FCSP. | # compute shortest path matrices first, method borrowed from FCSP. | ||||
| if ds_attrs['node_labeled']: | if ds_attrs['node_labeled']: | ||||
| # node symb and non-symb labeled | # node symb and non-symb labeled | ||||
| @@ -243,7 +238,7 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||||
| g1.edges(data=True), g2.edges(data=True)): | g1.edges(data=True), g2.edges(data=True)): | ||||
| if e1[2]['cost'] == e2[2]['cost']: | if e1[2]['cost'] == e2[2]['cost']: | ||||
| kernel += 1 | kernel += 1 | ||||
| return i, j, kernel | |||||
| return kernel | |||||
| # compute graph kernels | # compute graph kernels | ||||
| if ds_attrs['is_directed']: | if ds_attrs['is_directed']: | ||||
| @@ -293,12 +288,20 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||||
| # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | ||||
| # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | ||||
| # kernel += kn1 + kn2 | # kernel += kn1 + kn2 | ||||
| # except KeyError: # missing labels or attributes | |||||
| # pass | |||||
| return i, j, kernel | |||||
| return kernel | |||||
| def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item): | |||||
| g1 = itr_item[0][0] | |||||
| g2 = itr_item[0][1] | |||||
| i = itr_item[1][0] | |||||
| j = itr_item[1][1] | |||||
| return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels) | |||||
| def wrap_getSPGraph(Gn, weight, i): | |||||
| return i, getSPGraph(Gn[i], edge_weight=weight) | |||||
| # return i, nx.floyd_warshall_numpy(Gn[i], weight=weight) | |||||
| def wrapper_getSPGraph(weight, itr_item): | |||||
| g = itr_item[0] | |||||
| i = itr_item[1] | |||||
| return i, getSPGraph(g, edge_weight=weight) | |||||
| # return i, nx.floyd_warshall_numpy(g, weight=weight) | |||||
| @@ -12,7 +12,6 @@ import sys | |||||
| import time | import time | ||||
| from itertools import combinations, combinations_with_replacement, product | from itertools import combinations, combinations_with_replacement, product | ||||
| from functools import partial | from functools import partial | ||||
| from joblib import Parallel, delayed | |||||
| from multiprocessing import Pool | from multiprocessing import Pool | ||||
| from tqdm import tqdm | from tqdm import tqdm | ||||
| @@ -71,7 +70,6 @@ def structuralspkernel(*args, | |||||
| """ | """ | ||||
| # pre-process | # pre-process | ||||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | Gn = args[0] if len(args) == 1 else [args[0], args[1]] | ||||
| weight = None | weight = None | ||||
| if edge_weight is None: | if edge_weight is None: | ||||
| print('\n None edge weight specified. Set all weight to 1.\n') | print('\n None edge weight specified. Set all weight to 1.\n') | ||||
| @@ -98,34 +96,61 @@ def structuralspkernel(*args, | |||||
| start_time = time.time() | start_time = time.time() | ||||
| # get shortest paths of each graph in Gn | # get shortest paths of each graph in Gn | ||||
| splist = [[] for _ in range(len(Gn))] | |||||
| splist = [None] * len(Gn) | |||||
| pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
| # get shortest path graphs of Gn | # get shortest path graphs of Gn | ||||
| getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed']) | |||||
| getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) | |||||
| itr = zip(Gn, range(0, len(Gn))) | |||||
| if len(Gn) < 1000 * n_jobs: | if len(Gn) < 1000 * n_jobs: | ||||
| chunksize = int(len(Gn) / n_jobs) + 1 | chunksize = int(len(Gn) / n_jobs) + 1 | ||||
| else: | else: | ||||
| chunksize = 1000 | chunksize = 1000 | ||||
| # chunksize = 300 # int(len(list(itr)) / n_jobs) | # chunksize = 300 # int(len(list(itr)) / n_jobs) | ||||
| for i, sp in tqdm( | for i, sp in tqdm( | ||||
| pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize), | |||||
| pool.imap_unordered(getsp_partial, itr, chunksize), | |||||
| desc='getting shortest paths', | desc='getting shortest paths', | ||||
| file=sys.stdout): | file=sys.stdout): | ||||
| splist[i] = sp | splist[i] = sp | ||||
| # time.sleep(10) | |||||
| pool.close() | pool.close() | ||||
| pool.join() | pool.join() | ||||
| # # ---- use pool.map to parallel ---- | |||||
| # result_sp = pool.map(getsp_partial, range(0, len(Gn))) | |||||
| # for i in result_sp: | |||||
| # Gn[i[0]] = i[1] | |||||
| # or | |||||
| # getsp_partial = partial(wrap_getSP, Gn, weight) | |||||
| # for i, g in tqdm( | |||||
| # pool.map(getsp_partial, range(0, len(Gn))), | |||||
| # desc='getting sp graphs', | |||||
| # file=sys.stdout): | |||||
| # Gn[i] = g | |||||
| # # get shortest paths of each graph in Gn | |||||
| # splist = [[] for _ in range(len(Gn))] | |||||
| # # get shortest path graphs of Gn | |||||
| # getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) | |||||
| # itr = zip(Gn, range(0, len(Gn))) | |||||
| # if len(Gn) < 1000 * n_jobs: | |||||
| # chunksize = int(len(Gn) / n_jobs) + 1 | |||||
| # else: | |||||
| # chunksize = 1000 | |||||
| # # chunksize = 300 # int(len(list(itr)) / n_jobs) | |||||
| # from contextlib import closing | |||||
| # with closing(Pool(n_jobs)) as pool: | |||||
| ## for i, sp in tqdm( | |||||
| # res = pool.imap_unordered(getsp_partial, itr, 10) | |||||
| ## desc='getting shortest paths', | |||||
| ## file=sys.stdout): | |||||
| ## splist[i] = sp | |||||
| ## time.sleep(10) | |||||
| # pool.close() | |||||
| # pool.join() | |||||
| # ss = 0 | |||||
| # ss += sys.getsizeof(splist) | |||||
| # for spss in splist: | |||||
| # ss += sys.getsizeof(spss) | |||||
| # for spp in spss: | |||||
| # ss += sys.getsizeof(spp) | |||||
| # time.sleep(20) | |||||
| # # ---- direct running, normally use single CPU core. ---- | |||||
| # splist = [] | |||||
| # for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout): | |||||
| # splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) | |||||
| # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) | # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) | ||||
| # sp_ml = [0] * len(Gn) # shortest path matrices | # sp_ml = [0] * len(Gn) # shortest path matrices | ||||
| @@ -149,9 +174,11 @@ def structuralspkernel(*args, | |||||
| # ---- use pool.imap_unordered to parallel and track progress. ---- | # ---- use pool.imap_unordered to parallel and track progress. ---- | ||||
| pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
| do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs, | |||||
| node_label, edge_label, node_kernels, edge_kernels) | |||||
| itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, | |||||
| node_kernels, edge_kernels) | |||||
| itr = zip(combinations_with_replacement(Gn, 2), | |||||
| combinations_with_replacement(splist, 2), | |||||
| combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
| len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | ||||
| if len_itr < 1000 * n_jobs: | if len_itr < 1000 * n_jobs: | ||||
| chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
| @@ -166,36 +193,36 @@ def structuralspkernel(*args, | |||||
| pool.close() | pool.close() | ||||
| pool.join() | pool.join() | ||||
| # # ---- use pool.map to parallel. ---- | |||||
| # # result_perf = pool.map(do_partial, itr) | |||||
| # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) | |||||
| # itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| # for i, j, kernel in tqdm( | |||||
| # pool.map(do_partial, itr), desc='calculating kernels', | |||||
| # file=sys.stdout): | |||||
| # Kmatrix[i][j] = kernel | |||||
| # Kmatrix[j][i] = kernel | |||||
| # pool.close() | |||||
| # pool.join() | |||||
| # # ---- use joblib.Parallel to parallel and track progress. ---- | |||||
| # result_perf = Parallel( | |||||
| # n_jobs=n_jobs, verbose=10)( | |||||
| # delayed(do_partial)(ij) | |||||
| # for ij in combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
| # result_perf = [ | |||||
| # do_partial(ij) | |||||
| # for ij in combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| # ] | |||||
| # for i in result_perf: | |||||
| # Kmatrix[i[0]][i[1]] = i[2] | |||||
| # Kmatrix[i[1]][i[0]] = i[2] | |||||
| # # ---- use pool.imap_unordered to parallel and track progress. ---- | |||||
| # do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, | |||||
| # node_kernels, edge_kernels) | |||||
| # itr = zip(combinations_with_replacement(Gn, 2), | |||||
| # combinations_with_replacement(splist, 2), | |||||
| # combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
| # len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||||
| # if len_itr < 1000 * n_jobs: | |||||
| # chunksize = int(len_itr / n_jobs) + 1 | |||||
| # else: | |||||
| # chunksize = 1000 | |||||
| # from contextlib import closing | |||||
| # with closing(Pool(n_jobs)) as pool: | |||||
| # for i, j, kernel in tqdm( | |||||
| # pool.imap_unordered(do_partial, itr, 1000), | |||||
| # desc='calculating kernels', | |||||
| # file=sys.stdout): | |||||
| # Kmatrix[i][j] = kernel | |||||
| # Kmatrix[j][i] = kernel | |||||
| # pool.close() | |||||
| # pool.join() | |||||
| # # ---- direct running, normally use single CPU core. ---- | # # ---- direct running, normally use single CPU core. ---- | ||||
| # itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| # itr = zip(combinations_with_replacement(Gn, 2), | |||||
| # combinations_with_replacement(splist, 2), | |||||
| # combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
| # for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout): | # for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout): | ||||
| # i, j, kernel = structuralspkernel_do(Gn, splist, ds_attrs, | |||||
| # node_label, edge_label, node_kernels, edge_kernels, gs) | |||||
| # i, j, kernel = wrapper_ssp_do(ds_attrs, node_label, edge_label, | |||||
| # node_kernels, edge_kernels, gs) | |||||
| # if(kernel > 1): | # if(kernel > 1): | ||||
| # print("error here ") | # print("error here ") | ||||
| # Kmatrix[i][j] = kernel | # Kmatrix[i][j] = kernel | ||||
| @@ -209,18 +236,11 @@ def structuralspkernel(*args, | |||||
| return Kmatrix, run_time | return Kmatrix, run_time | ||||
| def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label, | |||||
| node_kernels, edge_kernels, ij): | |||||
| iglobal = ij[0] | |||||
| jglobal = ij[1] | |||||
| g1 = Gn[iglobal] | |||||
| g2 = Gn[jglobal] | |||||
| spl1 = splist[iglobal] | |||||
| spl2 = splist[jglobal] | |||||
| def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label, | |||||
| node_kernels, edge_kernels): | |||||
| kernel = 0 | kernel = 0 | ||||
| #try: | |||||
| # First, compute shortest path matrices, method borrowed from FCSP. | # First, compute shortest path matrices, method borrowed from FCSP. | ||||
| if ds_attrs['node_labeled']: | if ds_attrs['node_labeled']: | ||||
| # node symb and non-symb labeled | # node symb and non-symb labeled | ||||
| @@ -369,11 +389,19 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label, | |||||
| # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | ||||
| # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | ||||
| # Kmatrix += kn1 + kn2 | # Kmatrix += kn1 + kn2 | ||||
| #except KeyError: # missing labels or attributes | |||||
| # print("toto") | |||||
| # pass | |||||
| return kernel | |||||
| return iglobal, jglobal, kernel | |||||
| def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, | |||||
| edge_kernels, itr_item): | |||||
| g1 = itr_item[0][0] | |||||
| g2 = itr_item[0][1] | |||||
| spl1 = itr_item[1][0] | |||||
| spl2 = itr_item[1][1] | |||||
| i = itr_item[2][0] | |||||
| j = itr_item[2][1] | |||||
| return i, j, structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, | |||||
| node_label, edge_label, node_kernels, edge_kernels) | |||||
| def get_shortest_paths(G, weight, directed): | def get_shortest_paths(G, weight, directed): | ||||
| @@ -397,17 +425,21 @@ def get_shortest_paths(G, weight, directed): | |||||
| for n1, n2 in combinations(G.nodes(), 2): | for n1, n2 in combinations(G.nodes(), 2): | ||||
| try: | try: | ||||
| spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) | spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) | ||||
| except nx.NetworkXNoPath: # nodes not connected | |||||
| # sp.append([]) | |||||
| pass | |||||
| else: | |||||
| sp += spltemp | sp += spltemp | ||||
| # each edge walk is counted twice, starting from both its extreme nodes. | # each edge walk is counted twice, starting from both its extreme nodes. | ||||
| if not directed: | if not directed: | ||||
| sp += [sptemp[::-1] for sptemp in spltemp] | sp += [sptemp[::-1] for sptemp in spltemp] | ||||
| except nx.NetworkXNoPath: # nodes not connected | |||||
| # sp.append([]) | |||||
| pass | |||||
| # add single nodes as length 0 paths. | # add single nodes as length 0 paths. | ||||
| sp += [[n] for n in G.nodes()] | sp += [[n] for n in G.nodes()] | ||||
| return sp | return sp | ||||
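The restructured loop now catches nx.NetworkXNoPath before any accumulation and collects paths only in the else branch (adding each path's reverse for undirected graphs, since an edge walk starts from either end). A condensed sketch of the resulting control flow:

from itertools import combinations
import networkx as nx

def get_shortest_paths(G, weight, directed):
    sp = []
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
        except nx.NetworkXNoPath:          # nodes not connected: skip this pair
            pass
        else:
            sp += spltemp
            if not directed:
                sp += [p[::-1] for p in spltemp]
    sp += [[n] for n in G.nodes()]         # single nodes count as length-0 paths
    return sp

print(len(get_shortest_paths(nx.path_graph(3), None, False)))  # 9: 3 paths, 3 reverses, 3 single nodes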
| def wrap_getSP(Gn, weight, directed, i): | |||||
| return i, get_shortest_paths(Gn[i], weight, directed) | |||||
| def wrapper_getSP(weight, directed, itr_item): | |||||
| g = itr_item[0] | |||||
| i = itr_item[1] | |||||
| return i, get_shortest_paths(g, weight, directed) | |||||
| @@ -13,7 +13,6 @@ from itertools import chain, combinations_with_replacement | |||||
| from functools import partial | from functools import partial | ||||
| from multiprocessing import Pool | from multiprocessing import Pool | ||||
| from tqdm import tqdm | from tqdm import tqdm | ||||
| import traceback | |||||
| import networkx as nx | import networkx as nx | ||||
| import numpy as np | import numpy as np | ||||
| @@ -77,15 +76,15 @@ def untilhpathkernel(*args, | |||||
| # but this may cost a lot of memory for large datasets. | # but this may cost a lot of memory for large datasets. | ||||
| pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
| all_paths = [[] for _ in range(len(Gn))] | all_paths = [[] for _ in range(len(Gn))] | ||||
| getps_partial = partial(wrap_find_all_paths_until_length, Gn, depth, | |||||
| getps_partial = partial(wrapper_find_all_paths_until_length, depth, | |||||
| ds_attrs, node_label, edge_label) | ds_attrs, node_label, edge_label) | ||||
| itr = zip(Gn, range(0, len(Gn))) | |||||
| if len(Gn) < 1000 * n_jobs: | if len(Gn) < 1000 * n_jobs: | ||||
| chunksize = int(len(Gn) / n_jobs) + 1 | chunksize = int(len(Gn) / n_jobs) + 1 | ||||
| else: | else: | ||||
| chunksize = 1000 | chunksize = 1000 | ||||
| # chunksize = 300 # int(len(list(itr)) / n_jobs) | |||||
| for i, ps in tqdm( | for i, ps in tqdm( | ||||
| pool.imap_unordered(getps_partial, range(0, len(Gn)), chunksize), | |||||
| pool.imap_unordered(getps_partial, itr, chunksize), | |||||
| desc='getting paths', file=sys.stdout): | desc='getting paths', file=sys.stdout): | ||||
| all_paths[i] = ps | all_paths[i] = ps | ||||
| pool.close() | pool.close() | ||||
| @@ -110,8 +109,9 @@ def untilhpathkernel(*args, | |||||
| pass | pass | ||||
| else: | else: | ||||
| pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
| do_partial = partial(_untilhpathkernel_do_naive, all_paths, k_func) | |||||
| itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| do_partial = partial(wrapper_uhpath_do_naive, k_func) | |||||
| itr = zip(combinations_with_replacement(all_paths, 2), | |||||
| combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
| len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | ||||
| if len_itr < 1000 * n_jobs: | if len_itr < 1000 * n_jobs: | ||||
| chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
| @@ -216,7 +216,7 @@ def _untilhpathkernel_do_gst(gst1, gst2, paths1, paths2, k_func): | |||||
| return kernel | return kernel | ||||
| def _untilhpathkernel_do_naive(paths_list, k_func, ij): | |||||
| def _untilhpathkernel_do_naive(paths1, paths2, k_func): | |||||
| """Calculate path graph kernels up to depth d between 2 graphs naively. | """Calculate path graph kernels up to depth d between 2 graphs naively. | ||||
| Parameters | Parameters | ||||
| @@ -235,10 +235,6 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij): | |||||
| kernel : float | kernel : float | ||||
| Path kernel up to h between 2 graphs. | Path kernel up to h between 2 graphs. | ||||
| """ | """ | ||||
| iglobal = ij[0] | |||||
| jglobal = ij[1] | |||||
| paths1 = paths_list[iglobal] | |||||
| paths2 = paths_list[jglobal] | |||||
| all_paths = list(set(paths1 + paths2)) | all_paths = list(set(paths1 + paths2)) | ||||
| if k_func == 'tanimoto': | if k_func == 'tanimoto': | ||||
| @@ -260,12 +256,18 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij): | |||||
| kernel = np.sum(np.minimum(vector1, vector2)) / \ | kernel = np.sum(np.minimum(vector1, vector2)) / \ | ||||
| np.sum(np.maximum(vector1, vector2)) | np.sum(np.maximum(vector1, vector2)) | ||||
| return iglobal, jglobal, kernel | |||||
| return kernel | |||||
| # @todo: (can be removed maybe) this method finds paths repetitively; it could be faster. | |||||
| def wrapper_uhpath_do_naive(k_func, itr_item): | |||||
| plist1 = itr_item[0][0] | |||||
| plist2 = itr_item[0][1] | |||||
| i = itr_item[1][0] | |||||
| j = itr_item[1][1] | |||||
| return i, j, _untilhpathkernel_do_naive(plist1, plist2, k_func) | |||||
| # @todo: (can be removed maybe) this method finds paths repetitively; it could be faster. | |||||
| def find_all_paths_until_length(G, | def find_all_paths_until_length(G, | ||||
| length, | length, | ||||
| ds_attrs, | ds_attrs, | ||||
| @@ -368,15 +370,12 @@ def find_all_paths_until_length(G, | |||||
| return [tuple([len(path)]) for path in all_paths] | return [tuple([len(path)]) for path in all_paths] | ||||
| def wrap_find_all_paths_until_length(Gn, length, ds_attrs, node_label, | |||||
| edge_label, i): | |||||
| try: | |||||
| return i, find_all_paths_until_length(Gn[i], length, ds_attrs, | |||||
| def wrapper_find_all_paths_until_length(length, ds_attrs, node_label, | |||||
| edge_label, itr_item): | |||||
| g = itr_item[0] | |||||
| i = itr_item[1] | |||||
| return i, find_all_paths_until_length(g, length, ds_attrs, | |||||
| node_label=node_label, edge_label=edge_label) | node_label=node_label, edge_label=edge_label) | ||||
| except Exception as e: | |||||
| traceback.print_exc() | |||||
| print('') | |||||
| raise e | |||||
| def paths2GSuffixTree(paths): | def paths2GSuffixTree(paths): | ||||
| @@ -206,54 +206,50 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
| '3. Fitting and predicting using nested cross validation. This could really take a while...' | '3. Fitting and predicting using nested cross validation. This could really take a while...' | ||||
| ) | ) | ||||
| # pool = Pool(n_jobs) | |||||
| # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||||
| # train_pref = [] | |||||
| # val_pref = [] | |||||
| # test_pref = [] | |||||
| ## if NUM_TRIALS < 1000 * n_jobs: | |||||
| ## chunksize = int(NUM_TRIALS / n_jobs) + 1 | |||||
| ## else: | |||||
| ## chunksize = 1000 | |||||
| # chunksize = 1 | |||||
| # for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||||
| # train_pref.append(o1) | |||||
| # val_pref.append(o2) | |||||
| # test_pref.append(o3) | |||||
| # pool.close() | |||||
| # pool.join() | |||||
| # ---- use pool.map to parallel. ---- | |||||
| pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
| trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | ||||
| train_pref = [] | |||||
| val_pref = [] | |||||
| test_pref = [] | |||||
| # if NUM_TRIALS < 100: | |||||
| # chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4) | |||||
| # if extra: | |||||
| # chunksize += 1 | |||||
| # else: | |||||
| # chunksize = 100 | |||||
| chunksize = 1 | |||||
| for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||||
| train_pref.append(o1) | |||||
| val_pref.append(o2) | |||||
| test_pref.append(o3) | |||||
| pool.close() | |||||
| pool.join() | |||||
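Each of the NUM_TRIALS outer cross-validation trials above is an independent task: the fixed arguments are bound with functools.partial and only the trial index travels through the pool, with chunksize 1 because trials are few and expensive. A minimal sketch of this shape, with run_trial as a dummy stand-in for trial_do:

import sys
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm

def run_trial(fixed_args, trial):
    # one heavy, independent outer trial; returns (train, val, test) scores
    return trial, trial + 0.1, trial + 0.2

def cross_validate(fixed_args, num_trials=30, n_jobs=4):
    train_pref, val_pref, test_pref = [], [], []
    with Pool(n_jobs) as pool:
        # chunksize=1: hand trials out one at a time
        for o1, o2, o3 in tqdm(pool.imap_unordered(partial(run_trial, fixed_args),
                                                   range(num_trials), 1),
                               desc='cross validation', file=sys.stdout):
            train_pref.append(o1)
            val_pref.append(o2)
            test_pref.append(o3)
    return train_pref, val_pref, test_pref

if __name__ == '__main__':                 # guard required by multiprocessing
    print(cross_validate(fixed_args=None, num_trials=5, n_jobs=2))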
| # # ---- use pool.map to parallel. ---- | |||||
| # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||||
| # train_pref = [item[0] for item in result_perf] | |||||
| # val_pref = [item[1] for item in result_perf] | |||||
| # test_pref = [item[2] for item in result_perf] | |||||
| result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||||
| train_pref = [item[0] for item in result_perf] | |||||
| val_pref = [item[1] for item in result_perf] | |||||
| test_pref = [item[2] for item in result_perf] | |||||
| # # ---- use joblib.Parallel to parallel and track progress. ---- | |||||
| # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||||
| # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS)) | |||||
| # train_pref = [item[0] for item in result_perf] | |||||
| # val_pref = [item[1] for item in result_perf] | |||||
| # test_pref = [item[2] for item in result_perf] | |||||
| # # ---- direct running, normally use a single CPU core. ---- | |||||
| # train_pref = [] | |||||
| # val_pref = [] | |||||
| # test_pref = [] | |||||
| # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||||
| # o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||||
| # train_pref.append(o1) | |||||
| # val_pref.append(o2) | |||||
| # test_pref.append(o3) | |||||
| # # ---- direct running, normally use a single CPU core. ---- | |||||
| # train_pref = [] | |||||
| # val_pref = [] | |||||
| # test_pref = [] | |||||
| # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||||
| # o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||||
| # train_pref.append(o1) | |||||
| # val_pref.append(o2) | |||||
| # test_pref.append(o3) | |||||
| # print() | |||||
| print() | print() | ||||
| print('4. Getting final performance...') | print('4. Getting final performance...') | ||||
| str_fw += '\nIII. Performance.\n\n' | str_fw += '\nIII. Performance.\n\n' | ||||
| # averages and confidences of performances on outer trials for each combination of parameters | # averages and confidences of performances on outer trials for each combination of parameters | ||||
| average_train_scores = np.mean(train_pref, axis=0) | average_train_scores = np.mean(train_pref, axis=0) | ||||
| # print('val_pref: ', val_pref[0][0]) | |||||
| average_val_scores = np.mean(val_pref, axis=0) | average_val_scores = np.mean(val_pref, axis=0) | ||||
| # print('test_pref: ', test_pref[0][0]) | |||||
| average_perf_scores = np.mean(test_pref, axis=0) | average_perf_scores = np.mean(test_pref, axis=0) | ||||
| # sample std is used here | # sample std is used here | ||||
| std_train_scores = np.std(train_pref, axis=0, ddof=1) | std_train_scores = np.std(train_pref, axis=0, ddof=1) | ||||
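Aside: ddof=1 above requests the sample standard deviation (divide by N-1 trials) rather than the population one. A toy illustration with made-up scores for 3 trials and 2 parameter settings:

    import numpy as np

    scores = np.array([[0.80, 0.60],   # trial 0
                       [0.90, 0.65],   # trial 1
                       [0.85, 0.70]])  # trial 2
    print(np.mean(scores, axis=0))          # [0.85 0.65]
    print(np.std(scores, axis=0, ddof=1))   # sample std over trials: [0.05 0.05]
    print(np.std(scores, axis=0))           # population std (divides by N) is smaller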
| @@ -264,6 +260,9 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
| best_val_perf = np.amin(average_val_scores) | best_val_perf = np.amin(average_val_scores) | ||||
| else: | else: | ||||
| best_val_perf = np.amax(average_val_scores) | best_val_perf = np.amax(average_val_scores) | ||||
| # print('average_val_scores: ', average_val_scores) | |||||
| # print('best_val_perf: ', best_val_perf) | |||||
| # print() | |||||
| best_params_index = np.where(average_val_scores == best_val_perf) | best_params_index = np.where(average_val_scores == best_val_perf) | ||||
| # find smallest val std with best val perf. | # find smallest val std with best val perf. | ||||
| best_val_stds = [ | best_val_stds = [ | ||||
| @@ -286,6 +285,9 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
| str_fw += 'best_val_perf: %s\n' % best_val_perf | str_fw += 'best_val_perf: %s\n' % best_val_perf | ||||
| str_fw += 'best_val_std: %s\n' % min_val_std | str_fw += 'best_val_std: %s\n' % min_val_std | ||||
| # print(best_params_index) | |||||
| # print(best_params_index[0]) | |||||
| # print(average_perf_scores) | |||||
| final_performance = [ | final_performance = [ | ||||
| average_perf_scores[value][best_params_index[1][idx]] | average_perf_scores[value][best_params_index[1][idx]] | ||||
| for idx, value in enumerate(best_params_index[0]) | for idx, value in enumerate(best_params_index[0]) | ||||
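Aside: the selection logic above is easier to see on a toy 2-D (outer-parameter x inner-parameter) score array; np.where returns parallel row/column index arrays, and each (row, column) pair is used to look up the matching test score. The numbers below are made up:

    import numpy as np

    average_val_scores = np.array([[0.70, 0.90],
                                   [0.90, 0.80]])
    average_perf_scores = np.array([[0.68, 0.88],
                                    [0.86, 0.79]])

    best_val_perf = np.amax(average_val_scores)               # 0.9 (higher is better for accuracy)
    best_params_index = np.where(average_val_scores == best_val_perf)
    # (array([0, 1]), array([1, 0])): the best value is tied at cells (0, 1) and (1, 0)
    final_performance = [
        average_perf_scores[value][best_params_index[1][idx]]
        for idx, value in enumerate(best_params_index[0])
    ]
    print(final_performance)   # test scores 0.88 and 0.86 at the tied best-validation cells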
| @@ -429,23 +431,23 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
| '3. Fitting and predicting using nested cross validation. This could really take a while...' | '3. Fitting and predicting using nested cross validation. This could really take a while...' | ||||
| ) | ) | ||||
| # pool = Pool(n_jobs) | |||||
| # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||||
| # train_pref = [] | |||||
| # val_pref = [] | |||||
| # test_pref = [] | |||||
| # if NUM_TRIALS < 100: | |||||
| # chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4) | |||||
| # if extra: | |||||
| # chunksize += 1 | |||||
| # else: | |||||
| # chunksize = 100 | |||||
| # for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||||
| # train_pref.append(o1) | |||||
| # val_pref.append(o2) | |||||
| # test_pref.append(o3) | |||||
| # pool.close() | |||||
| # pool.join() | |||||
| pool = Pool(n_jobs) | |||||
| trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||||
| train_pref = [] | |||||
| val_pref = [] | |||||
| test_pref = [] | |||||
| if NUM_TRIALS < 100: | |||||
| chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4) | |||||
| if extra: | |||||
| chunksize += 1 | |||||
| else: | |||||
| chunksize = 100 | |||||
| for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||||
| train_pref.append(o1) | |||||
| val_pref.append(o2) | |||||
| test_pref.append(o3) | |||||
| pool.close() | |||||
| pool.join() | |||||
| # # ---- use pool.map to parallelize. ---- | # # ---- use pool.map to parallelize. ---- | ||||
| # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | ||||
| @@ -460,15 +462,15 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
| # val_pref = [item[1] for item in result_perf] | # val_pref = [item[1] for item in result_perf] | ||||
| # test_pref = [item[2] for item in result_perf] | # test_pref = [item[2] for item in result_perf] | ||||
| # ---- direct running, normally use a single CPU core. ---- | |||||
| train_pref = [] | |||||
| val_pref = [] | |||||
| test_pref = [] | |||||
| for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||||
| o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||||
| train_pref.append(o1) | |||||
| val_pref.append(o2) | |||||
| test_pref.append(o3) | |||||
| # # ---- direct running, normally use a single CPU core. ---- | |||||
| # train_pref = [] | |||||
| # val_pref = [] | |||||
| # test_pref = [] | |||||
| # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||||
| # o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||||
| # train_pref.append(o1) | |||||
| # val_pref.append(o2) | |||||
| # test_pref.append(o3) | |||||
| print() | print() | ||||
| print('4. Getting final performance...') | print('4. Getting final performance...') | ||||
| @@ -623,89 +625,142 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t | |||||
| val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | ||||
| test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | ||||
| # Randomness is added to the seeds of the split functions below. "high" is | |||||
| # "size" times 10 so that at least 10 different random outputs can be yielded. | |||||
| # Remove these lines if identical outputs are required. | |||||
| rdm_out = np.random.RandomState(seed=None) | |||||
| rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10, | |||||
| size=len(param_list_pre_revised)) | |||||
| # print(trial, rdm_seed_out_l) | |||||
| # print() | |||||
| # loop for each outer param tuple | # loop for each outer param tuple | ||||
| for index_out, params_out in enumerate(param_list_pre_revised): | for index_out, params_out in enumerate(param_list_pre_revised): | ||||
| # split gram matrix and y into app and test sets. | # split gram matrix and y into app and test sets. | ||||
| indices = range(len(y)) | indices = range(len(y)) | ||||
| # The argument "random_state" in function "train_test_split" can not be | |||||
| # set to None, because it will use RandomState instance used by | |||||
| # np.random, which is possible for multiple subprocesses to inherit the | |||||
| # same seed if they forked at the same time, leading to identical | |||||
| # random variates for different subprocesses. Instead, we use "trial" | |||||
| # and "index_out" parameters to generate different seeds for different | |||||
| # trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add | |||||
| # randomness into seeds, so that it yields a different output every | |||||
| # time the program is run. To yield identical outputs every time, | |||||
| # remove the second line below. Same method is used to the "KFold" | |||||
| # function in the inner loop. | |||||
| rdm_seed_out = (trial + 1) * (index_out + 1) | |||||
| rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1) | |||||
| # print(trial, rdm_seed_out) | |||||
| X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( | X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( | ||||
| gram_matrices[index_out], y, indices, test_size=0.1, | gram_matrices[index_out], y, indices, test_size=0.1, | ||||
| random_state=None, shuffle=True) | |||||
| random_state=rdm_seed_out, shuffle=True) | |||||
| # print(trial, idx_app, idx_test) | |||||
| # print() | |||||
| X_app = X_app[:, idx_app] | X_app = X_app[:, idx_app] | ||||
| X_test = X_test[:, idx_app] | X_test = X_test[:, idx_app] | ||||
| y_app = np.array(y_app) | y_app = np.array(y_app) | ||||
| y_test = np.array(y_test) | y_test = np.array(y_test) | ||||
| rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10, | |||||
| size=len(param_list)) | |||||
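Aside: a compact sketch (toy sizes, placeholder names) of the two ideas above: deriving a distinct, non-deterministic integer seed per trial/outer iteration, and splitting a precomputed Gram matrix so that the training block is train-by-train while the test rows are indexed by the training columns:

    import numpy as np
    from sklearn.model_selection import train_test_split

    rng = np.random.RandomState(seed=None)                  # process-local randomness
    n_outer = 5
    seed_offsets = rng.uniform(high=n_outer * 10, size=n_outer)

    trial, index_out = 3, 2                                  # hypothetical loop counters
    seed = ((trial + 1) * (index_out + 1) + int(seed_offsets[index_out])) % (2 ** 32 - 1)

    K = np.random.rand(10, 10)                               # toy Gram matrix for 10 graphs
    K = (K + K.T) / 2                                        # symmetrize
    y = np.random.rand(10)
    indices = list(range(len(y)))
    K_app, K_test, y_app, y_test, idx_app, idx_test = train_test_split(
        K, y, indices, test_size=0.1, random_state=seed, shuffle=True)

    K_app = K_app[:, idx_app]    # (n_app, n_app): kernel values among training graphs
    K_test = K_test[:, idx_app]  # (n_test, n_app): kernel of each test graph against training graphs
    print(K_app.shape, K_test.shape)                         # (9, 9) (1, 9)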
| # loop for each inner param tuple | # loop for each inner param tuple | ||||
| for index_in, params_in in enumerate(param_list): | for index_in, params_in in enumerate(param_list): | ||||
| # print(index_in, params_in) | |||||
| # if trial == 0: | |||||
| # print(index_out, index_in) | |||||
| # print('params_in: ', params_in) | |||||
| # st = time.time() | # st = time.time() | ||||
| inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial) | |||||
| rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1) | |||||
| # print("rdm_seed_in1: ", trial, index_in, rdm_seed_in) | |||||
| rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1) | |||||
| # print("rdm_seed_in2: ", trial, index_in, rdm_seed_in) | |||||
| inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in) | |||||
| current_train_perf = [] | current_train_perf = [] | ||||
| current_valid_perf = [] | current_valid_perf = [] | ||||
| current_test_perf = [] | current_test_perf = [] | ||||
| # For regression use the Kernel Ridge method | # For regression use the Kernel Ridge method | ||||
| try: | |||||
| if model_type == 'regression': | |||||
| kr = KernelRidge(kernel='precomputed', **params_in) | |||||
| # loop for each split on validation set level | |||||
| # validation set level | |||||
| for train_index, valid_index in inner_cv.split(X_app): | |||||
| kr.fit(X_app[train_index, :][:, train_index], | |||||
| y_app[train_index]) | |||||
| # try: | |||||
| if model_type == 'regression': | |||||
| kr = KernelRidge(kernel='precomputed', **params_in) | |||||
| # loop for each split on validation set level | |||||
| # validation set level | |||||
| for train_index, valid_index in inner_cv.split(X_app): | |||||
| # print("train_index, valid_index: ", trial, index_in, train_index, valid_index) | |||||
| # if trial == 0: | |||||
| # print('train_index: ', train_index) | |||||
| # print('valid_index: ', valid_index) | |||||
| # print('idx_test: ', idx_test) | |||||
| # print('y_app[train_index]: ', y_app[train_index]) | |||||
| # print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) | |||||
| # print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) | |||||
| kr.fit(X_app[train_index, :][:, train_index], | |||||
| y_app[train_index]) | |||||
| # predict on the train, validation and test set | |||||
| y_pred_train = kr.predict( | |||||
| X_app[train_index, :][:, train_index]) | |||||
| y_pred_valid = kr.predict( | |||||
| X_app[valid_index, :][:, train_index]) | |||||
| y_pred_test = kr.predict( | |||||
| X_test[:, train_index]) | |||||
| # predict on the train, validation and test set | |||||
| y_pred_train = kr.predict( | |||||
| X_app[train_index, :][:, train_index]) | |||||
| y_pred_valid = kr.predict( | |||||
| X_app[valid_index, :][:, train_index]) | |||||
| # if trial == 0: | |||||
| # print('y_pred_valid: ', y_pred_valid) | |||||
| # print() | |||||
| y_pred_test = kr.predict( | |||||
| X_test[:, train_index]) | |||||
| # root mean squared errors | |||||
| current_train_perf.append( | |||||
| np.sqrt( | |||||
| mean_squared_error( | |||||
| y_app[train_index], y_pred_train))) | |||||
| current_valid_perf.append( | |||||
| np.sqrt( | |||||
| mean_squared_error( | |||||
| y_app[valid_index], y_pred_valid))) | |||||
| current_test_perf.append( | |||||
| np.sqrt( | |||||
| mean_squared_error( | |||||
| y_test, y_pred_test))) | |||||
| # For classification use SVM | |||||
| else: | |||||
| svc = SVC(kernel='precomputed', cache_size=200, | |||||
| verbose=False, **params_in) | |||||
| # loop for each split on validation set level | |||||
| # validation set level | |||||
| for train_index, valid_index in inner_cv.split(X_app): | |||||
| # root mean squared errors | |||||
| current_train_perf.append( | |||||
| np.sqrt( | |||||
| mean_squared_error( | |||||
| y_app[train_index], y_pred_train))) | |||||
| current_valid_perf.append( | |||||
| np.sqrt( | |||||
| mean_squared_error( | |||||
| y_app[valid_index], y_pred_valid))) | |||||
| # if trial == 0: | |||||
| # print(mean_squared_error( | |||||
| # y_app[valid_index], y_pred_valid)) | |||||
| current_test_perf.append( | |||||
| np.sqrt( | |||||
| mean_squared_error( | |||||
| y_test, y_pred_test))) | |||||
| # For classification use SVM | |||||
| else: | |||||
| svc = SVC(kernel='precomputed', cache_size=200, | |||||
| verbose=False, **params_in) | |||||
| # loop for each split on validation set level | |||||
| # validation set level | |||||
| for train_index, valid_index in inner_cv.split(X_app): | |||||
| # np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) | # np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) | ||||
| svc.fit(X_app[train_index, :][:, train_index], | |||||
| y_app[train_index]) | |||||
| # predict on the train, validation and test set | |||||
| y_pred_train = svc.predict( | |||||
| X_app[train_index, :][:, train_index]) | |||||
| y_pred_valid = svc.predict( | |||||
| X_app[valid_index, :][:, train_index]) | |||||
| y_pred_test = svc.predict( | |||||
| X_test[:, train_index]) | |||||
| # if trial == 0: | |||||
| # print('train_index: ', train_index) | |||||
| # print('valid_index: ', valid_index) | |||||
| # print('idx_test: ', idx_test) | |||||
| # print('y_app[train_index]: ', y_app[train_index]) | |||||
| # print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) | |||||
| # print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) | |||||
| svc.fit(X_app[train_index, :][:, train_index], | |||||
| y_app[train_index]) | |||||
| # predict on the train, validation and test set | |||||
| y_pred_train = svc.predict( | |||||
| X_app[train_index, :][:, train_index]) | |||||
| y_pred_valid = svc.predict( | |||||
| X_app[valid_index, :][:, train_index]) | |||||
| y_pred_test = svc.predict( | |||||
| X_test[:, train_index]) | |||||
| # accuracy scores | |||||
| current_train_perf.append( | |||||
| accuracy_score(y_app[train_index], | |||||
| y_pred_train)) | |||||
| current_valid_perf.append( | |||||
| accuracy_score(y_app[valid_index], | |||||
| y_pred_valid)) | |||||
| current_test_perf.append( | |||||
| accuracy_score(y_test, y_pred_test)) | |||||
| except ValueError: | |||||
| print(sys.exc_info()[0]) | |||||
| print(params_out, params_in) | |||||
| # accuracy scores | |||||
| current_train_perf.append( | |||||
| accuracy_score(y_app[train_index], | |||||
| y_pred_train)) | |||||
| current_valid_perf.append( | |||||
| accuracy_score(y_app[valid_index], | |||||
| y_pred_valid)) | |||||
| current_test_perf.append( | |||||
| accuracy_score(y_test, y_pred_test)) | |||||
| # except ValueError: | |||||
| # print(sys.exc_info()[0]) | |||||
| # print(params_out, params_in) | |||||
| # average performance on inner splits | # average performance on inner splits | ||||
| train_pref[index_out][index_in] = np.mean( | train_pref[index_out][index_in] = np.mean( | ||||
| @@ -715,5 +770,8 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t | |||||
| test_pref[index_out][index_in] = np.mean( | test_pref[index_out][index_in] = np.mean( | ||||
| current_test_perf) | current_test_perf) | ||||
| # print(time.time() - st) | # print(time.time() - st) | ||||
| # if trial == 0: | |||||
| # print('val_pref: ', val_pref) | |||||
| # print('test_pref: ', test_pref) | |||||
| return train_pref, val_pref, test_pref | return train_pref, val_pref, test_pref | ||||
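Aside: a self-contained sketch (synthetic data, not from the patch) of how the precomputed-kernel estimators used in trial_do consume those sub-Gram matrices within one inner fold; fitting uses the train-by-train block and every predict call passes rows of shape (n_samples, n_train):

    import numpy as np
    from sklearn.model_selection import KFold
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.svm import SVC
    from sklearn.metrics import mean_squared_error, accuracy_score

    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    K = X @ X.T                                     # toy precomputed (linear) Gram matrix
    y_reg = rng.rand(20)                            # regression targets
    y_clf = (y_reg > np.median(y_reg)).astype(int)  # balanced binary labels

    inner_cv = KFold(n_splits=5, shuffle=True, random_state=0)
    for train_index, valid_index in inner_cv.split(K):
        # regression branch: Kernel Ridge on the precomputed kernel
        kr = KernelRidge(kernel='precomputed', alpha=1.0)
        kr.fit(K[train_index, :][:, train_index], y_reg[train_index])
        y_pred_valid = kr.predict(K[valid_index, :][:, train_index])
        rmse = np.sqrt(mean_squared_error(y_reg[valid_index], y_pred_valid))

        # classification branch: SVM on the same precomputed kernel
        svc = SVC(kernel='precomputed', C=1.0)
        svc.fit(K[train_index, :][:, train_index], y_clf[train_index])
        acc = accuracy_score(y_clf[valid_index],
                             svc.predict(K[valid_index, :][:, train_index]))
        print('fold RMSE %.3f, accuracy %.3f' % (rmse, acc))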