@@ -10,6 +10,7 @@ This script compares the results with and without FCSP.
 from gklearn.dataset import Dataset
 from gklearn.utils import get_graph_kernel_by_name
 from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
+from gklearn.experiments import DATASET_ROOT
 import functools
 import os
 import pickle
@@ -17,50 +18,77 @@ import sys
 import logging


-def run_all(fcsp):
-    save_dir = 'outputs/' + ('fscp' if fcsp == True else 'naive') + '/'
-    os.makedirs(save_dir, exist_ok=True)
+# def run_all(fcsp):
+#     from sklearn.model_selection import ParameterGrid
+
+#     Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
+#                     'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
+#                     'Letter-high', 'Letter-med', 'Letter-low',
+#                     'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
+#                     'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
+#                     'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
+#                     'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
+#                     'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
+#                     'Mutagenicity', 'REDDIT-BINARY']
+
+#     Kernel_List = ['ShortestPath', 'StructuralSP']
+
+#     task_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})
+
+#     for task in list(task_grid):
-    from sklearn.model_selection import ParameterGrid
+#         save_file_suffix = '.' + task['kernel'] + '.' + task['dataset']
+#         file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
+#         if not os.path.isfile(file_name):
+#             print()
+#             print((task['kernel'], task['dataset']))
-    Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
-                    'PAH_unlabeled', 'PAH', 'MUTAG', 'Letter-high', 'Letter-med', 'Letter-low',
-                    'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
-                    'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
-                    'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
-                    'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
-                    'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
-                    'Mutagenicity', 'REDDIT-BINARY']
+#             try:
+#                 gram_matrix, run_time = compute(task['kernel'], task['dataset'], fcsp)
-    Kernel_List = ['ShortestPath', 'StructuralSP']
+#             except Exception as exp:
+#                 print('An exception occurred when running this experiment:')
+#                 LOG_FILENAME = save_dir + 'error.txt'
+#                 logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
+#                 logging.exception('\n--------------' + save_file_suffix + '------------------')
+#                 print(repr(exp))
+#             else:
+#                 save_file_suffix = '.' + task['kernel'] + task['dataset']
-    work_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})
+#                 with open(file_name, 'wb') as f:
+#                     pickle.dump(run_time, f)
-    for work in list(work_grid):
-        save_file_suffix = '.' + work['kernel'] + '.' + work['dataset']
-        file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
-        if not os.path.isfile(file_name):
-            print()
-            print((work['kernel'], work['dataset']))
-            try:
-                gram_matrix, run_time = run_work(work['kernel'], work['dataset'], fcsp)
-            except Exception as exp:
-                print('An exception occured when running this experiment:')
-                LOG_FILENAME = save_dir + 'error.txt'
-                logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
-                logging.exception(save_file_suffix)
-                print(repr(exp))
+def run_task(kernel_name, ds_name, fcsp):
+    save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
+    file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
-            save_file_suffix = '.' + work['kernel'] + work['dataset']
+    if not os.path.isfile(file_name):
+        print()
+        print((kernel_name, ds_name, str(fcsp)))
+
+        try:
+            gram_matrix, run_time = compute(kernel_name, ds_name, fcsp)
+        except Exception as exp:
+            print('An exception occurred when running this experiment:')
+            LOG_FILENAME = os.path.join(save_dir, 'error' + save_file_suffix + '.txt')
+            logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
+            logging.exception('\n--------------' + save_file_suffix + '------------------')
+            print(repr(exp))
+        else:
             with open(file_name, 'wb') as f:
                 pickle.dump(run_time, f)


-def run_work(kernel_name, ds_name, fcsp):
-    dataset = Dataset(ds_name, verbose=True)
+def compute(kernel_name, ds_name, fcsp):
+    dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
+    if kernel_name == 'ShortestPath':
+        dataset.trim_dataset(edge_required=True)

     mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
     node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
@@ -87,8 +115,15 @@ def compute(kernel_name, ds_name, fcsp):
 if __name__ == '__main__':
     if len(sys.argv) > 1:
-        fcsp = True if sys.argv[1] == 'True' else False
+        kernel_name = sys.argv[1]
+        ds_name = sys.argv[2]
+        fcsp = True if sys.argv[3] == 'True' else False
     else:
+        kernel_name = 'ShortestPath'
+        ds_name = 'Acyclic'
         fcsp = True
-    run_all(fcsp)
+
+    save_dir = 'outputs/'
+    os.makedirs(save_dir, exist_ok=True)
+
+    run_task(kernel_name, ds_name, fcsp)
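
# Usage sketch (illustrative): with the argument order established above --
# kernel name, dataset name, fcsp flag -- a single task is launched as, e.g.,
#     python3 compare_fcsp.py ShortestPath Acyclic True
# which pickles the measured run time to
# outputs/run_time.ShortestPath.Acyclic.True.pkl.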
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 2 17:41:54 2020

@author: ljia

This script compares the results with and without FCSP.
"""
from gklearn.dataset import Dataset
from shortest_path import SPSpace
from structural_sp import SSPSpace
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from gklearn.experiments import DATASET_ROOT
import functools
import os
import pickle
import sys
import logging


def run_task(kernel_name, ds_name, fcsp):
    save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
    file_name = os.path.join(save_dir, 'space' + save_file_suffix + '.pkl')

    # Return if the task is already completed.
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            data = pickle.load(f)
            if data['completed']:
                return

    print()
    print((kernel_name, ds_name, str(fcsp)))

    try:
        gram_matrix, run_time = compute(kernel_name, ds_name, fcsp, file_name)
    except Exception as exp:
        print('An exception occurred when running this experiment:')
        LOG_FILENAME = os.path.join(save_dir, 'error.space' + save_file_suffix + '.txt')
        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
        logging.exception('\n--------------' + save_file_suffix + '------------------')
        print(repr(exp))
    # else:
    #     with open(file_name, 'wb') as f:
    #         pickle.dump(run_time, f)


def compute(kernel_name, ds_name, fcsp, file_name):
    dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
    if kernel_name == 'ShortestPath':
        dataset.trim_dataset(edge_required=True)
        # dataset.cut_graphs(range(0, 10))
        kernel_class = SPSpace
    else:
        # dataset.cut_graphs(range(0, 10))
        kernel_class = SSPSpace

    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
    edge_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}

    graph_kernel = kernel_class(name=kernel_name,
                                node_labels=dataset.node_labels,
                                edge_labels=dataset.edge_labels,
                                node_attrs=dataset.node_attrs,
                                edge_attrs=dataset.edge_attrs,
                                ds_infos=dataset.get_dataset_infos(keys=['directed']),
                                fcsp=fcsp,
                                compute_method='naive',
                                node_kernels=node_kernels,
                                edge_kernels=edge_kernels,
                                file_name=file_name
                                )
    gram_matrix, run_time = graph_kernel.compute(dataset.graphs,
                                                 parallel=None,
                                                 normalize=False,
                                                 verbose=2
                                                 )
    return gram_matrix, run_time


if __name__ == '__main__':
    if len(sys.argv) > 1:
        kernel_name = sys.argv[1]
        ds_name = sys.argv[2]
        fcsp = True if sys.argv[3] == 'True' else False
    else:
        kernel_name = 'StructuralSP'
        ds_name = 'Fingerprint'
        fcsp = True

    save_dir = 'outputs/'
    os.makedirs(save_dir, exist_ok=True)

    run_task(kernel_name, ds_name, fcsp)
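
# Usage sketch (illustrative): profiling the space behaviour of StructuralSP
# on Fingerprint with FCSP enabled:
#     python3 compare_fcsp_space.py StructuralSP Fingerprint True
# Statistics land in outputs/space.StructuralSP.Fingerprint.True.pkl; rerunning
# the same command resumes from the last checkpoint rather than starting over.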
@@ -10,27 +10,60 @@ import os
 import re


+OUT_TIME_LIST = [('ShortestPath', 'ENZYMES', 'False'),
+                 ('StructuralSP', 'ENZYMES', 'True'),
+                 ('StructuralSP', 'ENZYMES', 'False'),
+                 ('StructuralSP', 'AIDS', 'False'),
+                 ('ShortestPath', 'NCI1', 'False'),
+                 ('StructuralSP', 'NCI1', 'True'),
+                 ('StructuralSP', 'NCI1', 'False'),
+                 ('ShortestPath', 'NCI109', 'False'),
+                 ('StructuralSP', 'NCI109', 'True'),
+                 ('StructuralSP', 'NCI109', 'False'),
+                 ('ShortestPath', 'DD', 'True'),
+                 ('ShortestPath', 'DD', 'False'),
+                 ('StructuralSP', 'BZR', 'False'),
+                 ('ShortestPath', 'COX2', 'False'),
+                 ('StructuralSP', 'COX2', 'False'),
+                 ('ShortestPath', 'DHFR', 'False'),
+                 ]
+
+OUT_MEM_LIST = [('StructuralSP', 'PROTEINS', 'True'),
+                ('StructuralSP', 'PROTEINS', 'False'),
+                ('StructuralSP', 'PROTEINS_full', 'True'),
+                ('StructuralSP', 'PROTEINS_full', 'False'),
+                ('ShortestPath', 'REDDIT-BINARY', 'True'),
+                ]
+
+MISS_LABEL_LIST = [('StructuralSP', 'GREC', 'True'),
+                   ('StructuralSP', 'GREC', 'False'),
+                   ('StructuralSP', 'Web', 'True'),
+                   ('StructuralSP', 'Web', 'False'),
+                   ]
+
+
-def get_job_script(param):
+def get_job_script(kernel, dataset, fcsp):
     script = r"""
 #!/bin/bash
 #SBATCH --exclusive
-#SBATCH --job-name="fcsp.""" + param + r""""
-#SBATCH --partition=long
+#SBATCH --job-name="fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""""
+#SBATCH --partition=tlong
 #SBATCH --mail-type=ALL
 #SBATCH --mail-user=jajupmochi@gmail.com
-#SBATCH --output="outputs/output_fcsp.""" + param + r""".txt"
-#SBATCH --error="errors/error_fcsp.""" + param + r""".txt"
+#SBATCH --output="outputs/output_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
+#SBATCH --error="errors/error_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
 #
 #SBATCH --ntasks=1
 #SBATCH --nodes=1
 #SBATCH --cpus-per-task=1
-#SBATCH --time=100:00:00
-#SBATCH --mem-per-cpu=4000
+#SBATCH --time=300:00:00
+##SBATCH --mem-per-cpu=4000
+#SBATCH --mem=40000

 srun hostname
 srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
-srun python3 compare_fcsp.py """ + param
+srun python3 compare_fcsp.py """ + kernel + r" " + dataset + r" " + fcsp
     script = script.strip()
     script = re.sub('\n\t+', '\n', script)
     script = re.sub('\n +', '\n', script)
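
# For illustration, get_job_script('ShortestPath', 'MUTAG', 'True') now
# renders a batch script along these lines (after the re.sub clean-up above):
#
#     #!/bin/bash
#     #SBATCH --exclusive
#     #SBATCH --job-name="fcsp.ShortestPath.MUTAG.True"
#     #SBATCH --partition=tlong
#     ...
#     #SBATCH --time=300:00:00
#     #SBATCH --mem=40000
#     srun python3 compare_fcsp.py ShortestPath MUTAG True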
@@ -38,15 +71,75 @@ srun python3 compare_fcsp.py """ + param
     return script


+def check_task_status(save_dir, *params):
+    str_task_id = '.' + '.'.join(params)
+
+    # Check whether the task is in the out-of-memory, out-of-time or
+    # missing-label lists.
+    if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
+        return True
+
+    # Check whether the task is running or queued in Slurm.
+    command = 'squeue --user ljia02 --name "fcsp' + str_task_id + '" --format "%.2t" --noheader'
+    stream = os.popen(command)
+    output = stream.readlines()
+    if len(output) > 0:
+        return True
+
+    # Check whether the results are already computed.
+    file_name = os.path.join(save_dir, 'run_time' + str_task_id + '.pkl')
+    if os.path.isfile(file_name):
+        return True
+
+    return False
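
# Note that str_task_id takes the form '.ShortestPath.MUTAG.True', matching
# both the Slurm job names built in get_job_script() and the run_time pickle
# names written by compare_fcsp.py.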
 if __name__ == '__main__':
+    save_dir = 'outputs/'
+    os.makedirs(save_dir, exist_ok=True)
     os.makedirs('outputs/', exist_ok=True)
     os.makedirs('errors/', exist_ok=True)

-    param_list = ['True', 'False']
-    for param in param_list[:]:
-        job_script = get_job_script(param)
-        command = 'sbatch <<EOF\n' + job_script + '\nEOF'
-        # print(command)
-        os.system(command)
+    from sklearn.model_selection import ParameterGrid
+
+    Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
+                    'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
+                    'Letter-high', 'Letter-med', 'Letter-low',
+                    'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
+                    # new: not so large.
+                    'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
+                    'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
+                    'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
+                    'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
+                    # new: large.
+                    'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
+                    'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
+                    'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
+                    'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
+                    'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
+                    'COLORS-3', 'DBLP_v1', 'REDDIT-MULTI-12K',
+                    'MSRC_9', 'MSRC_21', 'MSRC_21C',
+                    'COLLAB', 'COIL-DEL',
+                    'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
+                    'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K']
+
+    Kernel_List = ['ShortestPath', 'StructuralSP']
+
+    fcsp_list = ['True', 'False']
+
+    task_grid = ParameterGrid({'kernel': Kernel_List[:],
+                               'dataset': Dataset_List[:],
+                               'fcsp': fcsp_list[:]})
+
+    from tqdm import tqdm
+    for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):
+        if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
+            job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
+            command = 'sbatch <<EOF\n' + job_script + '\nEOF'
+            # print(command)
+            os.system(command)

 #    os.popen(command)
 #    output = stream.readlines()
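
# The 'sbatch <<EOF ... EOF' construction feeds the generated script to sbatch
# on stdin through a shell here-document, so no temporary job file needs to be
# written to disk.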
@@ -0,0 +1,225 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 14 11:49:43 2020

@author: ljia
"""
import os
import re
import pickle


OUT_TIME_LIST = []

OUT_MEM_LIST = [('ShortestPath', 'REDDIT-BINARY', 'True'),
                ('ShortestPath', 'REDDIT-BINARY', 'False'),
                ('ShortestPath', 'DD', 'True'),
                ('ShortestPath', 'DD', 'False'),
                ('ShortestPath', 'MCF-7', 'True'),
                ('ShortestPath', 'MCF-7', 'False'),
                ('StructuralSP', 'MCF-7', 'True'),
                ('StructuralSP', 'MCF-7', 'False'),
                ('ShortestPath', 'MCF-7H', 'True'),
                ('ShortestPath', 'MCF-7H', 'False'),
                ('StructuralSP', 'MCF-7H', 'True'),
                ('StructuralSP', 'MCF-7H', 'False'),
                ('ShortestPath', 'MOLT-4', 'True'),
                ('ShortestPath', 'MOLT-4', 'False'),
                ('StructuralSP', 'MOLT-4', 'True'),
                ('StructuralSP', 'MOLT-4', 'False'),
                ('ShortestPath', 'MOLT-4H', 'True'),
                ('ShortestPath', 'MOLT-4H', 'False'),
                ('StructuralSP', 'MOLT-4H', 'True'),
                ('StructuralSP', 'MOLT-4H', 'False'),
                ('ShortestPath', 'P388', 'True'),
                ('ShortestPath', 'P388H', 'True'),
                ('ShortestPath', 'NCI-H23', 'True'),
                ('ShortestPath', 'NCI-H23', 'False'),
                ('StructuralSP', 'NCI-H23', 'True'),
                ('StructuralSP', 'NCI-H23', 'False'),
                ('ShortestPath', 'NCI-H23H', 'True'),
                ('ShortestPath', 'NCI-H23H', 'False'),
                ('StructuralSP', 'NCI-H23H', 'True'),
                ('StructuralSP', 'NCI-H23H', 'False'),
                ('ShortestPath', 'OVCAR-8', 'True'),
                ('ShortestPath', 'OVCAR-8', 'False'),
                ('StructuralSP', 'OVCAR-8', 'True'),
                ('StructuralSP', 'OVCAR-8', 'False'),
                ('ShortestPath', 'OVCAR-8H', 'False'),
                ('StructuralSP', 'OVCAR-8H', 'False'),
                ('ShortestPath', 'SN12C', 'True'),
                ('ShortestPath', 'SN12C', 'False'),
                ('StructuralSP', 'SN12C', 'True'),
                ('StructuralSP', 'SN12C', 'False'),
                ('ShortestPath', 'SN12CH', 'True'),
                ('ShortestPath', 'SN12CH', 'False'),
                ('ShortestPath', 'SF-295', 'True'),
                ('ShortestPath', 'SF-295', 'False'),
                ('StructuralSP', 'SF-295', 'True'),
                ('StructuralSP', 'SF-295', 'False'),
                ('ShortestPath', 'SF-295H', 'False'),
                ('StructuralSP', 'SF-295H', 'False'),
                ('ShortestPath', 'SW-620', 'True'),
                ('ShortestPath', 'SW-620', 'False'),
                ('StructuralSP', 'SW-620', 'True'),
                ('StructuralSP', 'SW-620', 'False'),
                ('ShortestPath', 'SW-620H', 'False'),
                ('StructuralSP', 'SW-620H', 'False'),
                ('ShortestPath', 'TRIANGLES', 'False'),
                ('StructuralSP', 'TRIANGLES', 'False'),
                ('ShortestPath', 'Yeast', 'True'),
                ('ShortestPath', 'Yeast', 'False'),
                ('StructuralSP', 'Yeast', 'True'),
                ('StructuralSP', 'Yeast', 'False'),
                ('ShortestPath', 'YeastH', 'True'),
                ('ShortestPath', 'FRANKENSTEIN', 'True'),
                ('ShortestPath', 'FRANKENSTEIN', 'False'),
                ('StructuralSP', 'FRANKENSTEIN', 'True'),
                ('StructuralSP', 'FRANKENSTEIN', 'False'),
                ('StructuralSP', 'SN12CH', 'True'),
                ('StructuralSP', 'SN12CH', 'False'),
                ('ShortestPath', 'UACC257', 'True'),
                ('ShortestPath', 'UACC257', 'False'),
                ('StructuralSP', 'UACC257', 'True'),
                ('StructuralSP', 'UACC257', 'False'),
                ('ShortestPath', 'UACC257H', 'True'),
                ('ShortestPath', 'UACC257H', 'False'),
                ('StructuralSP', 'UACC257H', 'True'),
                ('StructuralSP', 'UACC257H', 'False'),
                ('ShortestPath', 'PC-3', 'True'),
                ('ShortestPath', 'PC-3', 'False'),
                ('StructuralSP', 'PC-3', 'True'),
                ('StructuralSP', 'PC-3', 'False'),
                ('ShortestPath', 'PC-3H', 'True'),
                ('ShortestPath', 'PC-3H', 'False'),
                ('StructuralSP', 'PC-3H', 'True'),
                ('StructuralSP', 'PC-3H', 'False'),
                ('ShortestPath', 'DBLP_v1', 'False'),
                ('StructuralSP', 'DBLP_v1', 'True'),
                ('ShortestPath', 'REDDIT-BINARY', 'False'),
                ('ShortestPath', 'REDDIT-MULTI-12K', 'False'),
                ('StructuralSP', 'REDDIT-MULTI-12K', 'False'),
                ('ShortestPath', 'TWITTER-Real-Graph-Partial', 'True'),
                ('ShortestPath', 'TWITTER-Real-Graph-Partial', 'False'),
                ('StructuralSP', 'TWITTER-Real-Graph-Partial', 'True'),
                ('StructuralSP', 'TWITTER-Real-Graph-Partial', 'False'),
                ]
MISS_LABEL_LIST = [('StructuralSP', 'GREC', 'True'),
                   ('StructuralSP', 'GREC', 'False'),
                   ('StructuralSP', 'Web', 'True'),
                   ('StructuralSP', 'Web', 'False'),
                   ]


def get_job_script(kernel, dataset, fcsp):
    # if (kernel, dataset, fcsp) in OUT_MEM_LIST:
    #     mem = '2560000'
    # else:
    mem = '4000'

    script = r"""
#!/bin/bash
#SBATCH --exclusive
#SBATCH --job-name="fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""""
#SBATCH --partition=""" + (r"court" if kernel == 'ShortestPath' else r"court") + r"""
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#SBATCH --error="errors/error_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=""" + (r"48" if kernel == 'ShortestPath' else r"48") + r""":00:00
##SBATCH --mem-per-cpu=""" + mem + r"""
#SBATCH --mem=4000

srun hostname
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
srun python3 compare_fcsp_space.py """ + kernel + r" " + dataset + r" " + fcsp
    script = script.strip()
    script = re.sub('\n\t+', '\n', script)
    script = re.sub('\n +', '\n', script)

    return script
def check_task_status(save_dir, *params):
    str_task_id = '.' + '.'.join(params)

    # Check whether the task is in the out-of-memory, out-of-time or
    # missing-label lists.
    if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
        return True

    # Check whether the task is running or queued in Slurm.
    command = 'squeue --user ljia02 --name "fcsp.space' + str_task_id + '" --format "%.2t" --noheader'
    stream = os.popen(command)
    output = stream.readlines()
    if len(output) > 0:
        return True

    # Check whether the task is already computed.
    file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl')
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            data = pickle.load(f)
            if data['completed']:
                return True

    return False
if __name__ == '__main__':
    save_dir = 'outputs/'
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs('errors/', exist_ok=True)

    from sklearn.model_selection import ParameterGrid

    Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
                    'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
                    'Letter-high', 'Letter-med', 'Letter-low',
                    'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
                    # new: not so large.
                    'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
                    'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
                    'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
                    'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
                    # new: large.
                    'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
                    'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
                    'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
                    'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
                    'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
                    'COLORS-3', 'DBLP_v1', 'REDDIT-MULTI-12K',
                    'MSRC_9', 'MSRC_21', 'MSRC_21C',
                    'COLLAB', 'COIL-DEL',
                    'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
                    'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K']

    Kernel_List = ['ShortestPath', 'StructuralSP']

    fcsp_list = ['True', 'False']

    task_grid = ParameterGrid({'kernel': Kernel_List[:],
                               'dataset': Dataset_List[:],
                               'fcsp': fcsp_list[:]})

    from tqdm import tqdm
    for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):
        if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
            job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
            command = 'sbatch <<EOF\n' + job_script + '\nEOF'
            # print(command)
            os.system(command)

#    os.popen(command)
#    output = stream.readlines()
@@ -0,0 +1,253 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 15:24:58 2020

@author: ljia

@references:

    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data
    Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
"""
import sys
from itertools import product
# from functools import partial
from gklearn.utils import get_iters
import numpy as np
from gklearn.utils.utils import getSPGraph
from gklearn.kernels import ShortestPath
import os
import pickle
from pympler import asizeof
import time
import networkx as nx


def load_results(file_name, fcsp):
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        results = {'nb_comparison': [], 'i': -1, 'j': -1, 'completed': False}
        if fcsp:
            results['vk_dict_mem'] = []
        return results


def save_results(file_name, results):
    with open(file_name, 'wb') as f:
        pickle.dump(results, f)
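

# The constants in estimate_vk_memory() below look like hand-derived CPython
# size estimates standing in for the (much slower) pympler measurements that
# are commented out: roughly 64 bytes per 2-tuple key, 24 bytes per float
# value, 28 bytes per int node id, and a one-off 52 bytes for the shared
# True/False singletons when the values are booleans.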
def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
    # asizeof.asized(obj, detail=1).format()
    # return asizeof.asizeof(obj)
    key, val = next(iter(obj.items()))
    # key = dict.iterkeys().next()
    # key_mem = asizeof.asizeof(key)
    dict_flat = sys.getsizeof(obj)
    key_mem = 64

    if isinstance(val, float):
        val_mem = 24
        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
    else:  # value is True or False
        mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

    # print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')

    return mem
def compute_stats(file_name, results):
    del results['i']
    del results['j']
    results['nb_comparison'] = np.mean(results['nb_comparison'])
    results['completed'] = True
    if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
        results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
    save_results(file_name, results)
class SPSpace(ShortestPath):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._file_name = kwargs.get('file_name')

    # @profile
    def _compute_gm_series(self):
        self._all_graphs_have_edges(self._graphs)
        # get shortest path graph of each graph.
        iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
        self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]

        results = load_results(self._file_name, self._fcsp)

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
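        # Note: the Gram matrix is deliberately returned as all zeros; this
        # class only records comparison counts and estimated dictionary sizes,
        # and the actual kernel values are never accumulated (the kernel
        # computation below is commented out).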
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
        iterator = get_iters(itr, desc='Computing kernels',
                             length=len_itr, file=sys.stdout, verbose=(self._verbose >= 2))

        time0 = time.time()
        for i, j in iterator:
            if i > results['i'] or (i == results['i'] and j > results['j']):
                data = self._sp_do_space(self._graphs[i], self._graphs[j])
                if self._fcsp:
                    results['nb_comparison'].append(data[0])
                    if data[1] != {}:
                        results['vk_dict_mem'].append(estimate_vk_memory(data[1],
                            nx.number_of_nodes(self._graphs[i]),
                            nx.number_of_nodes(self._graphs[j])))
                else:
                    results['nb_comparison'].append(data)
                results['i'] = i
                results['j'] = j

                time1 = time.time()
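                # Checkpoint intermediate results at most every 600 s so that
                # a killed or timed-out job can resume from results['i'] and
                # results['j'].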
                if time1 - time0 > 600:
                    save_results(self._file_name, results)
                    time0 = time1

        compute_stats(self._file_name, results)

        return gram_matrix
    def _sp_do_space(self, g1, g2):
        if self._fcsp:  # @todo: it may be put outside the _sp_do().
            return self._sp_do_fcsp(g1, g2)
        else:
            return self._sp_do_naive(g1, g2)

    def _sp_do_fcsp(self, g1, g2):
        nb_comparison = 0

        # compute shortest path matrices first, method borrowed from FCSP.
        vk_dict = {}  # shortest path matrices dict
        if len(self._node_labels) > 0:  # @todo: it may be put outside the _sp_do().
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['mix']
                for n1, n2 in product(
                        g1.nodes(data=True), g2.nodes(data=True)):
                    n1_labels = [n1[1][nl] for nl in self._node_labels]
                    n2_labels = [n2[1][nl] for nl in self._node_labels]
                    n1_attrs = [n1[1][na] for na in self._node_attrs]
                    n2_attrs = [n2[1][na] for na in self._node_attrs]
                    vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
                    nb_comparison += 1
            # node symb labeled
            else:
                kn = self._node_kernels['symb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_labels = [n1[1][nl] for nl in self._node_labels]
                        n2_labels = [n2[1][nl] for nl in self._node_labels]
                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
                        nb_comparison += 1
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['nsymb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_attrs = [n1[1][na] for na in self._node_attrs]
                        n2_attrs = [n2[1][na] for na in self._node_attrs]
                        vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
                        nb_comparison += 1
            # node unlabeled
            else:
                for e1, e2 in product(
                        g1.edges(data=True), g2.edges(data=True)):
                    pass
                    # if e1[2]['cost'] == e2[2]['cost']:
                    #     kernel += 1
                    # nb_comparison += 1

        return nb_comparison, vk_dict
        # # compute graph kernels
        # if self._ds_infos['directed']:
        #     for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
        #         if e1[2]['cost'] == e2[2]['cost']:
        #             nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
        #             kn1 = nk11 * nk22
        #             kernel += kn1
        # else:
        #     for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
        #         if e1[2]['cost'] == e2[2]['cost']:
        #             # each edge walk is counted twice, starting from both its extreme nodes.
        #             nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
        #                 e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
        #             kn1 = nk11 * nk22
        #             kn2 = nk12 * nk21
        #             kernel += kn1 + kn2
    def _sp_do_naive(self, g1, g2):
        nb_comparison = 0

        # Define the function to compute kernels between vertices in each condition.
        if len(self._node_labels) > 0:
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['mix']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
            # node symb labeled
            else:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['symb']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    return kn(n1_labels, n2_labels)
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['nsymb']
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_attrs, n2_attrs)
            # node unlabeled
            else:
                # for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                #     if e1[2]['cost'] == e2[2]['cost']:
                #         kernel += 1
                return 0

        # compute graph kernels
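        # Counting scheme: per pair of matching-cost edges, the real kernel
        # would make 2 vertex-kernel calls in the directed case (start-start,
        # end-end) and 4 in the undirected case (both orientations), hence
        # the += 2 / += 4 below.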
        if self._ds_infos['directed']:
            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    # nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1])
                    # kn1 = nk11 * nk22
                    # kernel += kn1
                    nb_comparison += 2
        else:
            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    # each edge walk is counted twice, starting from both its extreme nodes.
                    # nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk(
                    #     e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1])
                    # kn1 = nk11 * nk22
                    # kn2 = nk12 * nk21
                    # kernel += kn1 + kn2
                    nb_comparison += 4

        return nb_comparison
@@ -0,0 +1,439 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 30 11:59:57 2020

@author: ljia

@references:

    [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
    Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
"""
import sys
from itertools import product
from gklearn.utils import get_iters
import numpy as np
import time
import os, errno
import pickle
from pympler import asizeof
import networkx as nx

from gklearn.utils.utils import get_shortest_paths
from gklearn.kernels import StructuralSP


def load_splist(file_name):
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        results_path = {'splist': [], 'i': -1, 'completed': False}
        return results_path


def load_results(file_name, fcsp):
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        results = {'nb_v_comparison': [], 'nb_e_comparison': [], 'i': -1, 'j': -1, 'completed': False}
        if fcsp:
            results['vk_dict_mem'] = []
            results['ek_dict_mem'] = []
        return results


def save_results(file_name, results):
    with open(file_name, 'wb') as f:
        pickle.dump(results, f)


def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
    # asizeof.asized(obj, detail=1).format()
    # return asizeof.asizeof(obj)
    key, val = next(iter(obj.items()))
    # key = dict.iterkeys().next()
    # key_mem = asizeof.asizeof(key)
    dict_flat = sys.getsizeof(obj)
    key_mem = 64

    if isinstance(val, float):
        val_mem = 24
        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
    else:  # value is True or False
        mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

    # print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')

    return mem
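

# Same size heuristic as estimate_vk_memory() above, except key_mem = 192:
# here each key is a pair of edge tuples, ((u1, v1), (u2, v2)), i.e. roughly
# three 2-tuples per key instead of one.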
def estimate_ek_memory(obj, nb_nodes1, nb_nodes2):
    # asizeof.asized(obj, detail=1).format()
    # return asizeof.asizeof(obj)
    key, val = next(iter(obj.items()))
    # key = dict.iterkeys().next()
    # key_mem = asizeof.asizeof(key)
    dict_flat = sys.getsizeof(obj)
    key_mem = 192

    if isinstance(val, float):
        val_mem = 24
        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
    else:  # value is True or False
        mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

    # print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')

    return mem
def compute_stats(file_name, results, splist):
    del results['i']
    del results['j']
    results['nb_v_comparison'] = np.mean(results['nb_v_comparison'])
    # if len(results['nb_e_comparison']) > 0:
    results['nb_e_comparison'] = np.mean(results['nb_e_comparison'])
    results['completed'] = True
    if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
        results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
    if 'ek_dict_mem' in results and len(results['ek_dict_mem']) > 0:
        results['ek_dict_mem'] = np.mean(results['ek_dict_mem'])
    results['nb_sp_ave'] = np.mean([len(ps) for ps in splist])
    results['sp_len_ave'] = np.mean([np.mean([len(p) for p in ps]) for ps in splist])
    results['sp_mem_all'] = asizeof.asizeof(splist)
    save_results(file_name, results)
class SSPSpace(StructuralSP):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._file_name = kwargs.get('file_name')

    # @profile
    def _compute_gm_series(self):
        # get shortest paths of each graph in the graphs.
        fn_paths = os.path.splitext(self._file_name)[0] + '.paths.pkl'
        results_path = load_splist(fn_paths)

        if not results_path['completed']:
            iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
            if self._compute_method == 'trie':
                for g in iterator:
                    results_path['splist'].append(self._get_sps_as_trie(g))
            else:
                time0 = time.time()
                for i, g in enumerate(iterator):
                    if i > results_path['i']:
                        results_path['splist'].append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))
                        results_path['i'] = i

                        time1 = time.time()
                        if time1 - time0 > 600:
                            save_results(fn_paths, results_path)
                            time0 = time1

            del results_path['i']
            results_path['completed'] = True
            save_results(fn_paths, results_path)
        #########
        splist = results_path['splist']

        results = load_results(self._file_name, self._fcsp)

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
        iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
                             length=len_itr, verbose=(self._verbose >= 2))
        if self._compute_method == 'trie':
            for i, j in iterator:
                kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j])
                gram_matrix[i][j] = kernel
                gram_matrix[j][i] = kernel
        else:
            time0 = time.time()
            for i, j in iterator:
                if i > results['i'] or (i == results['i'] and j > results['j']):
                    data = self._ssp_do_naive_space(self._graphs[i], self._graphs[j], splist[i], splist[j])
                    results['nb_v_comparison'].append(data[0])
                    results['nb_e_comparison'].append(data[1])
                    if self._fcsp:
                        if data[2] != {}:
                            results['vk_dict_mem'].append(estimate_vk_memory(data[2],
                                nx.number_of_nodes(self._graphs[i]),
                                nx.number_of_nodes(self._graphs[j])))
                        if data[3] != {}:
                            results['ek_dict_mem'].append(estimate_ek_memory(data[3],
                                nx.number_of_nodes(self._graphs[i]),
                                nx.number_of_nodes(self._graphs[j])))
                    results['i'] = i
                    results['j'] = j

                    time1 = time.time()
                    if time1 - time0 > 600:
                        save_results(self._file_name, results)
                        time0 = time1

        compute_stats(self._file_name, results, splist)

        # @todo: may not remove the path file if the program stops exactly here.
        try:
            os.remove(fn_paths)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

        return gram_matrix
    def _ssp_do_naive_space(self, g1, g2, spl1, spl2):
        if self._fcsp:  # @todo: it may be put outside the _sp_do().
            return self._sp_do_naive_fcsp(g1, g2, spl1, spl2)
        else:
            return self._sp_do_naive_naive(g1, g2, spl1, spl2)

    def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2):
        # First, compute shortest path matrices, method borrowed from FCSP.
        vk_dict, nb_v_comparison = self._get_all_node_kernels(g1, g2)
        # Then, compute kernels between all pairs of edges, which is an
        # extension of the FCSP idea. It suits sparse graphs, which covers
        # most of the cases we went through. For dense graphs, this would be
        # slow.
        ek_dict, nb_e_comparison = self._get_all_edge_kernels(g1, g2)

        return nb_v_comparison, nb_e_comparison, vk_dict, ek_dict
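    # In the naive variant below, vertex and edge kernels are evaluated on
    # demand inside the path loops, so the counters tally one comparison per
    # kernel call the real StructuralSP kernel would make; no lookup
    # dictionaries are built, which is exactly the space trade-off this class
    # measures.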
    def _sp_do_naive_naive(self, g1, g2, spl1, spl2):
        nb_v_comparison = 0
        nb_e_comparison = 0

        # Define the function to compute kernels between vertices in each condition.
        if len(self._node_labels) > 0:
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['mix']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
            # node symb labeled
            else:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['symb']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    return kn(n1_labels, n2_labels)
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['nsymb']
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_attrs, n2_attrs)
            # # node unlabeled
            # else:
            #     for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
            #         if e1[2]['cost'] == e2[2]['cost']:
            #             kernel += 1
            #     return kernel

        # Define the function to compute kernels between edges in each condition.
        if len(self._edge_labels) > 0:
            # edge symb and non-symb labeled
            if len(self._edge_attrs) > 0:
                def compute_ek(e1, e2):
                    ke = self._edge_kernels['mix']
                    e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
                    e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
                    e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
                    e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
                    return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
            # edge symb labeled
            else:
                def compute_ek(e1, e2):
                    ke = self._edge_kernels['symb']
                    e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
                    e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
                    return ke(e1_labels, e2_labels)
        else:
            # edge non-symb labeled
            if len(self._edge_attrs) > 0:
                def compute_ek(e1, e2):
                    ke = self._edge_kernels['nsymb']
                    e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
                    e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
                    return ke(e1_attrs, e2_attrs)
        # compute graph kernels
        if len(self._node_labels) > 0 or len(self._node_attrs) > 0:
            if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        # nb_v_comparison = len(p1)
                        # nb_e_comparison = len(p1) - 1
                        kpath = compute_vk(p1[0], p2[0])
                        nb_v_comparison += 1
                        if kpath:
                            for idx in range(1, len(p1)):
                                kpath *= compute_vk(p1[idx], p2[idx]) * \
                                    compute_ek((p1[idx-1], p1[idx]),
                                               (p2[idx-1], p2[idx]))
                                nb_v_comparison += 1
                                nb_e_comparison += 1
                                if not kpath:
                                    break
                        # kernel += kpath  # add up kernels of all paths
            else:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        kpath = compute_vk(p1[0], p2[0])
                        nb_v_comparison += 1
                        if kpath:
                            for idx in range(1, len(p1)):
                                kpath *= compute_vk(p1[idx], p2[idx])
                                nb_v_comparison += 1
                                if not kpath:
                                    break
                        # kernel += kpath  # add up kernels of all paths
        else:
            if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        if len(p1) == 0:
                            pass
                        else:
                            kpath = 1
                            for idx in range(0, len(p1) - 1):
                                kpath *= compute_ek((p1[idx], p1[idx+1]),
                                                    (p2[idx], p2[idx+1]))
                                nb_e_comparison += 1
                                if not kpath:
                                    break
            else:
                pass
                # for p1, p2 in product(spl1, spl2):
                #     if len(p1) == len(p2):
                #         kernel += 1

        # try:
        #     kernel = kernel / (len(spl1) * len(spl2))  # Compute mean average
        # except ZeroDivisionError:
        #     print(spl1, spl2)
        #     print(g1.nodes(data=True))
        #     print(g1.edges(data=True))
        #     raise Exception

        return nb_v_comparison, nb_e_comparison
    def _get_all_node_kernels(self, g1, g2):
        nb_comparison = 0

        vk_dict = {}  # shortest path matrices dict
        if len(self._node_labels) > 0:
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['mix']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_labels = [n1[1][nl] for nl in self._node_labels]
                        n2_labels = [n2[1][nl] for nl in self._node_labels]
                        n1_attrs = [n1[1][na] for na in self._node_attrs]
                        n2_attrs = [n2[1][na] for na in self._node_attrs]
                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
                        nb_comparison += 1
            # node symb labeled
            else:
                kn = self._node_kernels['symb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_labels = [n1[1][nl] for nl in self._node_labels]
                        n2_labels = [n2[1][nl] for nl in self._node_labels]
                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
                        nb_comparison += 1
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['nsymb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_attrs = [n1[1][na] for na in self._node_attrs]
                        n2_attrs = [n2[1][na] for na in self._node_attrs]
                        vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
                        nb_comparison += 1
            # node unlabeled
            else:
                pass  # @todo: add edge weights.
                # for e1 in g1.edges(data=True):
                #     for e2 in g2.edges(data=True):
                #         if e1[2]['cost'] == e2[2]['cost']:
                #             kernel += 1
                # return kernel

        return vk_dict, nb_comparison
    def _get_all_edge_kernels(self, g1, g2):
        nb_comparison = 0

        # Compute kernels between all pairs of edges, which is an extension of
        # the FCSP idea. It suits sparse graphs, which covers most of the
        # cases we went through. For dense graphs, this would be slow.
        ek_dict = {}  # dict of edge kernels
        if len(self._edge_labels) > 0:
            # edge symb and non-symb labeled
            if len(self._edge_attrs) > 0:
                ke = self._edge_kernels['mix']
                for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                    e1_labels = [e1[2][el] for el in self._edge_labels]
                    e2_labels = [e2[2][el] for el in self._edge_labels]
                    e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
                    e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
                    ek_temp = ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
                    nb_comparison += 1
            # edge symb labeled
            else:
                ke = self._edge_kernels['symb']
                for e1 in g1.edges(data=True):
                    for e2 in g2.edges(data=True):
                        e1_labels = [e1[2][el] for el in self._edge_labels]
                        e2_labels = [e2[2][el] for el in self._edge_labels]
                        ek_temp = ke(e1_labels, e2_labels)
                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
                        nb_comparison += 1
        else:
            # edge non-symb labeled
            if len(self._edge_attrs) > 0:
                ke = self._edge_kernels['nsymb']
                for e1 in g1.edges(data=True):
                    for e2 in g2.edges(data=True):
                        e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
                        e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
                        ek_temp = ke(e1_attrs, e2_attrs)
                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
                        nb_comparison += 1
            # edge unlabeled
            else:
                pass

        return ek_dict, nb_comparison
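

# ---------------------------------------------------------------------------
# A back-of-the-envelope illustration (a sketch under simplifying assumptions,
# not a measurement): for the shortest-path kernel above, FCSP precomputes one
# vertex kernel per node pair (|V1| * |V2| evaluations), while the naive
# variant evaluates vertex kernels on demand, costing 4 evaluations per pair
# of matching-cost edges in the undirected case (2 if directed). Assuming two
# undirected, node-labeled graphs whose edge costs all match:

def naive_vs_fcsp_comparisons(nb_nodes1, nb_nodes2, nb_edges1, nb_edges2):
    """Mirror the counters of SPSpace._sp_do_fcsp / _sp_do_naive."""
    fcsp = nb_nodes1 * nb_nodes2        # one evaluation per node pair
    naive = 4 * nb_edges1 * nb_edges2   # four evaluations per matching edge pair
    return fcsp, naive


# E.g. two graphs with 20 nodes and 25 edges each give (400, 2500): FCSP makes
# far fewer vertex-kernel calls, at the price of holding the |V1| * |V2| entry
# vk_dict in memory -- the time/space trade-off these scripts measure.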