@@ -10,6 +10,7 @@ This script compares the results with and without FCSP.
from gklearn.dataset import Dataset
from gklearn.utils import get_graph_kernel_by_name
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from gklearn.experiments import DATASET_ROOT
import functools
import os
import pickle
@@ -17,50 +18,77 @@ import sys
import logging
def run_all(fcsp):
	save_dir = 'outputs/' + ('fcsp' if fcsp else 'naive') + '/'
	os.makedirs(save_dir, exist_ok=True)
# def run_all(fcsp):
# 	from sklearn.model_selection import ParameterGrid

# 	Dataset_List = [
# 		'Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
# 		'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
# 		'Letter-high', 'Letter-med', 'Letter-low',
# 		'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
# 		'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
# 		'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
# 		'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
# 		'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
# 		'Mutagenicity', 'REDDIT-BINARY']

# 	Kernel_List = ['ShortestPath', 'StructuralSP']

# 	task_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})

# 	for task in list(task_grid):
	from sklearn.model_selection import ParameterGrid
# 		save_file_suffix = '.' + task['kernel'] + '.' + task['dataset']
# 		file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
# 		if not os.path.isfile(file_name):
# 			print()
# 			print((task['kernel'], task['dataset']))
	Dataset_List = [
		'Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
		'PAH_unlabeled', 'PAH', 'MUTAG', 'Letter-high', 'Letter-med', 'Letter-low',
		'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
		'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
		'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
		'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
		'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
		'Mutagenicity', 'REDDIT-BINARY']
# 			try:
# 				gram_matrix, run_time = compute(task['kernel'], task['dataset'], fcsp)
	Kernel_List = ['ShortestPath', 'StructuralSP']
# 			except Exception as exp:
# 				print('An exception occurred when running this experiment:')
# 				LOG_FILENAME = save_dir + 'error.txt'
# 				logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
# 				logging.exception('\n--------------' + save_file_suffix + '------------------')
# 				print(repr(exp))
# 			else:
# 				save_file_suffix = '.' + task['kernel'] + task['dataset']
	work_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})
# 				with open(file_name, 'wb') as f:
# 					pickle.dump(run_time, f)

	for work in list(work_grid):
		save_file_suffix = '.' + work['kernel'] + '.' + work['dataset']
		file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
		if not os.path.isfile(file_name):
			print()
			print((work['kernel'], work['dataset']))
			try:
				gram_matrix, run_time = run_work(work['kernel'], work['dataset'], fcsp)
			except Exception as exp:
				print('An exception occurred when running this experiment:')
				LOG_FILENAME = save_dir + 'error.txt'
				logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
				logging.exception(save_file_suffix)
				print(repr(exp))
def run_task(kernel_name, ds_name, fcsp):
	save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
	file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
	if not os.path.isfile(file_name):
		print()
		print((kernel_name, ds_name, str(fcsp)))
		try:
			gram_matrix, run_time = compute(kernel_name, ds_name, fcsp)
		except Exception as exp:
			print('An exception occurred when running this experiment:')
			LOG_FILENAME = os.path.join(save_dir, 'error' + save_file_suffix + '.txt')
			logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
			logging.exception('\n--------------' + save_file_suffix + '------------------')
			print(repr(exp))
		else:
			with open(file_name, 'wb') as f:
				pickle.dump(run_time, f)


def run_work(kernel_name, ds_name, fcsp):
	dataset = Dataset(ds_name, verbose=True)


def compute(kernel_name, ds_name, fcsp):
	dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)

	if kernel_name == 'ShortestPath':
		dataset.trim_dataset(edge_required=True)

	mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
	node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
@@ -87,8 +115,15 @@ def run_work(kernel_name, ds_name, fcsp):
if __name__ == '__main__':
	if len(sys.argv) > 1:
		fcsp = (sys.argv[1] == 'True')
		kernel_name = sys.argv[1]
		ds_name = sys.argv[2]
		fcsp = (sys.argv[3] == 'True')
	else:
		kernel_name = 'ShortestPath'
		ds_name = 'Acyclic'
		fcsp = True
	run_all(fcsp)

	save_dir = 'outputs/'
	os.makedirs(save_dir, exist_ok=True)

	run_task(kernel_name, ds_name, fcsp)
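
# --- Illustrative sketch (editor's addition): how the 'mix' kernel used in
# compute() above composes deltakernel and gaussiankernel. The toy functions
# below are stand-ins with assumed behaviour, not gklearn's implementations.
import math

def _toy_delta(x, y):
	# 1 if the symbolic labels match, 0 otherwise.
	return 1.0 if x == y else 0.0

def _toy_gaussian(x, y, gamma=1.0):
	# RBF kernel on numeric attribute vectors.
	return math.exp(-gamma * sum((a - b) ** 2 for a, b in zip(x, y)))

def _toy_kernelproduct(k_symb, k_nsymb, l1, l2, a1, a2):
	# Product of a kernel on symbolic labels and a kernel on attributes.
	return k_symb(l1, l2) * k_nsymb(a1, a2)

# e.g. _toy_kernelproduct(_toy_delta, _toy_gaussian, 'C', 'C', [0.1], [0.3])
# gives 1.0 * exp(-0.04) ~= 0.96, while differing labels give 0.0.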

@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 2 17:41:54 2020

@author: ljia

This script compares the results with and without FCSP.
"""
from gklearn.dataset import Dataset
from shortest_path import SPSpace
from structural_sp import SSPSpace
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from gklearn.experiments import DATASET_ROOT
import functools
import os
import pickle
import sys
import logging


def run_task(kernel_name, ds_name, fcsp):
	save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
	file_name = os.path.join(save_dir, 'space' + save_file_suffix + '.pkl')

	# Return if the task is already completed.
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			data = pickle.load(f)
			if data['completed']:
				return

	print()
	print((kernel_name, ds_name, str(fcsp)))

	try:
		gram_matrix, run_time = compute(kernel_name, ds_name, fcsp, file_name)
	except Exception as exp:
		print('An exception occurred when running this experiment:')
		LOG_FILENAME = os.path.join(save_dir, 'error.space' + save_file_suffix + '.txt')
		logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
		logging.exception('\n--------------' + save_file_suffix + '------------------')
		print(repr(exp))
# 	else:
# 		with open(file_name, 'wb') as f:
# 			pickle.dump(run_time, f)
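
# --- Illustrative sketch (editor's addition): the resume pattern used above,
# reduced to its core. A pickle file carries partial results plus a
# 'completed' flag; the loop checkpoints periodically, and a finished run is
# never redone. Names here are hypothetical, not part of the original script.
def _resumable_sum(file_name, values, save_every=600):
	import os, pickle, time
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			state = pickle.load(f)
		if state['completed']:
			return state['total']
	else:
		state = {'total': 0, 'i': -1, 'completed': False}
	t0 = time.time()
	for i, v in enumerate(values):
		if i > state['i']:  # skip work finished in a previous run
			state['total'] += v
			state['i'] = i
			if time.time() - t0 > save_every:
				with open(file_name, 'wb') as f:
					pickle.dump(state, f)
				t0 = time.time()
	state['completed'] = True
	with open(file_name, 'wb') as f:
		pickle.dump(state, f)
	return state['total']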

def compute(kernel_name, ds_name, fcsp, file_name):
	dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)

	if kernel_name == 'ShortestPath':
		dataset.trim_dataset(edge_required=True)
# 		dataset.cut_graphs(range(0, 10))
		kernel_class = SPSpace
	else:
# 		dataset.cut_graphs(range(0, 10))
		kernel_class = SSPSpace

	mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
	node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
	edge_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}

	graph_kernel = kernel_class(
		name=kernel_name,
		node_labels=dataset.node_labels,
		edge_labels=dataset.edge_labels,
		node_attrs=dataset.node_attrs,
		edge_attrs=dataset.edge_attrs,
		ds_infos=dataset.get_dataset_infos(keys=['directed']),
		fcsp=fcsp,
		compute_method='naive',
		node_kernels=node_kernels,
		edge_kernels=edge_kernels,
		file_name=file_name)

	gram_matrix, run_time = graph_kernel.compute(
		dataset.graphs,
		parallel=None,
		normalize=False,
		verbose=2)

	return gram_matrix, run_time


if __name__ == '__main__':
	if len(sys.argv) > 1:
		kernel_name = sys.argv[1]
		ds_name = sys.argv[2]
		fcsp = (sys.argv[3] == 'True')
	else:
		kernel_name = 'StructuralSP'
		ds_name = 'Fingerprint'
		fcsp = True

	save_dir = 'outputs/'
	os.makedirs(save_dir, exist_ok=True)

	run_task(kernel_name, ds_name, fcsp)

@@ -10,27 +10,60 @@ import os
import re


def get_job_script(param):
OUT_TIME_LIST = [
	('ShortestPath', 'ENZYMES', 'False'),
	('StructuralSP', 'ENZYMES', 'True'),
	('StructuralSP', 'ENZYMES', 'False'),
	('StructuralSP', 'AIDS', 'False'),
	('ShortestPath', 'NCI1', 'False'),
	('StructuralSP', 'NCI1', 'True'),
	('StructuralSP', 'NCI1', 'False'),
	('ShortestPath', 'NCI109', 'False'),
	('StructuralSP', 'NCI109', 'True'),
	('StructuralSP', 'NCI109', 'False'),
	('ShortestPath', 'DD', 'True'),
	('ShortestPath', 'DD', 'False'),
	('StructuralSP', 'BZR', 'False'),
	('ShortestPath', 'COX2', 'False'),
	('StructuralSP', 'COX2', 'False'),
	('ShortestPath', 'DHFR', 'False'),
]

OUT_MEM_LIST = [
	('StructuralSP', 'PROTEINS', 'True'),
	('StructuralSP', 'PROTEINS', 'False'),
	('StructuralSP', 'PROTEINS_full', 'True'),
	('StructuralSP', 'PROTEINS_full', 'False'),
	('ShortestPath', 'REDDIT-BINARY', 'True'),
]

MISS_LABEL_LIST = [
	('StructuralSP', 'GREC', 'True'),
	('StructuralSP', 'GREC', 'False'),
	('StructuralSP', 'Web', 'True'),
	('StructuralSP', 'Web', 'False'),
]

def get_job_script(kernel, dataset, fcsp):
	script = r"""
#!/bin/bash

#SBATCH --exclusive
#SBATCH --job-name="fcsp.""" + param + r""""
#SBATCH --partition=long
#SBATCH --job-name="fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""""
#SBATCH --partition=tlong
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_fcsp.""" + param + r""".txt"
#SBATCH --error="errors/error_fcsp.""" + param + r""".txt"
#SBATCH --output="outputs/output_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#SBATCH --error="errors/error_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=100:00:00
#SBATCH --mem-per-cpu=4000
#SBATCH --time=300:00:00
##SBATCH --mem-per-cpu=4000
#SBATCH --mem=40000

srun hostname
cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
srun python3 compare_fcsp.py """ + kernel + r" " + dataset + r" " + fcsp

	script = script.strip()
	script = re.sub('\n\t+', '\n', script)
	script = re.sub('\n +', '\n', script)
@@ -38,15 +71,75 @@ srun python3 compare_fcsp.py """ + param

	return script
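
# --- Illustrative sketch (editor's addition): an alternative to the
# os.system('sbatch <<EOF ...') heredoc used below; feeding the generated
# script to sbatch on stdin avoids shell quoting issues. Assumes sbatch is on
# PATH; the helper name is hypothetical.
def _submit(script):
	import subprocess
	proc = subprocess.run(['sbatch'], input=script, text=True,
						  capture_output=True)
	# On success sbatch prints e.g. "Submitted batch job 123456".
	return proc.stdout.strip()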

def check_task_status(save_dir, *params):
	str_task_id = '.' + '.'.join(params)

	# Check if the task is in the out-of-memory, out-of-time, or missing-label lists.
	if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
		return True

	# Check if the task is running or queued in Slurm.
	command = 'squeue --user ljia02 --name "fcsp' + str_task_id + '" --format "%.2t" --noheader'
	stream = os.popen(command)
	output = stream.readlines()
	if len(output) > 0:
		return True

	# Check if the results are already computed.
	file_name = os.path.join(save_dir, 'run_time' + str_task_id + '.pkl')
	if os.path.isfile(file_name):
		return True

	return False

if __name__ == '__main__':
	save_dir = 'outputs/'
	os.makedirs(save_dir, exist_ok=True)
	os.makedirs('outputs/', exist_ok=True)
	os.makedirs('errors/', exist_ok=True)

	param_list = ['True', 'False']
	for param in param_list[:]:
		job_script = get_job_script(param)
		command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# 		print(command)
		os.system(command)

	from sklearn.model_selection import ParameterGrid

	Dataset_List = [
		'Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
		'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
		'Letter-high', 'Letter-med', 'Letter-low',
		'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
		# new: not so large.
		'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
		'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
		'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
		'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
		# new: large.
		'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
		'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
		'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
		'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
		'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
		'COLORS-3', 'DBLP_v1', 'MSRC_9', 'MSRC_21', 'MSRC_21C',
		'COLLAB', 'COIL-DEL',
		'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
		'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K',
		'REDDIT-MULTI-12K']
	Kernel_List = ['ShortestPath', 'StructuralSP']
	fcsp_list = ['True', 'False']
	task_grid = ParameterGrid({'kernel': Kernel_List[:],
							   'dataset': Dataset_List[:],
							   'fcsp': fcsp_list[:]})

	from tqdm import tqdm
	for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):
		if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
			job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
			command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# 			print(command)
			os.system(command)
# 			os.popen(command)
# 			output = stream.readlines()
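
# --- Illustrative note (editor's addition): ParameterGrid expands the dict
# above into the full Cartesian product, so the loop submits at most
# len(Kernel_List) * len(Dataset_List) * len(fcsp_list) jobs. A tiny example:
# 	list(ParameterGrid({'kernel': ['ShortestPath'], 'fcsp': ['True', 'False']}))
# 	-> [{'fcsp': 'True', 'kernel': 'ShortestPath'},
# 	    {'fcsp': 'False', 'kernel': 'ShortestPath'}]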

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 14 11:49:43 2020

@author: ljia
"""
import os
import re
import pickle


OUT_TIME_LIST = []
OUT_MEM_LIST = [
	('ShortestPath', 'REDDIT-BINARY', 'True'),
	('ShortestPath', 'REDDIT-BINARY', 'False'),
	('ShortestPath', 'DD', 'True'),
	('ShortestPath', 'DD', 'False'),
	('ShortestPath', 'MCF-7', 'True'),
	('ShortestPath', 'MCF-7', 'False'),
	('StructuralSP', 'MCF-7', 'True'),
	('StructuralSP', 'MCF-7', 'False'),
	('ShortestPath', 'MCF-7H', 'True'),
	('ShortestPath', 'MCF-7H', 'False'),
	('StructuralSP', 'MCF-7H', 'True'),
	('StructuralSP', 'MCF-7H', 'False'),
	('ShortestPath', 'MOLT-4', 'True'),
	('ShortestPath', 'MOLT-4', 'False'),
	('StructuralSP', 'MOLT-4', 'True'),
	('StructuralSP', 'MOLT-4', 'False'),
	('ShortestPath', 'MOLT-4H', 'True'),
	('ShortestPath', 'MOLT-4H', 'False'),
	('StructuralSP', 'MOLT-4H', 'True'),
	('StructuralSP', 'MOLT-4H', 'False'),
	('ShortestPath', 'P388', 'True'),
	('ShortestPath', 'P388H', 'True'),
	('ShortestPath', 'NCI-H23', 'True'),
	('ShortestPath', 'NCI-H23', 'False'),
	('StructuralSP', 'NCI-H23', 'True'),
	('StructuralSP', 'NCI-H23', 'False'),
	('ShortestPath', 'NCI-H23H', 'True'),
	('ShortestPath', 'NCI-H23H', 'False'),
	('StructuralSP', 'NCI-H23H', 'True'),
	('StructuralSP', 'NCI-H23H', 'False'),
	('ShortestPath', 'OVCAR-8', 'True'),
	('ShortestPath', 'OVCAR-8', 'False'),
	('StructuralSP', 'OVCAR-8', 'True'),
	('StructuralSP', 'OVCAR-8', 'False'),
	('ShortestPath', 'OVCAR-8H', 'False'),
	('StructuralSP', 'OVCAR-8H', 'False'),
	('ShortestPath', 'SN12C', 'True'),
	('ShortestPath', 'SN12C', 'False'),
	('StructuralSP', 'SN12C', 'True'),
	('StructuralSP', 'SN12C', 'False'),
	('ShortestPath', 'SN12CH', 'True'),
	('ShortestPath', 'SN12CH', 'False'),
	('ShortestPath', 'SF-295', 'True'),
	('ShortestPath', 'SF-295', 'False'),
	('StructuralSP', 'SF-295', 'True'),
	('StructuralSP', 'SF-295', 'False'),
	('ShortestPath', 'SF-295H', 'False'),
	('StructuralSP', 'SF-295H', 'False'),
	('ShortestPath', 'SW-620', 'True'),
	('ShortestPath', 'SW-620', 'False'),
	('StructuralSP', 'SW-620', 'True'),
	('StructuralSP', 'SW-620', 'False'),
	('ShortestPath', 'SW-620H', 'False'),
	('StructuralSP', 'SW-620H', 'False'),
	('ShortestPath', 'TRIANGLES', 'False'),
	('StructuralSP', 'TRIANGLES', 'False'),
	('ShortestPath', 'Yeast', 'True'),
	('ShortestPath', 'Yeast', 'False'),
	('StructuralSP', 'Yeast', 'True'),
	('StructuralSP', 'Yeast', 'False'),
	('ShortestPath', 'YeastH', 'True'),
	('ShortestPath', 'FRANKENSTEIN', 'True'),
	('ShortestPath', 'FRANKENSTEIN', 'False'),
	('StructuralSP', 'FRANKENSTEIN', 'True'),
	('StructuralSP', 'FRANKENSTEIN', 'False'),
	('StructuralSP', 'SN12CH', 'True'),
	('StructuralSP', 'SN12CH', 'False'),
	('ShortestPath', 'UACC257', 'True'),
	('ShortestPath', 'UACC257', 'False'),
	('StructuralSP', 'UACC257', 'True'),
	('StructuralSP', 'UACC257', 'False'),
	('ShortestPath', 'UACC257H', 'True'),
	('ShortestPath', 'UACC257H', 'False'),
	('StructuralSP', 'UACC257H', 'True'),
	('StructuralSP', 'UACC257H', 'False'),
	('ShortestPath', 'PC-3', 'True'),
	('ShortestPath', 'PC-3', 'False'),
	('StructuralSP', 'PC-3', 'True'),
	('StructuralSP', 'PC-3', 'False'),
	('ShortestPath', 'PC-3H', 'True'),
	('ShortestPath', 'PC-3H', 'False'),
	('StructuralSP', 'PC-3H', 'True'),
	('StructuralSP', 'PC-3H', 'False'),
	('ShortestPath', 'DBLP_v1', 'False'),
	('StructuralSP', 'DBLP_v1', 'True'),
	('ShortestPath', 'REDDIT-MULTI-12K', 'False'),
	('StructuralSP', 'REDDIT-MULTI-12K', 'False'),
	('ShortestPath', 'TWITTER-Real-Graph-Partial', 'True'),
	('ShortestPath', 'TWITTER-Real-Graph-Partial', 'False'),
	('StructuralSP', 'TWITTER-Real-Graph-Partial', 'True'),
	('StructuralSP', 'TWITTER-Real-Graph-Partial', 'False'),
]

MISS_LABEL_LIST = [
	('StructuralSP', 'GREC', 'True'),
	('StructuralSP', 'GREC', 'False'),
	('StructuralSP', 'Web', 'True'),
	('StructuralSP', 'Web', 'False'),
]

def get_job_script(kernel, dataset, fcsp):
# 	if (kernel, dataset, fcsp) in OUT_MEM_LIST:
# 		mem = '2560000'
# 	else:
	mem = '4000'

	script = r"""
#!/bin/bash

#SBATCH --exclusive
#SBATCH --job-name="fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""""
#SBATCH --partition=court
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#SBATCH --error="errors/error_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=48:00:00
##SBATCH --mem-per-cpu=""" + mem + r"""
#SBATCH --mem=4000

srun hostname
cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
srun python3 compare_fcsp_space.py """ + kernel + r" " + dataset + r" " + fcsp

	script = script.strip()
	script = re.sub('\n\t+', '\n', script)
	script = re.sub('\n +', '\n', script)

	return script

def check_task_status(save_dir, *params):
	str_task_id = '.' + '.'.join(params)

	# Check if the task is in the out-of-memory, out-of-time, or missing-label lists.
	if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
		return True

	# Check if the task is running or queued in Slurm.
	command = 'squeue --user ljia02 --name "fcsp.space' + str_task_id + '" --format "%.2t" --noheader'
	stream = os.popen(command)
	output = stream.readlines()
	if len(output) > 0:
		return True

	# Check if the task is already computed.
	file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl')
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			data = pickle.load(f)
			if data['completed']:
				return True

	return False

if __name__ == '__main__':
	save_dir = 'outputs/'
	os.makedirs(save_dir, exist_ok=True)
	os.makedirs('outputs/', exist_ok=True)
	os.makedirs('errors/', exist_ok=True)

	from sklearn.model_selection import ParameterGrid

	Dataset_List = [
		'Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
		'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
		'Letter-high', 'Letter-med', 'Letter-low',
		'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
		# new: not so large.
		'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
		'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
		'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
		'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
		# new: large.
		'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
		'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
		'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
		'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
		'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
		'COLORS-3', 'DBLP_v1', 'MSRC_9', 'MSRC_21', 'MSRC_21C',
		'COLLAB', 'COIL-DEL',
		'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
		'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K',
		'REDDIT-MULTI-12K']
	Kernel_List = ['ShortestPath', 'StructuralSP']
	fcsp_list = ['True', 'False']
	task_grid = ParameterGrid({'kernel': Kernel_List[:],
							   'dataset': Dataset_List[:],
							   'fcsp': fcsp_list[:]})

	from tqdm import tqdm
	for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):
		if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
			job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
			command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# 			print(command)
			os.system(command)
# 			os.popen(command)
# 			output = stream.readlines()

@@ -0,0 +1,253 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 15:24:58 2020

@author: ljia

@references:

	[1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data
	Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
"""
import sys
from itertools import product
# from functools import partial
from gklearn.utils import get_iters
import numpy as np
from gklearn.utils.utils import getSPGraph
from gklearn.kernels import ShortestPath
import os
import pickle
from pympler import asizeof
import time
import networkx as nx

def load_results(file_name, fcsp):
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			return pickle.load(f)
	else:
		results = {'nb_comparison': [], 'i': -1, 'j': -1, 'completed': False}
		if fcsp:
			results['vk_dict_mem'] = []
		return results


def save_results(file_name, results):
	with open(file_name, 'wb') as f:
		pickle.dump(results, f)

def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
# 	asizeof.asized(obj, detail=1).format()
# 	return asizeof.asizeof(obj)
	key, val = next(iter(obj.items()))
# 	key = dict.iterkeys().next()
# 	key_mem = asizeof.asizeof(key)
	dict_flat = sys.getsizeof(obj)
	key_mem = 64

	if isinstance(val, float):
		val_mem = 24
		mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
	else:  # value is True or False
		mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

# 	print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')

	return mem
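
# --- Worked example (editor's addition): the constants above are rough
# 64-bit CPython sizes: ~64 bytes per 2-tuple key, ~24 per float value,
# ~28 per small-int node id, plus sys.getsizeof(dict) for the hash table
# itself; they approximate asizeof.asizeof(obj) without its runtime cost.
def _vk_memory_demo():
	# A dict of 6 float-valued entries between graphs with 2 and 3 nodes:
	d = {(i, j): 0.5 for i in range(2) for j in range(3)}
	# estimate = (64 + 24) * 6 + sys.getsizeof(d) + 28 * (2 + 3)
	return estimate_vk_memory(d, 2, 3)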

def compute_stats(file_name, results):
	del results['i']
	del results['j']
	results['nb_comparison'] = np.mean(results['nb_comparison'])
	results['completed'] = True
	if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
		results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
	save_results(file_name, results)

class SPSpace(ShortestPath):

	def __init__(self, **kwargs):
		super().__init__(**kwargs)
		self._file_name = kwargs.get('file_name')


# 	@profile
	def _compute_gm_series(self):
		self._all_graphs_have_edges(self._graphs)
		# get shortest path graph of each graph.
		iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
		self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]

		results = load_results(self._file_name, self._fcsp)

		# compute Gram matrix. It is left as zeros: this class only records
		# comparison counts and memory estimates, not kernel values.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

		from itertools import combinations_with_replacement
		itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
		len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
		iterator = get_iters(itr, desc='Computing kernels',
							 length=len_itr, file=sys.stdout, verbose=(self._verbose >= 2))

		time0 = time.time()
		for i, j in iterator:
			# Skip pairs already processed in a previous (interrupted) run.
			if i > results['i'] or (i == results['i'] and j > results['j']):
				data = self._sp_do_space(self._graphs[i], self._graphs[j])
				if self._fcsp:
					results['nb_comparison'].append(data[0])
					if data[1] != {}:
						results['vk_dict_mem'].append(estimate_vk_memory(data[1],
							nx.number_of_nodes(self._graphs[i]),
							nx.number_of_nodes(self._graphs[j])))
				else:
					results['nb_comparison'].append(data)

				results['i'] = i
				results['j'] = j

				# Save intermediate results every 10 minutes.
				time1 = time.time()
				if time1 - time0 > 600:
					save_results(self._file_name, results)
					time0 = time1

		compute_stats(self._file_name, results)

		return gram_matrix

	def _sp_do_space(self, g1, g2):
		if self._fcsp:  # @todo: it may be put outside the _sp_do().
			return self._sp_do_fcsp(g1, g2)
		else:
			return self._sp_do_naive(g1, g2)


	def _sp_do_fcsp(self, g1, g2):
		nb_comparison = 0

		# compute shortest path matrices first, method borrowed from FCSP.
		vk_dict = {}  # shortest path matrices dict
		if len(self._node_labels) > 0:  # @todo: it may be put outside the _sp_do().
			# node symb and non-symb labeled
			if len(self._node_attrs) > 0:
				kn = self._node_kernels['mix']
				for n1, n2 in product(
						g1.nodes(data=True), g2.nodes(data=True)):
					n1_labels = [n1[1][nl] for nl in self._node_labels]
					n2_labels = [n2[1][nl] for nl in self._node_labels]
					n1_attrs = [n1[1][na] for na in self._node_attrs]
					n2_attrs = [n2[1][na] for na in self._node_attrs]
					vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
					nb_comparison += 1
			# node symb labeled
			else:
				kn = self._node_kernels['symb']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_labels = [n1[1][nl] for nl in self._node_labels]
						n2_labels = [n2[1][nl] for nl in self._node_labels]
						vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
						nb_comparison += 1
		else:
			# node non-symb labeled
			if len(self._node_attrs) > 0:
				kn = self._node_kernels['nsymb']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_attrs = [n1[1][na] for na in self._node_attrs]
						n2_attrs = [n2[1][na] for na in self._node_attrs]
						vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
						nb_comparison += 1
			# node unlabeled
			else:
				for e1, e2 in product(
						g1.edges(data=True), g2.edges(data=True)):
					pass
# 					if e1[2]['cost'] == e2[2]['cost']:
# 						kernel += 1
# 					nb_comparison += 1

		return nb_comparison, vk_dict

# 		# compute graph kernels
# 		if self._ds_infos['directed']:
# 			for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# 				if e1[2]['cost'] == e2[2]['cost']:
# 					nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
# 					kn1 = nk11 * nk22
# 					kernel += kn1
# 		else:
# 			for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# 				if e1[2]['cost'] == e2[2]['cost']:
# 					# each edge walk is counted twice, starting from both its extreme nodes.
# 					nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
# 						e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
# 					kn1 = nk11 * nk22
# 					kn2 = nk12 * nk21
# 					kernel += kn1 + kn2
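
	# --- Illustrative sketch (editor's addition): the trade-off FCSP makes.
	# For labelled graphs, _sp_do_fcsp above performs |V1|*|V2| vertex-kernel
	# evaluations once, while the naive scheme (_sp_do_naive below)
	# re-evaluates up to 4 vertex kernels for every pair of matching edges.
	# A hypothetical counting helper, assuming undirected shortest-path
	# graphs whose edges carry the 'cost' attribute set by getSPGraph:
	@staticmethod
	def _fcsp_vs_naive_cost_demo(g1, g2):
		fcsp_cost = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
		naive_cost = sum(4 for e1 in g1.edges(data=True)
						 for e2 in g2.edges(data=True)
						 if e1[2]['cost'] == e2[2]['cost'])
		return fcsp_cost, naive_cost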

	def _sp_do_naive(self, g1, g2):
		nb_comparison = 0

		# Define the function to compute kernels between vertices in each condition.
		if len(self._node_labels) > 0:
			# node symb and non-symb labeled
			if len(self._node_attrs) > 0:
				def compute_vk(n1, n2):
					kn = self._node_kernels['mix']
					n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
					n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
					n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
					n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
					return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
			# node symb labeled
			else:
				def compute_vk(n1, n2):
					kn = self._node_kernels['symb']
					n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
					n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
					return kn(n1_labels, n2_labels)
		else:
			# node non-symb labeled
			if len(self._node_attrs) > 0:
				def compute_vk(n1, n2):
					kn = self._node_kernels['nsymb']
					n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
					n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
					return kn(n1_attrs, n2_attrs)
			# node unlabeled
			else:
# 				for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# 					if e1[2]['cost'] == e2[2]['cost']:
# 						kernel += 1
				return 0

		# compute graph kernels
		if self._ds_infos['directed']:
			for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
				if e1[2]['cost'] == e2[2]['cost']:
# 					nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1])
# 					kn1 = nk11 * nk22
# 					kernel += kn1
					# two vertex-kernel evaluations per matching edge pair.
					nb_comparison += 2
		else:
			for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
				if e1[2]['cost'] == e2[2]['cost']:
					# each edge walk is counted twice, starting from both its extreme nodes.
# 					nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk(
# 						e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1])
# 					kn1 = nk11 * nk22
# 					kn2 = nk12 * nk21
# 					kernel += kn1 + kn2
					# four vertex-kernel evaluations per matching edge pair.
					nb_comparison += 4

		return nb_comparison

@@ -0,0 +1,439 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 30 11:59:57 2020

@author: ljia

@references:

	[1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
	Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
"""
import sys
from itertools import product
from gklearn.utils import get_iters
import numpy as np
import time
import os, errno
import pickle
from pympler import asizeof
import networkx as nx
from gklearn.utils.utils import get_shortest_paths
from gklearn.kernels import StructuralSP


def load_splist(file_name):
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			return pickle.load(f)
	else:
		results_path = {'splist': [], 'i': -1, 'completed': False}
		return results_path

def load_results(file_name, fcsp):
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			return pickle.load(f)
	else:
		results = {'nb_v_comparison': [], 'nb_e_comparison': [], 'i': -1, 'j': -1, 'completed': False}
		if fcsp:
			results['vk_dict_mem'] = []
			results['ek_dict_mem'] = []
		return results


def save_results(file_name, results):
	with open(file_name, 'wb') as f:
		pickle.dump(results, f)


def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
# 	asizeof.asized(obj, detail=1).format()
# 	return asizeof.asizeof(obj)
	key, val = next(iter(obj.items()))
# 	key = dict.iterkeys().next()
# 	key_mem = asizeof.asizeof(key)
	dict_flat = sys.getsizeof(obj)
	key_mem = 64

	if isinstance(val, float):
		val_mem = 24
		mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
	else:  # value is True or False
		mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

# 	print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')

	return mem

def estimate_ek_memory(obj, nb_nodes1, nb_nodes2):
# 	asizeof.asized(obj, detail=1).format()
# 	return asizeof.asizeof(obj)
	key, val = next(iter(obj.items()))
# 	key = dict.iterkeys().next()
# 	key_mem = asizeof.asizeof(key)
	dict_flat = sys.getsizeof(obj)
	key_mem = 192

	if isinstance(val, float):
		val_mem = 24
		mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
	else:  # value is True or False
		mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

# 	print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')

	return mem

def compute_stats(file_name, results, splist):
	del results['i']
	del results['j']
	results['nb_v_comparison'] = np.mean(results['nb_v_comparison'])
# 	if len(results['nb_e_comparison']) > 0:
	results['nb_e_comparison'] = np.mean(results['nb_e_comparison'])
	results['completed'] = True
	if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
		results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
	if 'ek_dict_mem' in results and len(results['ek_dict_mem']) > 0:
		results['ek_dict_mem'] = np.mean(results['ek_dict_mem'])
	results['nb_sp_ave'] = np.mean([len(ps) for ps in splist])
	results['sp_len_ave'] = np.mean([np.mean([len(p) for p in ps]) for ps in splist])
	results['sp_mem_all'] = asizeof.asizeof(splist)
	save_results(file_name, results)

class SSPSpace(StructuralSP):

	def __init__(self, **kwargs):
		super().__init__(**kwargs)
		self._file_name = kwargs.get('file_name')


# 	@profile
	def _compute_gm_series(self):
		# get shortest paths of each graph in the graphs.
		fn_paths = os.path.splitext(self._file_name)[0] + '.paths.pkl'
		results_path = load_splist(fn_paths)

		if not results_path['completed']:
			iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
			if self._compute_method == 'trie':
				for g in iterator:
					results_path['splist'].append(self._get_sps_as_trie(g))
			else:
				time0 = time.time()
				for i, g in enumerate(iterator):
					# Skip graphs whose shortest paths were computed in a previous run.
					if i > results_path['i']:
						results_path['splist'].append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))
						results_path['i'] = i

						# Save intermediate results every 10 minutes.
						time1 = time.time()
						if time1 - time0 > 600:
							save_results(fn_paths, results_path)
							time0 = time1

			del results_path['i']
			results_path['completed'] = True
			save_results(fn_paths, results_path)

		splist = results_path['splist']

		results = load_results(self._file_name, self._fcsp)

		# compute Gram matrix. It is left as zeros: this class only records
		# comparison counts and memory estimates, not kernel values.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

		from itertools import combinations_with_replacement
		itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
		len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
		iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
							 length=len_itr, verbose=(self._verbose >= 2))

		if self._compute_method == 'trie':
			for i, j in iterator:
				kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j])
				gram_matrix[i][j] = kernel
				gram_matrix[j][i] = kernel
		else:
			time0 = time.time()
			for i, j in iterator:
				if i > results['i'] or (i == results['i'] and j > results['j']):
					data = self._ssp_do_naive_space(self._graphs[i], self._graphs[j], splist[i], splist[j])
					results['nb_v_comparison'].append(data[0])
					results['nb_e_comparison'].append(data[1])
					if self._fcsp:
						if data[2] != {}:
							results['vk_dict_mem'].append(estimate_vk_memory(data[2],
								nx.number_of_nodes(self._graphs[i]),
								nx.number_of_nodes(self._graphs[j])))
						if data[3] != {}:
							results['ek_dict_mem'].append(estimate_ek_memory(data[3],
								nx.number_of_nodes(self._graphs[i]),
								nx.number_of_nodes(self._graphs[j])))

					results['i'] = i
					results['j'] = j

					# Save intermediate results every 10 minutes.
					time1 = time.time()
					if time1 - time0 > 600:
						save_results(self._file_name, results)
						time0 = time1

		compute_stats(self._file_name, results, splist)

		# @todo: the path file may not be removed if the program stops exactly here.
		try:
			os.remove(fn_paths)
		except OSError as e:
			if e.errno != errno.ENOENT:
				raise

		return gram_matrix

	def _ssp_do_naive_space(self, g1, g2, spl1, spl2):
		if self._fcsp:  # @todo: it may be put outside the _sp_do().
			return self._sp_do_naive_fcsp(g1, g2, spl1, spl2)
		else:
			return self._sp_do_naive_naive(g1, g2, spl1, spl2)


	def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2):
		# First, compute shortest path matrices, method borrowed from FCSP.
		vk_dict, nb_v_comparison = self._get_all_node_kernels(g1, g2)

		# Then, compute kernels between all pairs of edges, an extension of the
		# FCSP idea. It suits sparse graphs, which covers most of the cases we
		# went through; for dense graphs it would be slow.
		ek_dict, nb_e_comparison = self._get_all_edge_kernels(g1, g2)

		return nb_v_comparison, nb_e_comparison, vk_dict, ek_dict

	def _sp_do_naive_naive(self, g1, g2, spl1, spl2):
		nb_v_comparison = 0
		nb_e_comparison = 0

		# Define the function to compute kernels between vertices in each condition.
		if len(self._node_labels) > 0:
			# node symb and non-symb labeled
			if len(self._node_attrs) > 0:
				def compute_vk(n1, n2):
					kn = self._node_kernels['mix']
					n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
					n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
					n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
					n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
					return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
			# node symb labeled
			else:
				def compute_vk(n1, n2):
					kn = self._node_kernels['symb']
					n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
					n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
					return kn(n1_labels, n2_labels)
		else:
			# node non-symb labeled
			if len(self._node_attrs) > 0:
				def compute_vk(n1, n2):
					kn = self._node_kernels['nsymb']
					n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
					n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
					return kn(n1_attrs, n2_attrs)
# 			# node unlabeled
# 			else:
# 				for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# 					if e1[2]['cost'] == e2[2]['cost']:
# 						kernel += 1
# 				return kernel

		# Define the function to compute kernels between edges in each condition.
		if len(self._edge_labels) > 0:
			# edge symb and non-symb labeled
			if len(self._edge_attrs) > 0:
				def compute_ek(e1, e2):
					ke = self._edge_kernels['mix']
					e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
					e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
					e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
					e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
					return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
			# edge symb labeled
			else:
				def compute_ek(e1, e2):
					ke = self._edge_kernels['symb']
					e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
					e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
					return ke(e1_labels, e2_labels)
		else:
			# edge non-symb labeled
			if len(self._edge_attrs) > 0:
				def compute_ek(e1, e2):
					ke = self._edge_kernels['nsymb']
					e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
					e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
					return ke(e1_attrs, e2_attrs)

		# compute graph kernels
		if len(self._node_labels) > 0 or len(self._node_attrs) > 0:
			if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
				for p1, p2 in product(spl1, spl2):
					if len(p1) == len(p2):
# 						nb_v_comparison = len(p1)
# 						nb_e_comparison = len(p1) - 1
						kpath = compute_vk(p1[0], p2[0])
						nb_v_comparison += 1
						if kpath:
							for idx in range(1, len(p1)):
								kpath *= compute_vk(p1[idx], p2[idx]) * \
									compute_ek((p1[idx - 1], p1[idx]),
											   (p2[idx - 1], p2[idx]))
								nb_v_comparison += 1
								nb_e_comparison += 1
								if not kpath:
									break
# 							kernel += kpath  # add up kernels of all paths
			else:
				for p1, p2 in product(spl1, spl2):
					if len(p1) == len(p2):
						kpath = compute_vk(p1[0], p2[0])
						nb_v_comparison += 1
						if kpath:
							for idx in range(1, len(p1)):
								kpath *= compute_vk(p1[idx], p2[idx])
								nb_v_comparison += 1
								if not kpath:
									break
# 							kernel += kpath  # add up kernels of all paths
		else:
			if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
				for p1, p2 in product(spl1, spl2):
					if len(p1) == len(p2):
						if len(p1) == 0:
							pass
						else:
							kpath = 1
							for idx in range(0, len(p1) - 1):
								kpath *= compute_ek((p1[idx], p1[idx + 1]),
													(p2[idx], p2[idx + 1]))
								nb_e_comparison += 1
								if not kpath:
									break
			else:
				pass
# 				for p1, p2 in product(spl1, spl2):
# 					if len(p1) == len(p2):
# 						kernel += 1

# 		try:
# 			kernel = kernel / (len(spl1) * len(spl2))  # Compute mean average
# 		except ZeroDivisionError:
# 			print(spl1, spl2)
# 			print(g1.nodes(data=True))
# 			print(g1.edges(data=True))
# 			raise Exception

		return nb_v_comparison, nb_e_comparison
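
	# --- Illustrative sketch (editor's addition): what one path-pair term
	# above computes. For equal-length paths the kernel is the product of
	# vertex kernels interleaved with edge kernels, short-circuiting once a
	# factor is zero. vk/ek are hypothetical callables standing in for
	# compute_vk/compute_ek.
	@staticmethod
	def _path_pair_kernel_demo(p1, p2, vk, ek):
		if len(p1) != len(p2):
			return 0
		kpath = vk(p1[0], p2[0])
		for idx in range(1, len(p1)):
			if not kpath:
				break  # a zero factor makes the whole product zero
			kpath *= vk(p1[idx], p2[idx]) * ek((p1[idx - 1], p1[idx]),
											   (p2[idx - 1], p2[idx]))
		return kpath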

	def _get_all_node_kernels(self, g1, g2):
		nb_comparison = 0

		vk_dict = {}  # shortest path matrices dict
		if len(self._node_labels) > 0:
			# node symb and non-symb labeled
			if len(self._node_attrs) > 0:
				kn = self._node_kernels['mix']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_labels = [n1[1][nl] for nl in self._node_labels]
						n2_labels = [n2[1][nl] for nl in self._node_labels]
						n1_attrs = [n1[1][na] for na in self._node_attrs]
						n2_attrs = [n2[1][na] for na in self._node_attrs]
						vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
						nb_comparison += 1
			# node symb labeled
			else:
				kn = self._node_kernels['symb']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_labels = [n1[1][nl] for nl in self._node_labels]
						n2_labels = [n2[1][nl] for nl in self._node_labels]
						vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
						nb_comparison += 1
		else:
			# node non-symb labeled
			if len(self._node_attrs) > 0:
				kn = self._node_kernels['nsymb']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_attrs = [n1[1][na] for na in self._node_attrs]
						n2_attrs = [n2[1][na] for na in self._node_attrs]
						vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
						nb_comparison += 1
			# node unlabeled
			else:
				pass  # @todo: add edge weights.
# 				for e1 in g1.edges(data=True):
# 					for e2 in g2.edges(data=True):
# 						if e1[2]['cost'] == e2[2]['cost']:
# 							kernel += 1
# 				return kernel

		return vk_dict, nb_comparison

	def _get_all_edge_kernels(self, g1, g2):
		nb_comparison = 0

		# compute kernels between all pairs of edges, an extension of the FCSP
		# idea. It suits sparse graphs, which covers most of the cases we went
		# through; for dense graphs it would be slow.
		ek_dict = {}  # dict of edge kernels
		if len(self._edge_labels) > 0:
			# edge symb and non-symb labeled
			if len(self._edge_attrs) > 0:
				ke = self._edge_kernels['mix']
				for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
					e1_labels = [e1[2][el] for el in self._edge_labels]
					e2_labels = [e2[2][el] for el in self._edge_labels]
					e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
					e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
					ek_temp = ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
					ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
					ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
					ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
					ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
					nb_comparison += 1
			# edge symb labeled
			else:
				ke = self._edge_kernels['symb']
				for e1 in g1.edges(data=True):
					for e2 in g2.edges(data=True):
						e1_labels = [e1[2][el] for el in self._edge_labels]
						e2_labels = [e2[2][el] for el in self._edge_labels]
						ek_temp = ke(e1_labels, e2_labels)
						ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
						nb_comparison += 1
		else:
			# edge non-symb labeled
			if len(self._edge_attrs) > 0:
				ke = self._edge_kernels['nsymb']
				for e1 in g1.edges(data=True):
					for e2 in g2.edges(data=True):
						e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
						e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
						ek_temp = ke(e1_attrs, e2_attrs)
						ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
						nb_comparison += 1
			# edge unlabeled
			else:
				pass

		return ek_dict, nb_comparison
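
	# --- Editor's note with a minimal sketch: for undirected graphs an edge
	# (u, v) may be traversed in either direction along a path, so each kernel
	# value is stored above under all four key orientations and the path loop
	# never has to normalise its lookup key. A hypothetical alternative that
	# stores one orientation and normalises at lookup time instead:
	@staticmethod
	def _lookup_ek_demo(ek_dict, e1, e2):
		for key in ((e1, e2),
					((e1[1], e1[0]), e2),
					(e1, (e2[1], e2[0])),
					((e1[1], e1[0]), (e2[1], e2[0]))):
			if key in ek_dict:
				return ek_dict[key]
		raise KeyError((e1, e2))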