#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 29 14:29:52 2020

@author: ljia
"""
import numpy as np
import time
import sys
from tqdm import tqdm
import multiprocessing
import networkx as nx
from multiprocessing import Pool
from functools import partial
from gklearn.preimage import PreimageGenerator
from gklearn.preimage.utils import compute_k_dis
from gklearn.utils import Timer
from gklearn.utils.utils import get_graph_kernel_by_name
# from gklearn.utils.dataset import Dataset


class RandomPreimageGenerator(PreimageGenerator):
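    """Generates a graph pre-image of a point in kernel space by random search.

    The target point phi is a linear combination (with weights alphas) of the
    embedded dataset graphs. Starting from the k nearest neighbors of phi in
    the dataset, the generator repeatedly applies random edge insertions and
    deletions, keeping any candidate whose distance to phi in kernel space
    does not increase.
    """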
    def __init__(self, dataset=None):
        PreimageGenerator.__init__(self, dataset=dataset)
        # arguments to set.
        self.__k = 5  # number of nearest neighbors of phi in D_N.
        self.__r_max = 10  # maximum number of consecutive iterations without improvement.
        self.__l = 500  # number of graphs generated for each graph in D_k U {g_i_hat}.
        self.__alphas = None  # weights of the linear combination of points in kernel space.
        self.__parallel = True
        self.__n_jobs = multiprocessing.cpu_count()
        self.__time_limit_in_sec = 0
        self.__max_itrs = 20
        # values to compute.
        self.__runtime_generate_preimage = None
        self.__runtime_total = None
        self.__preimage = None
        self.__best_from_dataset = None
        self.__k_dis_preimage = None
        self.__k_dis_dataset = None
        self.__itrs = 0
        self.__converged = False  # @todo
        self.__num_updates = 0
        # values that can be set or are to be computed.
        self.__gram_matrix_unnorm = None
        self.__runtime_precompute_gm = None

    def set_options(self, **kwargs):
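        """Set the options of the generator as keyword arguments.

        Recognized keys: kernel_options, graph_kernel, verbose, k, r_max, l,
        alphas, parallel, n_jobs, time_limit_in_sec, max_itrs,
        gram_matrix_unnorm and runtime_precompute_gm; unspecified keys keep
        the defaults from __init__.
        """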
        self._kernel_options = kwargs.get('kernel_options', {})
        self._graph_kernel = kwargs.get('graph_kernel', None)
        self._verbose = kwargs.get('verbose', 2)
        self.__k = kwargs.get('k', 5)
        self.__r_max = kwargs.get('r_max', 10)
        self.__l = kwargs.get('l', 500)
        self.__alphas = kwargs.get('alphas', None)
        self.__parallel = kwargs.get('parallel', True)
        self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
        self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
        self.__max_itrs = kwargs.get('max_itrs', 20)
        self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
        self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)

    def run(self):
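        """Run the pre-image generation: (1) compute or reuse the Gram matrix
        of the dataset, (2) find the k nearest neighbors of the target point
        phi in the dataset, and (3) iteratively generate random candidate
        graphs until a termination criterion is met.
        """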
        self._graph_kernel = get_graph_kernel_by_name(self._kernel_options['name'],
                                                      node_labels=self._dataset.node_labels,
                                                      edge_labels=self._dataset.edge_labels,
                                                      node_attrs=self._dataset.node_attrs,
                                                      edge_attrs=self._dataset.edge_attrs,
                                                      ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
                                                      kernel_options=self._kernel_options)

        # record start time.
        start = time.time()

        # 1. precompute gram matrix.
        if self.__gram_matrix_unnorm is None:
            gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
            self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm
            end_precompute_gm = time.time()
            self.__runtime_precompute_gm = end_precompute_gm - start
        else:
            if self.__runtime_precompute_gm is None:
                raise Exception('Parameter "runtime_precompute_gm" must be given when using a pre-computed Gram matrix.')
            self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm
            if self._kernel_options['normalize']:
                self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm))
            else:
                self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm)
            end_precompute_gm = time.time()
            # shift the start time backwards so that the given precompute time
            # is included in the total runtime.
            start -= self.__runtime_precompute_gm

        # 2. compute k nearest neighbors of phi in D_N.
        if self._verbose >= 2:
            print('\nstart computing k nearest neighbors of phi in D_N...\n')
        D_N = self._dataset.graphs
        if self.__alphas is None:
            self.__alphas = [1 / len(D_N)] * len(D_N)
        k_dis_list = []  # distance between g_star and each graph.
        # the squared distance in kernel space of a graph g to the combination
        # sum_i alpha_i * g_i is k(g, g) - 2 * sum_i alpha_i * k(g, g_i)
        # + sum_{i,j} alpha_i * alpha_j * k(g_i, g_j); term3 is the last,
        # graph-independent term.
        term3 = 0
        for i1, a1 in enumerate(self.__alphas):
            for i2, a2 in enumerate(self.__alphas):
                term3 += a1 * a2 * self._graph_kernel.gram_matrix[i1, i2]
        for idx in range(len(D_N)):
            k_dis_list.append(compute_k_dis(idx, range(0, len(D_N)), self.__alphas, self._graph_kernel.gram_matrix, term3=term3, withterm3=True))

        # sort.
        sort_idx = np.argsort(k_dis_list)
        dis_gs = [k_dis_list[idis] for idis in sort_idx[0:self.__k]]  # the k shortest distances.
        nb_best = len(np.argwhere(np.array(dis_gs) == dis_gs[0]).flatten().tolist())
        g0hat_list = [D_N[idx].copy() for idx in sort_idx[0:nb_best]]  # the nearest neighbors of phi in D_N.
        self.__best_from_dataset = g0hat_list[0]  # get the first best graph if there are multiple.
        self.__k_dis_dataset = dis_gs[0]

        if self.__k_dis_dataset == 0:  # get the exact pre-image.
            end_generate_preimage = time.time()
            self.__runtime_generate_preimage = end_generate_preimage - end_precompute_gm
            self.__runtime_total = end_generate_preimage - start
            self.__preimage = self.__best_from_dataset.copy()
            self.__k_dis_preimage = self.__k_dis_dataset
            if self._verbose:
                print()
                print('=============================================================================')
                print('The exact pre-image is found from the input dataset.')
                print('-----------------------------------------------------------------------------')
                print('Distance in kernel space for the best graph from dataset and for preimage:', self.__k_dis_dataset)
                print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
                print('Time to generate pre-images:', self.__runtime_generate_preimage)
                print('Total time:', self.__runtime_total)
                print('=============================================================================')
                print()
            return

        dhat = dis_gs[0]  # the nearest distance.
        Gk = [D_N[ig].copy() for ig in sort_idx[0:self.__k]]  # the k nearest neighbors.
        Gs_nearest = [nx.convert_node_labels_to_integers(g) for g in Gk]  # [g.copy() for g in Gk]

        # 3. start iterations.
        if self._verbose >= 2:
            print('starting iterations...')
        gihat_list = []
        dihat_list = []
        r = 0
        dis_of_each_itr = [dhat]
        if self.__parallel:
            # disable the kernel's own parallelism; this generator
            # parallelizes over the generated graphs instead.
            self._kernel_options['parallel'] = None
        self.__itrs = 0
        self.__num_updates = 0
        timer = Timer(self.__time_limit_in_sec)
        while not self.__termination_criterion_met(timer, self.__itrs, r):
            if self._verbose >= 2:
                print('\n- r =', r)
            found = False
            dis_bests = dis_gs + dihat_list

            # compute the numbers of edges to be inserted/deleted.
            # @todo: what if the log is negative? how to choose alpha (scalar)?
            fdgs_list = np.array(dis_bests)
            if np.min(fdgs_list) < 1:  # in case the log is negative.
                fdgs_list /= np.min(fdgs_list)
            fdgs_list = np.array([int(item) for item in np.ceil(np.log(fdgs_list))])
            if np.min(fdgs_list) < 1:  # in case the log is smaller than 1.
                fdgs_list += 1

            # expand the number of modifications to increase the possibility
            # of finding a better graph.
            nb_vpairs_list = [nx.number_of_nodes(g) * (nx.number_of_nodes(g) - 1) for g in (Gs_nearest + gihat_list)]
            nb_vpairs_min = np.min(nb_vpairs_list)
            idx_fdgs_max = np.argmax(fdgs_list)
            fdgs_max_old = fdgs_list[idx_fdgs_max]
            fdgs_max = fdgs_max_old
            # nb_modif = C(nb_vpairs_min, fdgs_max), the number of distinct
            # sets of node pairs that can be modified.
            nb_modif = 1
            for idx, nb in enumerate(range(nb_vpairs_min, nb_vpairs_min - fdgs_max, -1)):
                nb_modif *= nb / (fdgs_max - idx)
            while fdgs_max < nb_vpairs_min and nb_modif < self.__l:
                fdgs_max += 1
                nb_modif *= (nb_vpairs_min - fdgs_max + 1) / fdgs_max
            nb_increase = int(fdgs_max - fdgs_max_old)
            if nb_increase > 0:
                fdgs_list += nb_increase

            gnew = None
            for ig, gs in enumerate(Gs_nearest + gihat_list):
                if self._verbose >= 2:
                    print('-- computing graph', ig + 1, 'out of', len(Gs_nearest) + len(gihat_list))
                gnew_ig, dhat, found = self.__generate_l_graphs(gs, fdgs_list[ig], dhat, ig, found, term3)
                if gnew_ig is not None:  # keep the latest graph that improved the distance.
                    gnew = gnew_ig

            if found:
                r = 0
                gihat_list = [gnew]
                dihat_list = [dhat]
            else:
                r += 1

            dis_of_each_itr.append(dhat)
            self.__itrs += 1
            if self._verbose >= 2:
                print('Total number of iterations is', self.__itrs, '.')
                print('The preimage is updated', self.__num_updates, 'times.')
                print('The shortest distances for previous iterations are', dis_of_each_itr, '.')

        # get results and print.
        end_generate_preimage = time.time()
        self.__runtime_generate_preimage = end_generate_preimage - end_precompute_gm
        self.__runtime_total = end_generate_preimage - start
        self.__preimage = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0])
        self.__k_dis_preimage = dhat
        if self._verbose:
            print()
            print('=============================================================================')
            print('Finished generating preimages.')
            print('-----------------------------------------------------------------------------')
            print('Distance in kernel space for the best graph from dataset:', self.__k_dis_dataset)
            print('Distance in kernel space for the preimage:', self.__k_dis_preimage)
            print('Total number of iterations for optimizing:', self.__itrs)
            print('Total number of preimage updates:', self.__num_updates)
            print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
            print('Time to generate pre-images:', self.__runtime_generate_preimage)
            print('Total time:', self.__runtime_total)
            print('=============================================================================')
            print()

    def __generate_l_graphs(self, g_init, fdgs, dhat, ig, found, term3):
        if self.__parallel:
            gnew, dhat, found = self.__generate_l_graphs_parallel(g_init, fdgs, dhat, ig, found, term3)
        else:
            gnew, dhat, found = self.__generate_l_graphs_series(g_init, fdgs, dhat, ig, found, term3)
        return gnew, dhat, found

    def __generate_l_graphs_series(self, g_init, fdgs, dhat, ig, found, term3):
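        """Generate self.__l random candidates from g_init one by one; return
        the best candidate (or None if none improved), the updated best
        distance and the found flag.
        """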
        gnew = None
        updated = False
        for trial in range(0, self.__l):
            if self._verbose >= 2:
                print('--- trial', trial + 1, 'out of', self.__l)
            gtemp, dnew = self.__do_trial(g_init, fdgs, term3, trial)

            # keep the better graph pre-image.
            if dnew <= dhat:  # @todo: the new distance is smaller or also equal?
                if dhat - dnew > 1e-6:
                    if self._verbose >= 2:
                        print('trial =', str(trial))
                        print('\nI am smaller!')
                        print('index (as in D_k U {gihat}) =', str(ig))
                        print('distance:', dhat, '->', dnew)
                    updated = True
                else:
                    if self._verbose >= 2:
                        print('I am equal!')
                dhat = dnew
                gnew = gtemp.copy()
                found = True  # found a better or equally good graph.
        if updated:
            self.__num_updates += 1
        return gnew, dhat, found

    def __generate_l_graphs_parallel(self, g_init, fdgs, dhat, ig, found, term3):
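        """Parallel version of __generate_l_graphs_series: the self.__l trials
        are distributed over a multiprocessing pool and the best candidate is
        selected afterwards.
        """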
        gnew = None
        len_itr = self.__l
        gnew_list = [None] * len_itr
        dnew_list = [None] * len_itr
        itr = range(0, len_itr)
        n_jobs = self.__n_jobs
        if len_itr < 100 * n_jobs:
            chunksize = int(len_itr / n_jobs) + 1
        else:
            chunksize = 100
        do_fun = partial(self._generate_graph_parallel, g_init, fdgs, term3)
        pool = Pool(processes=n_jobs)
        if self._verbose >= 2:
            iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize),
                            desc='Generating l graphs', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(do_fun, itr, chunksize)
        for idx, gtemp, dnew in iterator:  # do not shadow gnew here.
            gnew_list[idx] = gtemp
            dnew_list[idx] = dnew
        pool.close()
        pool.join()

        # check whether a better graph pre-image was found.
        idx_min = np.argmin(dnew_list)
        dnew = dnew_list[idx_min]
        if dnew <= dhat:  # @todo: the new distance is smaller or also equal?
            if dhat - dnew > 1e-6:  # @todo: use a proportion and watch out for 0.
                if self._verbose >= 2:
                    print('I am smaller!')
                    print('index (as in D_k U {gihat}) =', str(ig))
                    print('distance:', dhat, '->', dnew, '\n')
                self.__num_updates += 1
            else:
                if self._verbose >= 2:
                    print('I am equal!')
            dhat = dnew
            gnew = gnew_list[idx_min]
            found = True  # found a better or equally good graph.
        return gnew, dhat, found

    def _generate_graph_parallel(self, g_init, fdgs, term3, itr):
        trial = itr
        gtemp, dnew = self.__do_trial(g_init, fdgs, term3, trial)
        return trial, gtemp, dnew

    def __do_trial(self, g_init, fdgs, term3, trial):
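        """Perform one random trial: flip fdgs randomly chosen node pairs of
        g_init (insert the edge if absent, delete it otherwise), then return
        the modified graph and its distance to the target point in kernel
        space.
        """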
        # randomly insert and delete edges.
        gtemp = g_init.copy()
        seed = (trial + int(time.time())) % (2 ** 32 - 1)
        rdm_state = np.random.RandomState(seed=seed)

        # decide which node pairs to change.
        # @todo: should we use just half of the adjacency matrix for undirected graphs?
        nb_vpairs = nx.number_of_nodes(g_init) * (nx.number_of_nodes(g_init) - 1)
        # @todo: what if fdgs is bigger than nb_vpairs?
        idx_change = rdm_state.randint(0, high=nb_vpairs,
                                       size=(fdgs if fdgs < nb_vpairs else nb_vpairs))
        for item in idx_change:
            # decode the linear index into an ordered node pair (node1, node2).
            node1 = int(item / (nx.number_of_nodes(g_init) - 1))
            node2 = (item - node1 * (nx.number_of_nodes(g_init) - 1))
            if node2 >= node1:  # skip the diagonal (self-loop) pair.
                node2 += 1
            # @todo: is the randomness correct?
            if not gtemp.has_edge(node1, node2):
                gtemp.add_edge(node1, node2)
            else:
                gtemp.remove_edge(node1, node2)

        # compute the new distance in kernel space.
        kernels_to_gtmp, _ = self._graph_kernel.compute(gtemp, self._dataset.graphs, **self._kernel_options)
        kernel_gtmp, _ = self._graph_kernel.compute(gtemp, gtemp, **self._kernel_options)
        if self._kernel_options['normalize']:
            kernels_to_gtmp = [kernels_to_gtmp[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gtmp) for i in range(len(kernels_to_gtmp))]  # normalize
            kernel_gtmp = 1
        # @todo: not correct kernel value
        # prepend gtemp as row and column 0 of the Gram matrix.
        gram_with_gtmp = np.concatenate((np.array([kernels_to_gtmp]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
        gram_with_gtmp = np.concatenate((np.array([[kernel_gtmp] + kernels_to_gtmp]).T, gram_with_gtmp), axis=1)
        dnew = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)), self.__alphas, gram_with_gtmp, term3=term3, withterm3=True)
        return gtemp, dnew

    def get_results(self):
        results = {}
        results['runtime_precompute_gm'] = self.__runtime_precompute_gm
        results['runtime_generate_preimage'] = self.__runtime_generate_preimage
        results['runtime_total'] = self.__runtime_total
        results['k_dis_dataset'] = self.__k_dis_dataset
        results['k_dis_preimage'] = self.__k_dis_preimage
        results['itrs'] = self.__itrs
        results['num_updates'] = self.__num_updates
        return results

    def __termination_criterion_met(self, timer, itr, r):
        if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
            # if self.__state == AlgorithmState.TERMINATED:
            #     self.__state = AlgorithmState.INITIALIZED
            return True
        return (r >= self.__r_max if self.__r_max >= 0 else False)
        # return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)

    @property
    def preimage(self):
        return self.__preimage

    @property
    def best_from_dataset(self):
        return self.__best_from_dataset

    @property
    def gram_matrix_unnorm(self):
        return self.__gram_matrix_unnorm

    @gram_matrix_unnorm.setter
    def gram_matrix_unnorm(self, value):
        self.__gram_matrix_unnorm = value
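

# A minimal usage sketch (illustration only, not part of the generator). It
# assumes gklearn's Dataset class provides a load_predefined_dataset() method
# and that 'ShortestPath' is a kernel name accepted by
# get_graph_kernel_by_name; kernel-specific options (e.g. node kernels) are
# omitted and must be filled in for a real run.
if __name__ == '__main__':
    from gklearn.utils.dataset import Dataset

    dataset = Dataset()
    dataset.load_predefined_dataset('MUTAG')  # assumed loader and dataset name.
    kernel_options = {'name': 'ShortestPath', 'normalize': True}  # fill in kernel-specific options.
    rpg = RandomPreimageGenerator(dataset=dataset)
    rpg.set_options(kernel_options=kernel_options, k=5, r_max=3, l=50,
                    parallel=False, verbose=2)
    rpg.run()
    print(rpg.get_results())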