@@ -0,0 +1,186 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 5 16:08:33 2020

@author: ljia

This script computes the classification accuracy of each graph kernel on
datasets with different entropies of degree distribution.
"""
from utils import Graph_Kernel_List, cross_validate
import numpy as np
import logging

num_nodes = 40
half_num_graphs = 100


def generate_graphs():
    # from gklearn.utils.graph_synthesizer import GraphSynthesizer
    # gsyzer = GraphSynthesizer()
    # graphs = gsyzer.unified_graphs(num_graphs=1000, num_nodes=20, num_edges=40, num_node_labels=0, num_edge_labels=0, seed=None, directed=False)
    # return graphs
    import networkx as nx

    degrees11 = [5] * num_nodes
    # degrees12 = [2] * num_nodes
    degrees12 = [5] * num_nodes
    degrees21 = list(range(1, 11)) * 6
    # degrees22 = [5 * i for i in list(range(1, 11)) * 6]
    degrees22 = list(range(1, 11)) * 6

    # method 1
    graphs11 = [nx.configuration_model(degrees11, create_using=nx.Graph) for i in range(half_num_graphs)]
    graphs12 = [nx.configuration_model(degrees12, create_using=nx.Graph) for i in range(half_num_graphs)]

    # method 2: can easily generate isomorphic graphs.
    # graphs11 = [nx.random_regular_graph(2, num_nodes, seed=None) for i in range(half_num_graphs)]
    # graphs12 = [nx.random_regular_graph(10, num_nodes, seed=None) for i in range(half_num_graphs)]

    # Add node labels.
    for g in graphs11:
        for n in g.nodes():
            g.nodes[n]['atom'] = 0
    for g in graphs12:
        for n in g.nodes():
            g.nodes[n]['atom'] = 1

    graphs1 = graphs11 + graphs12

    # method 1: the entropy of the two classes is not the same.
    graphs21 = [nx.configuration_model(degrees21, create_using=nx.Graph) for i in range(half_num_graphs)]
    graphs22 = [nx.configuration_model(degrees22, create_using=nx.Graph) for i in range(half_num_graphs)]

    # method 2: too slow, and may fail.
    # graphs21 = [nx.random_degree_sequence_graph(degrees21, seed=None, tries=100) for i in range(half_num_graphs)]
    # graphs22 = [nx.random_degree_sequence_graph(degrees22, seed=None, tries=100) for i in range(half_num_graphs)]

    # method 3: no randomness.
    # graphs21 = [nx.havel_hakimi_graph(degrees21, create_using=None) for i in range(half_num_graphs)]
    # graphs22 = [nx.havel_hakimi_graph(degrees22, create_using=None) for i in range(half_num_graphs)]

    # method 4:
    # graphs21 = [nx.configuration_model(degrees21, create_using=nx.Graph) for i in range(half_num_graphs)]
    # graphs22 = [nx.degree_sequence_tree(degrees21, create_using=nx.Graph) for i in range(half_num_graphs)]

    # method 5: the entropy of the two classes is not the same.
    # graphs21 = [nx.expected_degree_graph(degrees21, seed=None, selfloops=False) for i in range(half_num_graphs)]
    # graphs22 = [nx.expected_degree_graph(degrees22, seed=None, selfloops=False) for i in range(half_num_graphs)]

    # method 6: seems there is no randomness.
    # graphs21 = [nx.random_powerlaw_tree(num_nodes, gamma=3, seed=None, tries=10000) for i in range(half_num_graphs)]
    # graphs22 = [nx.random_powerlaw_tree(num_nodes, gamma=3, seed=None, tries=10000) for i in range(half_num_graphs)]

    # Add node labels.
    for g in graphs21:
        for n in g.nodes():
            g.nodes[n]['atom'] = 0
    for g in graphs22:
        for n in g.nodes():
            g.nodes[n]['atom'] = 1

    graphs2 = graphs21 + graphs22

    # # check for isomorphism.
    # iso_mat1 = np.zeros((len(graphs1), len(graphs1)))
    # num1 = 0
    # num2 = 0
    # for i in range(len(graphs1)):
    #     for j in range(i + 1, len(graphs1)):
    #         if nx.is_isomorphic(graphs1[i], graphs1[j]):
    #             iso_mat1[i, j] = 1
    #             iso_mat1[j, i] = 1
    #             num1 += 1
    #             print('iso:', num1, ':', i, ',', j)
    #         else:
    #             num2 += 1
    #             print('not iso:', num2, ':', i, ',', j)
    #
    # iso_mat2 = np.zeros((len(graphs2), len(graphs2)))
    # num1 = 0
    # num2 = 0
    # for i in range(len(graphs2)):
    #     for j in range(i + 1, len(graphs2)):
    #         if nx.is_isomorphic(graphs2[i], graphs2[j]):
    #             iso_mat2[i, j] = 1
    #             iso_mat2[j, i] = 1
    #             num1 += 1
    #             print('iso:', num1, ':', i, ',', j)
    #         else:
    #             num2 += 1
    #             print('not iso:', num2, ':', i, ',', j)

    return graphs1, graphs2


def get_infos(graph):
    from gklearn.utils import Dataset
    ds = Dataset()
    ds.load_graphs(graph)
    infos = ds.get_dataset_infos(keys=['all_degree_entropy', 'ave_node_degree'])
    infos['ave_degree_entropy'] = np.mean(infos['all_degree_entropy'])
    print(infos['ave_degree_entropy'], ',', infos['ave_node_degree'])
    return infos


def xp_accuracy_diff_entropy():
    # Generate graphs.
    graphs1, graphs2 = generate_graphs()

    # Compute entropy of degree distribution of the generated graphs.
    info11 = get_infos(graphs1[0:half_num_graphs])
    info12 = get_infos(graphs1[half_num_graphs:])
    info21 = get_infos(graphs2[0:half_num_graphs])
    info22 = get_infos(graphs2[half_num_graphs:])

    # Run and save.
    import pickle
    import os
    save_dir = 'outputs/accuracy_diff_entropy/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    accuracies = {}
    confidences = {}

    for kernel_name in Graph_Kernel_List:
        print()
        print('Kernel:', kernel_name)

        accuracies[kernel_name] = []
        confidences[kernel_name] = []
        for set_i, graphs in enumerate([graphs1, graphs2]):
            print()
            print('Graph set', set_i)

            tmp_graphs = [g.copy() for g in graphs]
            targets = [0] * half_num_graphs + [1] * half_num_graphs

            accuracy = 'error'
            confidence = 'error'
            try:
                accuracy, confidence = cross_validate(tmp_graphs, targets, kernel_name, ds_name=str(set_i), output_dir=save_dir)  # , n_jobs=1)
            except Exception as exp:
                print('An exception occurred when running this experiment:')
                LOG_FILENAME = save_dir + 'error.txt'
                logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
                logging.exception('\n' + kernel_name + ', ' + str(set_i) + ':')
                print(repr(exp))

            accuracies[kernel_name].append(accuracy)
            confidences[kernel_name].append(confidence)

            pickle.dump(accuracy, open(save_dir + 'accuracy.' + kernel_name + '.' + str(set_i) + '.pkl', 'wb'))
            pickle.dump(confidence, open(save_dir + 'confidence.' + kernel_name + '.' + str(set_i) + '.pkl', 'wb'))

    # Save all.
    pickle.dump(accuracies, open(save_dir + 'accuracies.pkl', 'wb'))
    pickle.dump(confidences, open(save_dir + 'confidences.pkl', 'wb'))

    return


if __name__ == '__main__':
    xp_accuracy_diff_entropy()
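Note: the two datasets above are built to differ in degree-distribution entropy: in graphs1 every node has degree 5, while in graphs2 the degrees cycle through 1 to 10; within each dataset the two classes share the same degree sequence and differ only in the 'atom' node label. A minimal sanity check of that design, computed directly with scipy (illustrative only, not part of this patch):

    from collections import Counter
    from scipy import stats

    def degree_entropy(degrees, base=None):
        # Entropy of the empirical degree distribution.
        return stats.entropy(list(Counter(degrees).values()), base=base)

    print(degree_entropy([5] * 40))                 # 0.0: a single degree value
    print(degree_entropy(list(range(1, 11)) * 6))   # ~2.30 (ln 10): ten equally frequent degrees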
@@ -21,14 +21,14 @@ def xp_runtimes_of_all_28cores():
 
     run_times = {}
 
-    for kernel_name in Graph_Kernel_List:
+    for ds_name in Dataset_List:
         print()
-        print('Kernel:', kernel_name)
+        print('Dataset:', ds_name)
 
-        run_times[kernel_name] = []
-        for ds_name in Dataset_List:
+        run_times[ds_name] = []
+        for kernel_name in Graph_Kernel_List:
             print()
-            print('Dataset:', ds_name)
+            print('Kernel:', kernel_name)
 
             # get graphs.
             graphs, _ = load_predefined_dataset(ds_name)
@@ -43,7 +43,7 @@ def xp_runtimes_of_all_28cores():
                 logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
                 logging.exception('')
                 print(repr(exp))
-            run_times[kernel_name].append(run_time)
+            run_times[ds_name].append(run_time)
 
             pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + ds_name + '.pkl', 'wb'))
@@ -20,17 +20,17 @@ def xp_runtimes_diff_chunksizes():
         os.makedirs(save_dir)
 
     run_times = {}
 
-    for kernel_name in Graph_Kernel_List:
+    for ds_name in Dataset_List:
         print()
-        print('Kernel:', kernel_name)
-        run_times[kernel_name] = []
-        for ds_name in Dataset_List:
+        print('Dataset:', ds_name)
+        run_times[ds_name] = []
+        for kernel_name in Graph_Kernel_List:
             print()
-            print('Dataset:', ds_name)
+            print('Kernel:', kernel_name)
 
-            run_times[kernel_name].append([])
+            run_times[ds_name].append([])
             for chunksize in [1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000]:
                 print()
                 print('Chunksize:', chunksize)
@@ -48,7 +48,7 @@ def xp_runtimes_diff_chunksizes():
                 logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
                 logging.exception('')
                 print(repr(exp))
-            run_times[kernel_name][-1].append(run_time)
+            run_times[ds_name][-1].append(run_time)
 
             pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + ds_name + '.' + str(chunksize) + '.pkl', 'wb'))
@@ -16,7 +16,7 @@ def generate_graphs():
     return graphs
 
 
-def xp_synthesied_graphs_dataset_size():
+def xp_synthesized_graphs_dataset_size():
 
     # Generate graphs.
     graphs = generate_graphs()
@@ -61,4 +61,4 @@ def xp_synthesied_graphs_dataset_size():
 
 
 if __name__ == '__main__':
-    xp_synthesied_graphs_dataset_size()
+    xp_synthesized_graphs_dataset_size()
@@ -16,7 +16,7 @@ def generate_graphs(degree):
     return graphs
 
 
-def xp_synthesied_graphs_degrees():
+def xp_synthesized_graphs_degrees():
 
     # Run and save.
     import pickle
@@ -60,4 +60,4 @@ def xp_synthesied_graphs_degrees():
 
 
 if __name__ == '__main__':
-    xp_synthesied_graphs_degrees()
+    xp_synthesized_graphs_degrees()
@@ -16,7 +16,7 @@ def generate_graphs(num_el_alp):
     return graphs
 
 
-def xp_synthesied_graphs_num_edge_label_alphabet():
+def xp_synthesized_graphs_num_edge_label_alphabet():
 
     # Run and save.
     import pickle
@@ -60,4 +60,4 @@ def xp_synthesied_graphs_num_edge_label_alphabet():
 
 
 if __name__ == '__main__':
-    xp_synthesied_graphs_num_edge_label_alphabet()
+    xp_synthesized_graphs_num_edge_label_alphabet()
@@ -16,7 +16,7 @@ def generate_graphs(num_nl_alp):
     return graphs
 
 
-def xp_synthesied_graphs_num_node_label_alphabet():
+def xp_synthesized_graphs_num_node_label_alphabet():
 
     # Run and save.
     import pickle
@@ -61,4 +61,4 @@ def xp_synthesied_graphs_num_node_label_alphabet():
 
 
 if __name__ == '__main__':
-    xp_synthesied_graphs_num_node_label_alphabet()
+    xp_synthesized_graphs_num_node_label_alphabet()
@@ -16,7 +16,7 @@ def generate_graphs(num_nodes):
    return graphs
 
 
-def xp_synthesied_graphs_num_nodes():
+def xp_synthesized_graphs_num_nodes():
 
     # Run and save.
     import pickle
@@ -61,4 +61,4 @@ def xp_synthesied_graphs_num_nodes():
 
 
 if __name__ == '__main__':
-    xp_synthesied_graphs_num_nodes()
+    xp_synthesized_graphs_num_nodes()
@@ -6,6 +6,8 @@ Created on Tue Sep 22 11:33:28 2020
 @author: ljia
 
 """
 import multiprocessing
+import numpy as np
+from gklearn.utils import model_selection_for_precomputed_kernel
 
 Graph_Kernel_List = ['PathUpToH', 'WLSubtree', 'SylvesterEquation', 'Marginalized', 'ShortestPath', 'Treelet', 'ConjugateGradient', 'FixedPoint', 'SpectralDecomposition', 'StructuralSP', 'CommonWalk']
@@ -109,4 +111,123 @@ def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count()
     params['verbose'] = True
     results = estimator(graphs, **params)
+    return results[0], results[1]
+
+
+def cross_validate(graphs, targets, kernel_name, output_dir='outputs/', ds_name='synthesized', n_jobs=multiprocessing.cpu_count()):
+
+    param_grid = None
+
+    if kernel_name == 'CommonWalk':
+        from gklearn.kernels.commonWalkKernel import commonwalkkernel
+        estimator = commonwalkkernel
+        param_grid_precomputed = [{'compute_method': ['geo'],
+                                   'weight': np.linspace(0.01, 0.15, 15)}]
+
+    elif kernel_name == 'Marginalized':
+        from gklearn.kernels.marginalizedKernel import marginalizedkernel
+        estimator = marginalizedkernel
+        param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9),
+                                  'n_iteration': np.linspace(1, 19, 7),
+                                  'remove_totters': [False]}
+
+    elif kernel_name == 'SylvesterEquation':
+        from gklearn.kernels.randomWalkKernel import randomwalkkernel
+        estimator = randomwalkkernel
+        param_grid_precomputed = {'compute_method': ['sylvester'],
+                                  # 'weight': np.linspace(0.01, 0.10, 10)}
+                                  'weight': np.logspace(-1, -10, num=10, base=10)}
+
+    elif kernel_name == 'ConjugateGradient':
+        from gklearn.kernels.randomWalkKernel import randomwalkkernel
+        estimator = randomwalkkernel
+        from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
+        import functools
+        mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+        sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
+        param_grid_precomputed = {'compute_method': ['conjugate'],
+                                  'node_kernels': [sub_kernel], 'edge_kernels': [sub_kernel],
+                                  'weight': np.logspace(-1, -10, num=10, base=10)}
+
+    elif kernel_name == 'FixedPoint':
+        from gklearn.kernels.randomWalkKernel import randomwalkkernel
+        estimator = randomwalkkernel
+        from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
+        import functools
+        mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+        sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
+        param_grid_precomputed = {'compute_method': ['fp'],
+                                  'node_kernels': [sub_kernel], 'edge_kernels': [sub_kernel],
+                                  'weight': np.logspace(-3, -10, num=8, base=10)}
+
+    elif kernel_name == 'SpectralDecomposition':
+        from gklearn.kernels.randomWalkKernel import randomwalkkernel
+        estimator = randomwalkkernel
+        param_grid_precomputed = {'compute_method': ['spectral'],
+                                  'weight': np.logspace(-1, -10, num=10, base=10),
+                                  'sub_kernel': ['geo', 'exp']}
+
+    elif kernel_name == 'ShortestPath':
+        from gklearn.kernels.spKernel import spkernel
+        estimator = spkernel
+        from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
+        import functools
+        mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+        sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
+        param_grid_precomputed = {'node_kernels': [sub_kernel]}
+
+    elif kernel_name == 'StructuralSP':
+        from gklearn.kernels.structuralspKernel import structuralspkernel
+        estimator = structuralspkernel
+        from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
+        import functools
+        mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+        sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
+        param_grid_precomputed = {'node_kernels': [sub_kernel], 'edge_kernels': [sub_kernel],
+                                  'compute_method': ['naive']}
+
+    elif kernel_name == 'PathUpToH':
+        from gklearn.kernels.untilHPathKernel import untilhpathkernel
+        estimator = untilhpathkernel
+        param_grid_precomputed = {'depth': np.linspace(1, 10, 10),  # [2],
+                                  'k_func': ['MinMax', 'tanimoto'],  # ['MinMax'], #
+                                  'compute_method': ['trie']}  # ['MinMax']}
+
+    elif kernel_name == 'Treelet':
+        from gklearn.kernels.treeletKernel import treeletkernel
+        estimator = treeletkernel
+        from gklearn.utils.kernels import gaussiankernel, polynomialkernel
+        import functools
+        gkernels = [functools.partial(gaussiankernel, gamma=1 / ga)
+                    # for ga in np.linspace(1, 10, 10)]
+                    for ga in np.logspace(0, 10, num=11, base=10)]
+        pkernels = [functools.partial(polynomialkernel, d=d, c=c) for d in range(1, 11)
+                    for c in np.logspace(0, 10, num=11, base=10)]
+        param_grid_precomputed = {'sub_kernel': pkernels + gkernels}
+
+    elif kernel_name == 'WLSubtree':
+        from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
+        estimator = weisfeilerlehmankernel
+        param_grid_precomputed = {'base_kernel': ['subtree'],
+                                  'height': np.linspace(0, 10, 11)}
+        param_grid = {'C': np.logspace(-10, 4, num=29, base=10)}
+
+    if param_grid is None:
+        param_grid = {'C': np.logspace(-10, 10, num=41, base=10)}
+
+    results = model_selection_for_precomputed_kernel(
+        graphs,
+        estimator,
+        param_grid_precomputed,
+        param_grid,
+        'classification',
+        NUM_TRIALS=28,
+        datafile_y=targets,
+        extra_params=None,
+        ds_name=ds_name,
+        output_dir=output_dir,
+        n_jobs=n_jobs,
+        read_gm_from_file=False,
+        verbose=True)
     return results[0], results[1]
@@ -13,6 +13,7 @@ import os
 
 
 class Dataset(object):
 
     def __init__(self, filename=None, filename_targets=None, **kwargs):
         if filename is None:
             self.__graphs = None
@@ -180,13 +181,13 @@ class Dataset(object):
 #            return 0
 
 
-    def get_dataset_infos(self, keys=None):
+    def get_dataset_infos(self, keys=None, params=None):
         """Computes and returns the structure and property information of the graph dataset.
 
         Parameters
         ----------
-        keys : list
-            List of strings which indicate which informations will be returned. The
+        keys : list, optional
+            A list of strings which indicate which information will be returned. The
             possible choices includes:
 
             'substructures': sub-structures graphs contains, including 'linear', 'non
@@ -241,7 +242,15 @@ class Dataset(object):
 
             'class_number': number of classes. Only available for classification problems.
 
+            'all_degree_entropy': the entropy of degree distribution of each graph.
+
+            'ave_degree_entropy': the average entropy of degree distribution of all graphs.
+
             All informations above will be returned if `keys` is not given.
 
+        params: dict of dict, optional
+            A dictionary which contains extra parameters for each possible
+            element in ``keys``.
+
         Return
         ------
@@ -276,6 +285,8 @@ class Dataset(object):
                     'node_attr_dim',
                     'edge_attr_dim',
                     'class_number',
+                    'all_degree_entropy',
+                    'ave_degree_entropy'
                     ]
 
         # dataset size
@@ -420,6 +431,22 @@ class Dataset(object):
             self.__edge_attr_dim = self.__get_edge_attr_dim()
             infos['edge_attr_dim'] = self.__edge_attr_dim
 
+        # entropy of degree distribution.
+        if 'all_degree_entropy' in keys:
+            if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
+                base = params['all_degree_entropy']['base']
+            else:
+                base = None
+            infos['all_degree_entropy'] = self.__compute_all_degree_entropy(base=base)
+
+        if 'ave_degree_entropy' in keys:
+            if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
+                base = params['ave_degree_entropy']['base']
+            else:
+                base = None
+            infos['ave_degree_entropy'] = np.mean(self.__compute_all_degree_entropy(base=base))
+
         return infos
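With the two new keys wired into get_dataset_infos, the degree-distribution entropy can be requested like any other dataset statistic. A possible call (illustrative; `graphs` is assumed to be a list of networkx graphs built elsewhere), using the new `params` argument to switch the logarithm base:

    from gklearn.utils import Dataset

    ds = Dataset()
    ds.load_graphs(graphs)
    infos = ds.get_dataset_infos(keys=['all_degree_entropy', 'ave_degree_entropy'],
                                 params={'all_degree_entropy': {'base': 2}})
    print(infos['ave_degree_entropy'])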
@@ -653,8 +680,7 @@ class Dataset(object):
 
     def __get_all_fill_factors(self):
-        """
-        Get fill factor, the number of non-zero entries in the adjacency matrix.
+        """Get fill factor, the number of non-zero entries in the adjacency matrix.
 
         Returns
         -------
@@ -721,7 +747,30 @@ class Dataset(object):
 
     def __get_edge_attr_dim(self):
         return len(self.__edge_attrs)
 
+
+    def __compute_all_degree_entropy(self, base=None):
+        """Compute the entropy of degree distribution of each graph.
+
+        Parameters
+        ----------
+        base : float, optional
+            The logarithmic base to use. The default is ``e`` (natural logarithm).
+
+        Returns
+        -------
+        degree_entropy : list of float
+            The entropy of the degree distribution of each graph.
+        """
+        from gklearn.utils.stats import entropy
+
+        degree_entropy = []
+        for g in self.__graphs:
+            degrees = list(dict(g.degree()).values())
+            en = entropy(degrees, base=base)
+            degree_entropy.append(en)
+        return degree_entropy
+
 
     @property
     def graphs(self):
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 5 15:12:41 2020

@author: ljia
"""
from collections import Counter
from scipy import stats


def entropy(labels, base=None):
    """Calculate the entropy of a distribution for given list of labels.

    Parameters
    ----------
    labels : list
        Given list of labels.
    base : float, optional
        The logarithmic base to use. The default is ``e`` (natural logarithm).

    Returns
    -------
    float
        The calculated entropy.
    """
    return stats.entropy(list(Counter(labels).values()), base=base)
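The helper simply feeds the label counts to scipy.stats.entropy, so uniform label lists give the maximum entropy and skewed ones give less. A quick usage sketch (values worked out by hand):

    from gklearn.utils.stats import entropy

    print(entropy(['a', 'b', 'c', 'd']))     # ln(4) ~ 1.386: four equally frequent labels
    print(entropy([1, 1, 1, 2], base=2))     # ~ 0.811 bits: counts 3 and 1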
@@ -8,7 +8,7 @@ with open('requirements_pypi.txt') as fp:
 
 setuptools.setup(
     name="graphkit-learn",
-    version="0.2.0",
+    version="0.2.1b1",
     author="Linlin Jia",
     author_email="linlin.jia@insa-rouen.fr",
     description="A Python library for graph kernels, graph edit distances, and graph pre-images",