| @@ -44,6 +44,8 @@ Simply clone this repository and voilà! Then check [`notebooks`](https://github | |||
| * The MinMax kernel | |||
| * Non-linear kernels | |||
| * The treelet kernel [10] | |||
| * Weisfeiler-Lehman kernel [11] | |||
| * Subtree | |||
| ## Computation optimization methods | |||
| @@ -92,6 +94,8 @@ Linlin Jia, Benoit Gaüzère, and Paul Honeine. Graph Kernels Based on Linear Pa | |||
| [10] Gaüzère, B., Brun, L., Villemin, D., 2012. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters 33, 2038–2047. | |||
| [11] Shervashidze, N., Schweitzer, P., Leeuwen, E.J.v., Mehlhorn, K., Borgwardt, K.M., 2011. Weisfeiler-Lehman graph kernels. Journal of Machine Learning Research 12, 2539–2561. | |||
| ## Authors | |||
| * [Linlin Jia](https://github.com/jajupmochi), LITIS, INSA Rouen Normandie | |||
| @@ -0,0 +1,188 @@ | |||
| 1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| -1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| 1 | |||
| -1 | |||
| -1 | |||
| 1 | |||
| -1 | |||
| @@ -1,68 +0,0 @@ | |||
| === Introduction === | |||
| This folder contains 6 data sets of undirected labeled graphs in Matlab format for graph | |||
| classification: MUTAG, PTC, NCI1, NCI109, ENZYMES, and D&D. | |||
| === Usage === | |||
| For each data set X, the Matlab command | |||
| load X | |||
| loads into memory a struct array containing the graphs, and a column vector lx containing | |||
| a class label for each graph. | |||
| X(i).am is the adjacency matrix of the i'th graph, | |||
| X(i).al is the adjacency list of the i'th graph, | |||
| X(i).nl.values is a column vector of node labels for the i'th graph, | |||
| X(i).el (not always available) contains edge labels for the i'th graph. | |||
| Example: | |||
| typing "load MUTAG" in MATLAB | |||
| loads a 188 element array of graph structures, called MUTAG, and a column of 188 numbers, | |||
| each of which indicates the class that the corresponding graph belongs to. | |||
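| For Python users, a hedged equivalent of the MATLAB commands above uses scipy.io.loadmat; how nested struct arrays unpack can vary by file, so the indexing below is a sketch, and the label variable name is assumed from the "lx" convention described above. | |||
| ```python | |||
| # squeeze_me collapses singleton MATLAB dimensions; field indexing is a sketch | |||
| from scipy import io | |||
| mat = io.loadmat('MUTAG.mat', squeeze_me=True) | |||
| graphs = mat['MUTAG']    # struct array of 188 graphs | |||
| labels = mat['lmutag']   # class labels; variable name assumed from the lx convention | |||
| am_0 = graphs['am'][0]   # adjacency matrix of the 1st graph | |||
| nl_0 = graphs['nl'][0]   # node-label struct of the 1st graph (field 'values') | |||
| ``` | |||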
| === Description === | |||
| MUTAG (Debnath et al., 1991) is a data set of 188 mutagenic aromatic and heteroaromatic | |||
| nitro compounds labeled according to whether or not they have a mutagenic effect on the | |||
| Gram-negative bacterium Salmonella typhimurium. | |||
| PTC (Toivonen et al., 2003) contains 344 chemical compounds tested for carcinogenicity | |||
| in mice and rats. The classification task is to predict the carcinogenicity of compounds. | |||
| NCI1 and NCI109 represent two balanced subsets of data sets of chemical compounds screened | |||
| for activity against non-small cell lung cancer and ovarian cancer cell lines respectively | |||
| (Wale and Karypis (2006) and http://pubchem.ncbi.nlm.nih.gov). | |||
| ENZYMES is a data set of protein tertiary structures obtained from (Borgwardt et al., | |||
| 2005) consisting of 600 enzymes from the BRENDA enzyme database (Schomburg et al., 2004). | |||
| In this case the task is to correctly assign each enzyme to one of the 6 EC top-level | |||
| classes. | |||
| D&D is a data set of 1178 protein structures (Dobson and Doig, 2003). Each protein is | |||
| represented by a graph, in which the nodes are amino acids and two nodes are connected | |||
| by an edge if they are less than 6 Angstroms apart. The prediction task is to classify | |||
| the protein structures into enzymes and non-enzymes. | |||
| === References === | |||
| K. M. Borgwardt, C. S. Ong, S. Schoenauer, S. V. N. Vishwanathan, A. J. Smola, and H. P. | |||
| Kriegel. Protein function prediction via graph kernels. Bioinformatics, 21(Suppl 1):i47–i56, | |||
| Jun 2005. | |||
| A. K. Debnath, R. L. Lopez de Compadre, G. Debnath, A. J. Shusterman, and C. Hansch. | |||
| Structure-activity relationship of mutagenic aromatic and heteroaromatic nitro compounds. | |||
| Correlation with molecular orbital energies and hydrophobicity. J Med Chem, 34: 786–797, | |||
| 1991. | |||
| P. D. Dobson and A. J. Doig. Distinguishing enzyme structures from non-enzymes without | |||
| alignments. J Mol Biol, 330(4):771–783, Jul 2003. | |||
| I. Schomburg, A. Chang, C. Ebeling, M. Gremse, C. Heldt, G. Huhn, and D. Schomburg. Brenda, | |||
| the enzyme database: updates and major new developments. Nucleic Acids Research, 32D:431–433, | |||
| 2004. | |||
| H. Toivonen, A. Srinivasan, R. D. King, S. Kramer, and C. Helma. Statistical evaluation | |||
| of the predictive toxicology challenge 2000-2001. Bioinformatics, 19(10):1183–1193, 2003. | |||
| N. Wale and G. Karypis. Comparison of descriptor spaces for chemical compound retrieval and | |||
| classification. In Proc. of ICDM, pages 678–689, Hong Kong, 2006. | |||
| @@ -0,0 +1,85 @@ | |||
| README for dataset MUTAG | |||
| === Usage === | |||
| This folder contains the following comma separated text files | |||
| (replace DS by the name of the dataset): | |||
| n = total number of nodes | |||
| m = total number of edges | |||
| N = number of graphs | |||
| (1) DS_A.txt (m lines) | |||
| sparse (block diagonal) adjacency matrix for all graphs, | |||
| each line corresponds to an edge (row, col), i.e. a pair (node_id, node_id) | |||
| (2) DS_graph_indicator.txt (n lines) | |||
| column vector of graph identifiers for all nodes of all graphs, | |||
| the value in the i-th line is the graph_id of the node with node_id i | |||
| (3) DS_graph_labels.txt (N lines) | |||
| class labels for all graphs in the dataset, | |||
| the value in the i-th line is the class label of the graph with graph_id i | |||
| (4) DS_node_labels.txt (n lines) | |||
| column vector of node labels, | |||
| the value in the i-th line corresponds to the node with node_id i | |||
| There are OPTIONAL files if the respective information is available: | |||
| (5) DS_edge_labels.txt (m lines; same size as DS_A.txt) | |||
| labels for the edges in DS_A.txt | |||
| (6) DS_edge_attributes.txt (m lines; same size as DS_A.txt) | |||
| attributes for the edges in DS_A.txt | |||
| (7) DS_node_attributes.txt (n lines) | |||
| matrix of node attributes, | |||
| the comma-separated values in the i-th line form the attribute vector of the node with node_id i | |||
| (8) DS_graph_attributes.txt (N lines) | |||
| regression values for all graphs in the dataset, | |||
| the value in the i-th line is the attribute of the graph with graph_id i | |||
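| Putting the layout above together, a minimal sketch that assembles these text files into one networkx graph per graph_id (the function name and paths are illustrative, not part of any library): | |||
| ```python | |||
| import networkx as nx | |||
| def load_tu_dataset(prefix): | |||
|     # node_ids are 1-based; node i belongs to graph graph_ind[i - 1] | |||
|     graph_ind = [int(x) for x in open(prefix + '_graph_indicator.txt')] | |||
|     node_lab = [int(x) for x in open(prefix + '_node_labels.txt')] | |||
|     y = [int(x) for x in open(prefix + '_graph_labels.txt')] | |||
|     graphs = {gid: nx.Graph() for gid in set(graph_ind)} | |||
|     for node_id, (gid, lab) in enumerate(zip(graph_ind, node_lab), start=1): | |||
|         graphs[gid].add_node(node_id, label=lab) | |||
|     for line in open(prefix + '_A.txt'):  # one "row, col" edge per line | |||
|         u, v = (int(x) for x in line.split(',')) | |||
|         graphs[graph_ind[u - 1]].add_edge(u, v) | |||
|     return [graphs[gid] for gid in sorted(graphs)], y | |||
| # e.g. Gn, y = load_tu_dataset('../datasets/MUTAG/MUTAG') | |||
| ``` | |||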
| === Description of the dataset === | |||
| The MUTAG dataset consists of 188 chemical compounds divided into two | |||
| classes according to their mutagenic effect on a bacterium. | |||
| The chemical data was obtained from http://cdb.ics.uci.edu and converted | |||
| to graphs, where vertices represent atoms and edges represent chemical | |||
| bonds. Explicit hydrogen atoms have been removed and vertices are labeled | |||
| by atom type and edges by bond type (single, double, triple or aromatic). | |||
| Chemical data was processed using the Chemistry Development Kit (v1.4). | |||
| Node labels: | |||
| 0 C | |||
| 1 N | |||
| 2 O | |||
| 3 F | |||
| 4 I | |||
| 5 Cl | |||
| 6 Br | |||
| Edge labels: | |||
| 0 aromatic | |||
| 1 single | |||
| 2 double | |||
| 3 triple | |||
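| In code, the two label tables above translate directly into lookup dictionaries, e.g.: | |||
| ```python | |||
| MUTAG_NODE_LABELS = {0: 'C', 1: 'N', 2: 'O', 3: 'F', 4: 'I', 5: 'Cl', 6: 'Br'} | |||
| MUTAG_EDGE_LABELS = {0: 'aromatic', 1: 'single', 2: 'double', 3: 'triple'} | |||
| ``` | |||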
| === Previous Use of the Dataset === | |||
| Kriege, N., Mutzel, P.: Subgraph matching kernels for attributed graphs. In: Proceedings | |||
| of the 29th International Conference on Machine Learning (ICML-2012) (2012). | |||
| === References === | |||
| Debnath, A.K., Lopez de Compadre, R.L., Debnath, G., Shusterman, A.J., and Hansch, C. | |||
| Structure-activity relationship of mutagenic aromatic and heteroaromatic nitro compounds. | |||
| Correlation with molecular orbital energies and hydrophobicity. J. Med. Chem. 34(2):786-797 (1991). | |||
| @@ -73,20 +73,18 @@ | |||
| " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||
| " 'task': 'regression'}, # node symb\n", | |||
| " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n", | |||
| " # contains single node graph, node symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||
| " 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n", | |||
| " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
| " # node nsymb\n", | |||
| " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||
| " # node symb/nsymb\n", | |||
| "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||
| "# # node/edge symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||
| "# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n", | |||
| "\n", | |||
| " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||
| " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||
| @@ -15,20 +15,18 @@ dslist = [ | |||
| {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| 'task': 'regression'}, # node symb | |||
| {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, | |||
| # contains single node graph, node symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb | |||
| {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| # node nsymb | |||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # node symb/nsymb | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| @@ -82,4 +80,4 @@ for ds in dslist: | |||
| n_jobs=multiprocessing.cpu_count(), | |||
| read_gm_from_file=False, | |||
| verbose=True) | |||
| print() | |||
| print() | |||
| @@ -104,20 +104,18 @@ | |||
| " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||
| " 'task': 'regression'}, # node symb\n", | |||
| " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n", | |||
| " # contains single node graph, node symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||
| " 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n", | |||
| " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
| " # node nsymb\n", | |||
| " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||
| " # node symb/nsymb\n", | |||
| "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||
| "# # node/edge symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||
| "# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n", | |||
| "\n", | |||
| " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||
| " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||
| @@ -15,20 +15,18 @@ dslist = [ | |||
| {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| 'task': 'regression'}, # node symb | |||
| {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, | |||
| # contains single node graph, node symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb | |||
| {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| # node nsymb | |||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # node symb/nsymb | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| @@ -219,20 +219,18 @@ | |||
| " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||
| " 'task': 'regression'}, # node symb\n", | |||
| " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n", | |||
| " # contains single node graph, node symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||
| " 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n", | |||
| " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
| " # node nsymb\n", | |||
| " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||
| " # node symb/nsymb\n", | |||
| "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||
| "# # node/edge symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||
| "# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n", | |||
| "\n", | |||
| " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||
| " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||
| @@ -20,20 +20,18 @@ dslist = [ | |||
| {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| 'task': 'regression'}, # node symb | |||
| {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, | |||
| # contains single node graph, node symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb | |||
| {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| # node nsymb | |||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # node symb/nsymb | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| @@ -65,6 +63,7 @@ dslist = [ | |||
| estimator = randomwalkkernel | |||
| param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||
| gaussiankernel = functools.partial(gaussiankernel, gamma=0.5) | |||
| for ds in dslist: | |||
| print() | |||
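| The functools.partial line above pre-binds the RBF bandwidth, so the grid search only ever sees a two-argument sub-kernel. A minimal illustration; the stand-in rbf below only approximates the actual gaussiankernel in pygraph.utils.kernels: | |||
| ```python | |||
| import functools | |||
| import numpy as np | |||
| def rbf(x, y, gamma=None):  # stand-in; the real signature may differ | |||
|     x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float) | |||
|     return float(np.exp(-gamma * np.sum((x - y) ** 2))) | |||
| gk = functools.partial(rbf, gamma=0.5) | |||
| gk([0.0, 1.0], [1.0, 1.0])  # same as rbf([0, 1], [1, 1], gamma=0.5) | |||
| ``` | |||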
| @@ -108,4 +107,4 @@ for ds in dslist: | |||
| n_jobs=multiprocessing.cpu_count(), | |||
| read_gm_from_file=False, | |||
| verbose=True) | |||
| print() | |||
| print() | |||
| @@ -171,21 +171,18 @@ | |||
| " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||
| " 'task': 'regression'}, # node symb\n", | |||
| " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n", | |||
| " # contains single node graph, node symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||
| " 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n", | |||
| " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
| " # node nsymb\n", | |||
| " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||
| " # node symb/nsymb\n", | |||
| "\n", | |||
| "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||
| "# # node/edge symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||
| "# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n", | |||
| "#\n", | |||
| "# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||
| "# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||
| @@ -11,21 +11,18 @@ dslist = [ | |||
| {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| 'task': 'regression'}, # node symb | |||
| {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, | |||
| # contains single node graph, node symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb | |||
| {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| # node nsymb | |||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # node symb/nsymb | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # | |||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| @@ -79,4 +76,4 @@ for ds in dslist: | |||
| n_jobs=multiprocessing.cpu_count(), | |||
| read_gm_from_file=False, | |||
| verbose=True) | |||
| print() | |||
| print() | |||
| @@ -124,20 +124,18 @@ | |||
| " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||
| " 'task': 'regression'}, # node symb\n", | |||
| " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n", | |||
| " # contains single node graph, node symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||
| " 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n", | |||
| " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
| " # node nsymb\n", | |||
| " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||
| " # node symb/nsymb\n", | |||
| "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||
| "# # node/edge symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||
| "# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n", | |||
| "\n", | |||
| " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||
| " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||
| @@ -17,20 +17,18 @@ dslist = [ | |||
| {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| 'task': 'regression'}, # node symb | |||
| {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, | |||
| # contains single node graph, node symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb | |||
| {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| # node nsymb | |||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # node symb/nsymb | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| @@ -86,4 +84,4 @@ for ds in dslist: | |||
| n_jobs=multiprocessing.cpu_count(), | |||
| read_gm_from_file=False, | |||
| verbose=True) | |||
| print() | |||
| print() | |||
| @@ -100,20 +100,18 @@ | |||
| " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||
| " 'task': 'regression'}, # node symb\n", | |||
| " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n", | |||
| " # contains single node graph, node symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||
| " 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n", | |||
| "# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
| "# # node nsymb\n", | |||
| " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||
| " # node symb/nsymb\n", | |||
| "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||
| "# # node/edge symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||
| "# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n", | |||
| "\n", | |||
| " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||
| " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||
| @@ -1,7 +1,7 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Fri Oct 5 19:19:33 2018 | |||
| Created on Mon Mar 21 11:19:33 2019 | |||
| @author: ljia | |||
| """ | |||
| @@ -10,26 +10,24 @@ from libs import * | |||
| import multiprocessing | |||
| from pygraph.kernels.treeletKernel import treeletkernel | |||
| from pygraph.utils.kernels import gaussiankernel, polynomialkernel | |||
| from pygraph.utils.kernels import gaussiankernel, linearkernel, polynomialkernel | |||
| dslist = [ | |||
| {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| 'task': 'regression'}, # node symb | |||
| {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, | |||
| # contains single node graph, node symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb | |||
| # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| # # node nsymb | |||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # node symb/nsymb | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| @@ -59,7 +57,7 @@ dslist = [ | |||
| # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
| ] | |||
| estimator = treeletkernel | |||
| param_grid_precomputed = {'sub_kernel': [gaussiankernel, polynomialkernel]} | |||
| param_grid_precomputed = {'sub_kernel': [gaussiankernel, linearkernel, polynomialkernel]} | |||
| param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||
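| For reference, each sub-kernel on the grid above maps two treelet-count vectors to a scalar. A hedged sketch of their conventional forms (pygraph.utils.kernels may use different signatures or defaults): | |||
| ```python | |||
| import numpy as np | |||
| def gaussiankernel(x, y, gamma=0.5):  # assumed RBF form | |||
|     x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float) | |||
|     return float(np.exp(-gamma * np.sum((x - y) ** 2))) | |||
| def linearkernel(x, y):  # plain dot product | |||
|     return float(np.dot(x, y)) | |||
| def polynomialkernel(x, y, d=2, c=1):  # assumed (x.y + c)^d form | |||
|     return float((np.dot(x, y) + c) ** d) | |||
| ``` | |||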
| @@ -80,4 +78,4 @@ for ds in dslist: | |||
| n_jobs=multiprocessing.cpu_count(), | |||
| read_gm_from_file=False, | |||
| verbose=True) | |||
| print() | |||
| print() | |||
| @@ -227,13 +227,7 @@ | |||
| "the gram matrix with parameters {'compute_method': 'trie', 'depth': 1.0, 'k_func': 'tanimoto', 'n_jobs': 8, 'verbose': True} is: \n", | |||
| "\n", | |||
| "\n", | |||
| "getting paths: 150it [00:00, 27568.71it/s]\n" | |||
| ] | |||
| }, | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "getting paths: 150it [00:00, 27568.71it/s]\n", | |||
| "calculating kernels: 11325it [00:00, 780628.98it/s]\n", | |||
| "\n", | |||
| " --- kernel matrix of path kernel up to 2 of size 150 built in 0.2590019702911377 seconds ---\n", | |||
| @@ -265,20 +259,18 @@ | |||
| " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||
| " 'task': 'regression'}, # node symb\n", | |||
| " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n", | |||
| " # contains single node graph, node symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||
| " 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n", | |||
| " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
| " # node nsymb\n", | |||
| " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||
| " # node symb/nsymb\n", | |||
| "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||
| "# # node/edge symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||
| "# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||
| "# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n", | |||
| "\n", | |||
| " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||
| " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||
| @@ -15,20 +15,18 @@ dslist = [ | |||
| {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| 'task': 'regression'}, # node symb | |||
| {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, | |||
| # contains single node graph, node symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb | |||
| {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| # node nsymb | |||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # node symb/nsymb | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| @@ -81,4 +79,4 @@ for ds in dslist: | |||
| n_jobs=multiprocessing.cpu_count(), | |||
| read_gm_from_file=False, | |||
| verbose=True) | |||
| print() | |||
| print() | |||
| @@ -0,0 +1,144 @@ | |||
| { | |||
| "cells": [ | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "\n", | |||
| "MUTAG\n", | |||
| "\n", | |||
| "--- This is a classification problem ---\n", | |||
| "\n", | |||
| "\n", | |||
| "1. Loading dataset from file...\n", | |||
| "\n", | |||
| "2. Calculating gram matrices. This could take a while...\n", | |||
| "\n", | |||
| " --- Weisfeiler-Lehman subtree kernel matrix of size 188 built in 0.14636015892028809 seconds ---\n", | |||
| "\n", | |||
| "the gram matrix with parameters {'base_kernel': 'subtree', 'height': 0.0, 'n_jobs': 8, 'verbose': True} is: \n", | |||
| "\n", | |||
| "\n", | |||
| "\n", | |||
| " --- Weisfeiler-Lehman subtree kernel matrix of size 188 built in 0.2917311191558838 seconds ---\n", | |||
| "\n", | |||
| "the gram matrix with parameters {'base_kernel': 'subtree', 'height': 1.0, 'n_jobs': 8, 'verbose': True} is: \n", | |||
| "\n", | |||
| "\n" | |||
| ] | |||
| } | |||
| ], | |||
| "source": [ | |||
| "#!/usr/bin/env python3\n", | |||
| "# -*- coding: utf-8 -*-\n", | |||
| "\"\"\"\n", | |||
| "Created on Mon Mar 21 11:19:33 2019\n", | |||
| "\n", | |||
| "@author: ljia\n", | |||
| "\"\"\"\n", | |||
| "\n", | |||
| "from libs import *\n", | |||
| "import multiprocessing\n", | |||
| "\n", | |||
| "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel\n", | |||
| "from pygraph.utils.kernels import gaussiankernel, polynomialkernel\n", | |||
| "\n", | |||
| "\n", | |||
| "dslist = [\n", | |||
| " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||
| " 'task': 'regression'}, # node symb\n", | |||
| " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||
| " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n", | |||
| " # contains single node graph, node symb\n", | |||
| " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n", | |||
| " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n", | |||
| " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n", | |||
| " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
| " # node nsymb\n", | |||
| " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||
| " # node symb/nsymb\n", | |||
| "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||
| "# # node/edge symb\n", | |||
| " {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n", | |||
| "\n", | |||
| " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||
| " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||
| " # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n", | |||
| " # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n", | |||
| " #\n", | |||
| " # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n", | |||
| " # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n", | |||
| " # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n", | |||
| " # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n", | |||
| " # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n", | |||
| "\n", | |||
| " # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n", | |||
| " # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n", | |||
| "# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n", | |||
| " {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n", | |||
| " 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||
| " {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n", | |||
| " 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||
| " # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n", | |||
| " # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n", | |||
| "\n", | |||
| " # # not working below\n", | |||
| " # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n", | |||
| " # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n", | |||
| " # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n", | |||
| " # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n", | |||
| "]\n", | |||
| "estimator = weisfeilerlehmankernel\n", | |||
| "param_grid_precomputed = {'base_kernel': ['subtree'], \n", | |||
| " 'height': np.linspace(0, 10, 11)}\n", | |||
| "param_grid = [{'C': np.logspace(-10, 4, num=29, base=10)},\n", | |||
| " {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n", | |||
| "\n", | |||
| "for ds in dslist:\n", | |||
| " print()\n", | |||
| " print(ds['name'])\n", | |||
| " model_selection_for_precomputed_kernel(\n", | |||
| " ds['dataset'],\n", | |||
| " estimator,\n", | |||
| " param_grid_precomputed,\n", | |||
| " (param_grid[1] if ('task' in ds and ds['task']\n", | |||
| " == 'regression') else param_grid[0]),\n", | |||
| " (ds['task'] if 'task' in ds else 'classification'),\n", | |||
| " NUM_TRIALS=30,\n", | |||
| " datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n", | |||
| " extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n", | |||
| " ds_name=ds['name'],\n", | |||
| " n_jobs=multiprocessing.cpu_count(),\n", | |||
| " read_gm_from_file=False,\n", | |||
| " verbose=True)\n", | |||
| " print()" | |||
| ] | |||
| } | |||
| ], | |||
| "metadata": { | |||
| "kernelspec": { | |||
| "display_name": "Python 3", | |||
| "language": "python", | |||
| "name": "python3" | |||
| }, | |||
| "language_info": { | |||
| "codemirror_mode": { | |||
| "name": "ipython", | |||
| "version": 3 | |||
| }, | |||
| "file_extension": ".py", | |||
| "mimetype": "text/x-python", | |||
| "name": "python", | |||
| "nbconvert_exporter": "python", | |||
| "pygments_lexer": "ipython3", | |||
| "version": "3.6.7" | |||
| } | |||
| }, | |||
| "nbformat": 4, | |||
| "nbformat_minor": 2 | |||
| } | |||
| @@ -0,0 +1,81 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Mon Mar 21 11:19:33 2019 | |||
| @author: ljia | |||
| """ | |||
| from libs import * | |||
| import multiprocessing | |||
| from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||
| from pygraph.utils.kernels import gaussiankernel, polynomialkernel | |||
| dslist = [ | |||
| {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
| 'task': 'regression'}, # node symb | |||
| {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, | |||
| # contains single node graph, node symb | |||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb | |||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled | |||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb | |||
| {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| # node nsymb | |||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # node symb/nsymb | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # | |||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
| # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
| # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||
| # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||
| # | |||
| # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||
| # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||
| # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||
| # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||
| # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||
| # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||
| {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | |||
| {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | |||
| # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
| # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||
| # # not working below | |||
| # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
| # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||
| # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||
| # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
| ] | |||
| estimator = weisfeilerlehmankernel | |||
| param_grid_precomputed = {'base_kernel': ['subtree'], | |||
| 'height': np.linspace(0, 10, 11)} | |||
| param_grid = [{'C': np.logspace(-10, 4, num=29, base=10)}, | |||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||
| for ds in dslist: | |||
| print() | |||
| print(ds['name']) | |||
| model_selection_for_precomputed_kernel( | |||
| ds['dataset'], | |||
| estimator, | |||
| param_grid_precomputed, | |||
| (param_grid[1] if ('task' in ds and ds['task'] | |||
| == 'regression') else param_grid[0]), | |||
| (ds['task'] if 'task' in ds else 'classification'), | |||
| NUM_TRIALS=30, | |||
| datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||
| extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||
| ds_name=ds['name'], | |||
| n_jobs=multiprocessing.cpu_count(), | |||
| read_gm_from_file=False, | |||
| verbose=True) | |||
| print() | |||
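| The script above selects the Weisfeiler-Lehman subtree base kernel with heights 0 through 10. For intuition, a minimal self-contained sketch of that kernel for a single graph pair (illustrative only; the pygraph implementation adds label compression, parallelism and other details): | |||
| ```python | |||
| import networkx as nx | |||
| from collections import Counter | |||
| def wl_subtree_kernel(g1, g2, height=2, node_label='atom'): | |||
|     graphs = [g1.copy(), g2.copy()] | |||
|     k = 0.0 | |||
|     for _ in range(height + 1): | |||
|         # dot product of the two label histograms at the current refinement | |||
|         h1, h2 = (Counter(dict(g.nodes(data=node_label)).values()) for g in graphs) | |||
|         k += sum(h1[lab] * h2[lab] for lab in h1) | |||
|         # WL step: new label = (old label, sorted multiset of neighbour labels) | |||
|         for g in graphs: | |||
|             new = {v: str((g.nodes[v][node_label], | |||
|                            sorted(g.nodes[u][node_label] for u in g[v]))) | |||
|                    for v in g} | |||
|             nx.set_node_attributes(g, new, node_label) | |||
|     return k | |||
| ``` | |||
| At height 0 this reduces to counting matching node labels; each additional round compares progressively larger subtree patterns rooted at each node. | |||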
| @@ -16,7 +16,7 @@ import librariesImport, script | |||
| sys.path.insert(0, "../") | |||
| from pygraph.utils.graphfiles import saveDataset | |||
| from pygraph.utils.graphdataset import get_dataset_attributes | |||
| from pygraph.utils.utils import graph_isIdentical | |||
| from pygraph.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels | |||
| #from pygraph.utils.utils import graph_deepcopy | |||
| @@ -158,9 +158,9 @@ def GED(g1, g2, lib='gedlib'): | |||
| script.PyRestartEnv() | |||
| script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml') | |||
| listID = script.PyGetGraphIds() | |||
| script.PySetEditCost("CHEM_2") | |||
| script.PySetEditCost("CHEM_1") | |||
| script.PyInitEnv() | |||
| script.PySetMethod("BIPARTITE", "") | |||
| script.PySetMethod("IPFP", "") | |||
| script.PyInitMethod() | |||
| g = listID[0] | |||
| h = listID[1] | |||
| @@ -173,20 +173,6 @@ def GED(g1, g2, lib='gedlib'): | |||
| return dis, pi_forward, pi_backward | |||
| def get_node_labels(Gn, node_label): | |||
| nl = set() | |||
| for G in Gn: | |||
| nl = nl | set(nx.get_node_attributes(G, node_label).values()) | |||
| return nl | |||
| def get_edge_labels(Gn, edge_label): | |||
| el = set() | |||
| for G in Gn: | |||
| el = el | set(nx.get_edge_attributes(G, edge_label).values()) | |||
| return el | |||
| # --------------------------- These are tests --------------------------------# | |||
| def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1, | |||
| @@ -65,6 +65,7 @@ def marginalizedkernel(*args, | |||
| # pre-process | |||
| n_iteration = int(n_iteration) | |||
| Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()] | |||
| Gn = [g.copy() for g in Gn] | |||
| ds_attrs = get_dataset_attributes( | |||
| Gn, | |||
| @@ -215,37 +216,37 @@ def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): | |||
| R_inf = {} # dict to save all the R_inf for all pairs of nodes | |||
| # initial R_inf, the 1st iteration. | |||
| for node1 in g1.nodes(data=True): | |||
| for node2 in g2.nodes(data=True): | |||
| for node1 in g1.nodes(): | |||
| for node2 in g2.nodes(): | |||
| # R_inf[(node1[0], node2[0])] = r1 | |||
| if len(g1[node1[0]]) > 0: | |||
| if len(g2[node2[0]]) > 0: | |||
| R_inf[(node1[0], node2[0])] = r1 | |||
| if len(g1[node1]) > 0: | |||
| if len(g2[node2]) > 0: | |||
| R_inf[(node1, node2)] = r1 | |||
| else: | |||
| R_inf[(node1[0], node2[0])] = p_quit | |||
| R_inf[(node1, node2)] = p_quit | |||
| else: | |||
| if len(g2[node2[0]]) > 0: | |||
| R_inf[(node1[0], node2[0])] = p_quit | |||
| if len(g2[node2]) > 0: | |||
| R_inf[(node1, node2)] = p_quit | |||
| else: | |||
| R_inf[(node1[0], node2[0])] = 1 | |||
| R_inf[(node1, node2)] = 1 | |||
| # compute all transition probability first. | |||
| t_dict = {} | |||
| if n_iteration > 1: | |||
| for node1 in g1.nodes(data=True): | |||
| neighbor_n1 = g1[node1[0]] | |||
| for node1 in g1.nodes(): | |||
| neighbor_n1 = g1[node1] | |||
| # the transition probability distribution in the random walks | |||
| # generating step (uniform distribution over the vertices adjacent | |||
| # to the current vertex) | |||
| if len(neighbor_n1) > 0: | |||
| p_trans_n1 = (1 - p_quit) / len(neighbor_n1) | |||
| for node2 in g2.nodes(data=True): | |||
| neighbor_n2 = g2[node2[0]] | |||
| for node2 in g2.nodes(): | |||
| neighbor_n2 = g2[node2] | |||
| if len(neighbor_n2) > 0: | |||
| p_trans_n2 = (1 - p_quit) / len(neighbor_n2) | |||
| for neighbor1 in neighbor_n1: | |||
| for neighbor2 in neighbor_n2: | |||
| t_dict[(node1[0], node2[0], neighbor1, neighbor2)] = \ | |||
| t_dict[(node1, node2, neighbor1, neighbor2)] = \ | |||
| p_trans_n1 * p_trans_n2 * \ | |||
| deltakernel(g1.node[neighbor1][node_label], | |||
| g2.node[neighbor2][node_label]) * \ | |||
| @@ -258,20 +259,20 @@ def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): | |||
| R_inf_old = R_inf.copy() | |||
| # calculate R_inf for each pair of nodes | |||
| for node1 in g1.nodes(data=True): | |||
| neighbor_n1 = g1[node1[0]] | |||
| for node1 in g1.nodes(): | |||
| neighbor_n1 = g1[node1] | |||
| # the transition probability distribution in the random walks | |||
| # generating step (uniform distribution over the vertices adjacent | |||
| # to the current vertex) | |||
| if len(neighbor_n1) > 0: | |||
| for node2 in g2.nodes(data=True): | |||
| neighbor_n2 = g2[node2[0]] | |||
| for node2 in g2.nodes(): | |||
| neighbor_n2 = g2[node2] | |||
| if len(neighbor_n2) > 0: | |||
| R_inf[(node1[0], node2[0])] = r1 | |||
| R_inf[(node1, node2)] = r1 | |||
| for neighbor1 in neighbor_n1: | |||
| for neighbor2 in neighbor_n2: | |||
| R_inf[(node1[0], node2[0])] += \ | |||
| (t_dict[(node1[0], node2[0], neighbor1, neighbor2)] * \ | |||
| R_inf[(node1, node2)] += \ | |||
| (t_dict[(node1, node2, neighbor1, neighbor2)] * \ | |||
| R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) | |||
| # add elements of R_inf up and calculate kernel | |||
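| The refactor above swaps g.nodes(data=True), which yields (node, attr_dict) tuples and forces the node1[0] indexing, for g.nodes(), which yields bare node ids. A two-line illustration: | |||
| ```python | |||
| import networkx as nx | |||
| G = nx.Graph() | |||
| G.add_node(0, atom='C') | |||
| list(G.nodes(data=True))  # [(0, {'atom': 'C'})] -- tuples, hence node1[0] before | |||
| list(G.nodes())           # [0] -- bare ids, hence plain node1 after | |||
| ``` | |||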
| @@ -58,6 +58,7 @@ def randomwalkkernel(*args, | |||
| """ | |||
| compute_method = compute_method.lower() | |||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||
| Gn = [g.copy() for g in Gn] | |||
| eweight = None | |||
| if edge_weight == None: | |||
| @@ -54,6 +54,7 @@ def spkernel(*args, | |||
| """ | |||
| # pre-process | |||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||
| Gn = [g.copy() for g in Gn] | |||
| weight = None | |||
| if edge_weight is None: | |||
| if verbose: | |||
| @@ -74,6 +74,7 @@ def structuralspkernel(*args, | |||
| """ | |||
| # pre-process | |||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||
| Gn = [g.copy() for g in Gn] | |||
| weight = None | |||
| if edge_weight is None: | |||
| if verbose: | |||
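| The Gn = [g.copy() for g in Gn] lines added here and in the other kernels share one purpose: the kernels may mutate graphs internally (e.g. relabeling), and copying keeps the caller's graphs intact. A minimal illustration, assuming networkx 2.x attribute access: | |||
| ```python | |||
| import networkx as nx | |||
| g = nx.Graph() | |||
| g.add_node(0, atom='C') | |||
| Gn = [h.copy() for h in [g]]   # defensive copies, as in the kernels above | |||
| Gn[0].nodes[0]['atom'] = 'N'   # mutate the copy only | |||
| print(g.nodes[0]['atom'])      # -> 'C'; the caller's graph is unchanged | |||
| ``` | |||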
| @@ -1,6 +1,8 @@ | |||
| """ | |||
| @author: linlin | |||
| @references: Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. | |||
| @references: | |||
| [1] Gaüzère B, Brun L, Villemin D. Two new graphs kernels in | |||
| chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. | |||
| """ | |||
| import sys | |||
| @@ -50,6 +52,7 @@ def treeletkernel(*args, | |||
| """ | |||
| # pre-process | |||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||
| Gn = [g.copy() for g in Gn] | |||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| ds_attrs = get_dataset_attributes(Gn, | |||
| attr_names=['node_labeled', 'edge_labeled', 'is_directed'], | |||
| @@ -76,13 +79,13 @@ def treeletkernel(*args, | |||
| else: | |||
| chunksize = 100 | |||
| canonkeys = [[] for _ in range(len(Gn))] | |||
| getps_partial = partial(wrapper_get_canonkeys, node_label, edge_label, | |||
| get_partial = partial(wrapper_get_canonkeys, node_label, edge_label, | |||
| labeled, ds_attrs['is_directed']) | |||
| if verbose: | |||
| iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize), | |||
| iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize), | |||
| desc='getting canonkeys', file=sys.stdout) | |||
| else: | |||
| iterator = pool.imap_unordered(getps_partial, itr, chunksize) | |||
| iterator = pool.imap_unordered(get_partial, itr, chunksize) | |||
| for i, ck in iterator: | |||
| canonkeys[i] = ck | |||
| pool.close() | |||
| @@ -1,382 +0,0 @@ | |||
| """ | |||
| @author: linlin | |||
| @references: Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. | |||
| """ | |||
| import sys | |||
| import pathlib | |||
| sys.path.insert(0, "../") | |||
| import time | |||
| from collections import Counter | |||
| from itertools import chain | |||
| import networkx as nx | |||
| import numpy as np | |||
| def treeletkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True): | |||
| """Calculate treelet graph kernels between graphs. | |||
| Parameters | |||
| ---------- | |||
| Gn : List of NetworkX graph | |||
| List of graphs between which the kernels are calculated. | |||
| / | |||
| G1, G2 : NetworkX graphs | |||
| 2 graphs between which the kernel is calculated. | |||
| node_label : string | |||
| node attribute used as label. The default node label is atom. | |||
| edge_label : string | |||
| edge attribute used as label. The default edge label is bond_type. | |||
| labeled : boolean | |||
| Whether the graphs are labeled. The default is True. | |||
| Return | |||
| ------ | |||
| Kmatrix/kernel : Numpy matrix/float | |||
| Kernel matrix, each element of which is the treelet kernel between 2 graphs. / Treelet kernel between 2 graphs. | |||
| """ | |||
| if len(args) == 1: # for a list of graphs | |||
| Gn = args[0] | |||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| start_time = time.time() | |||
| # get all canonical keys of all graphs before calculating kernels to save time; this may cost a lot of memory for large datasets. | |||
| canonkeys = [ get_canonkeys(Gn[i], node_label = node_label, edge_label = edge_label, labeled = labeled) \ | |||
| for i in range(0, len(Gn)) ] | |||
| for i in range(0, len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| run_time = time.time() - start_time | |||
| print("\n --- treelet kernel matrix of size %d built in %s seconds ---" % (len(Gn), run_time)) | |||
| return Kmatrix, run_time | |||
| else: # for only 2 graphs | |||
| start_time = time.time() | |||
| canonkey1 = get_canonkeys(args[0], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||
| canonkey2 = get_canonkeys(args[1], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||
| kernel = _treeletkernel_do(canonkey1, canonkey2, node_label = node_label, edge_label = edge_label, labeled = labeled) | |||
| run_time = time.time() - start_time | |||
| print("\n --- treelet kernel built in %s seconds ---" % (run_time)) | |||
| return kernel, run_time | |||
| def _treeletkernel_do(canonkey1, canonkey2, node_label = 'atom', edge_label = 'bond_type', labeled = True): | |||
| """Calculate treelet graph kernel between 2 graphs. | |||
| Parameters | |||
| ---------- | |||
| canonkey1, canonkey2 : dict | |||
| Dictionaries mapping the canonical keys (strings) of the 2 graphs to their numbers of occurrences. | |||
| node_label : string | |||
| Node attribute used as label. The default node label is atom. | |||
| edge_label : string | |||
| Edge attribute used as label. The default edge label is bond_type. | |||
| labeled : boolean | |||
| Whether the graphs are labeled. The default is True. | |||
| Return | |||
| ------ | |||
| kernel : float | |||
| Treelet Kernel between 2 graphs. | |||
| """ | |||
| keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | |||
| vector1 = np.array([ (canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys ]) | |||
| vector2 = np.array([ (canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys ]) | |||
| kernel = np.sum(np.exp(- np.square(vector1 - vector2) / 2)) | |||
| return kernel | |||
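| # A minimal sketch (toy counts, an assumption rather than library code) of | |||
| # what this function computes: a Gaussian of the difference between treelet | |||
| # counts, restricted to canonical keys present in both graphs. | |||
| #   canonkey_a = {'0C': 3, '1CsC': 2}   # hypothetical treelet counts | |||
| #   canonkey_b = {'0C': 3, '6C': 1} | |||
| #   # shared key: '0C' only, so kernel = exp(-(3 - 3)**2 / 2) = 1.0; | |||
| #   # keys found in only one of the two graphs are ignored by the intersection. | |||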
| def get_canonkeys(G, node_label = 'atom', edge_label = 'bond_type', labeled = True): | |||
| """Generate canonical keys of all treelets in a graph. | |||
| Parameters | |||
| ---------- | |||
| G : NetworkX graphs | |||
| The graph in which keys are generated. | |||
| node_label : string | |||
| node attribute used as label. The default node label is atom. | |||
| edge_label : string | |||
| edge attribute used as label. The default edge label is bond_type. | |||
| labeled : boolean | |||
| Whether the graphs are labeled. The default is True. | |||
| Return | |||
| ------ | |||
| canonkey/canonkey_l : dict | |||
| For unlabeled graphs, canonkey is a dictionary which records the number of occurrences of each tree pattern. For labeled graphs, canonkey_l is one which keeps track of the number of occurrences of each treelet. | |||
| """ | |||
| patterns = {} # a dictionary which consists of lists of patterns for each graphlet. | |||
| canonkey = {} # canonical key, a dictionary which records the number of occurrences of each tree pattern. | |||
| ### structural analysis ### | |||
| ### In this section, a list of patterns is generated for each graphlet, where every pattern is represented by nodes ordered by | |||
| ### Morgan's extended labeling. | |||
| # linear patterns | |||
| patterns['0'] = G.nodes() | |||
| canonkey['0'] = nx.number_of_nodes(G) | |||
| for i in range(1, 6): # paths of length 1 to 5 | |||
| patterns[str(i)] = find_all_paths(G, i) | |||
| canonkey[str(i)] = len(patterns[str(i)]) | |||
| # n-star patterns | |||
| patterns['3star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3 ] | |||
| patterns['4star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4 ] | |||
| patterns['5star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5 ] | |||
| # n-star pattern counts | |||
| canonkey['6'] = len(patterns['3star']) | |||
| canonkey['8'] = len(patterns['4star']) | |||
| canonkey['d'] = len(patterns['5star']) | |||
| # pattern 7 | |||
| patterns['7'] = [] # the 1st line of Table 1 in Ref [1] | |||
| for pattern in patterns['3star']: | |||
| for i in range(1, len(pattern)): # for each neighbor of node 0 | |||
| if G.degree(pattern[i]) >= 2: | |||
| pattern_t = pattern[:] | |||
| pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] # set the node with degree >= 2 as the 4th node | |||
| for neighborx in G[pattern[i]]: | |||
| if neighborx != pattern[0]: | |||
| new_pattern = pattern_t + [ neighborx ] | |||
| patterns['7'].append(new_pattern) | |||
| canonkey['7'] = len(patterns['7']) | |||
| # pattern 11 | |||
| patterns['11'] = [] # the 4th line of Table 1 in Ref [1] | |||
| for pattern in patterns['4star']: | |||
| for i in range(1, len(pattern)): | |||
| if G.degree(pattern[i]) >= 2: | |||
| pattern_t = pattern[:] | |||
| pattern_t[i], pattern_t[4] = pattern_t[4], pattern_t[i] | |||
| for neighborx in G[pattern[i]]: | |||
| if neighborx != pattern[0]: | |||
| new_pattern = pattern_t + [ neighborx ] | |||
| patterns['11'].append(new_pattern) | |||
| canonkey['b'] = len(patterns['11']) | |||
| # pattern 12 | |||
| patterns['12'] = [] # the 5th line of Table 1 in Ref [1] | |||
| rootlist = [] # a list of root nodes whose extended label is 3 | |||
| for pattern in patterns['3star']: | |||
| if pattern[0] not in rootlist: # prevent counting the same pattern twice, once from each of the two root nodes | |||
| rootlist.append(pattern[0]) | |||
| for i in range(1, len(pattern)): | |||
| if G.degree(pattern[i]) >= 3: | |||
| rootlist.append(pattern[i]) | |||
| pattern_t = pattern[:] | |||
| pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] | |||
| for neighborx1 in G[pattern[i]]: | |||
| if neighborx1 != pattern[0]: | |||
| for neighborx2 in G[pattern[i]]: | |||
| if neighborx1 > neighborx2 and neighborx2 != pattern[0]: | |||
| new_pattern = pattern_t + [neighborx1] + [neighborx2] | |||
| patterns['12'].append(new_pattern) | |||
| canonkey['c'] = int(len(patterns['12']) / 2) | |||
| # pattern 9 | |||
| patterns['9'] = [] # the 2nd line of Table 1 in Ref [1] | |||
| for pattern in patterns['3star']: | |||
| for pairs in [ [neighbor1, neighbor2] for neighbor1 in G[pattern[0]] if G.degree(neighbor1) >= 2 \ | |||
| for neighbor2 in G[pattern[0]] if G.degree(neighbor2) >= 2 if neighbor1 > neighbor2 ]: | |||
| pattern_t = pattern[:] | |||
| # move nodes with extended label 4 to specific positions to correspond to their children | |||
| pattern_t[pattern_t.index(pairs[0])], pattern_t[2] = pattern_t[2], pattern_t[pattern_t.index(pairs[0])] | |||
| pattern_t[pattern_t.index(pairs[1])], pattern_t[3] = pattern_t[3], pattern_t[pattern_t.index(pairs[1])] | |||
| for neighborx1 in G[pairs[0]]: | |||
| if neighborx1 != pattern[0]: | |||
| for neighborx2 in G[pairs[1]]: | |||
| if neighborx2 != pattern[0]: | |||
| new_pattern = pattern_t + [neighborx1] + [neighborx2] | |||
| patterns['9'].append(new_pattern) | |||
| canonkey['9'] = len(patterns['9']) | |||
| # pattern 10 | |||
| patterns['10'] = [] # the 3rd line of Table 1 in Ref [1] | |||
| for pattern in patterns['3star']: | |||
| for i in range(1, len(pattern)): | |||
| if G.degree(pattern[i]) >= 2: | |||
| for neighborx in G[pattern[i]]: | |||
| if neighborx != pattern[0] and G.degree(neighborx) >= 2: | |||
| pattern_t = pattern[:] | |||
| pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] | |||
| new_patterns = [ pattern_t + [neighborx] + [neighborxx] for neighborxx in G[neighborx] if neighborxx != pattern[i] ] | |||
| patterns['10'].extend(new_patterns) | |||
| canonkey['a'] = len(patterns['10']) | |||
| ### labeling information ### | |||
| ### In this section, a list of canonical keys is generated for every pattern obtained in the structural analysis | |||
| ### section above, which is a string corresponding to a unique treelet. A dictionary is built to keep track of | |||
| ### the number of occurrences of each treelet. | |||
| if labeled: | |||
| canonkey_l = {} # canonical key, a dictionary which keeps track of the number of occurrences of each treelet. | |||
| # linear patterns | |||
| canonkey_t = Counter(list(nx.get_node_attributes(G, node_label).values())) | |||
| for key in canonkey_t: | |||
| canonkey_l['0' + key] = canonkey_t[key] | |||
| for i in range(1, 6): # paths of length 1 to 5 | |||
| treelet = [] | |||
| for pattern in patterns[str(i)]: | |||
| canonlist = list(chain.from_iterable((G.node[node][node_label], \ | |||
| G[node][pattern[idx+1]][edge_label]) for idx, node in enumerate(pattern[:-1]))) | |||
| canonlist.append(G.node[pattern[-1]][node_label]) | |||
| canonkey_t = ''.join(canonlist) | |||
| canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1] | |||
| treelet.append(str(i) + canonkey_t) | |||
| canonkey_l.update(Counter(treelet)) | |||
| # n-star patterns | |||
| for i in range(3, 6): | |||
| treelet = [] | |||
| for pattern in patterns[str(i) + 'star']: | |||
| canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:] ] | |||
| canonlist.sort() | |||
| canonkey_t = ('d' if i == 5 else str(i * 2)) + G.node[pattern[0]][node_label] + ''.join(canonlist) | |||
| treelet.append(canonkey_t) | |||
| canonkey_l.update(Counter(treelet)) | |||
| # pattern 7 | |||
| treelet = [] | |||
| for pattern in patterns['7']: | |||
| canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] | |||
| canonlist.sort() | |||
| canonkey_t = '7' + G.node[pattern[0]][node_label] + ''.join(canonlist) \ | |||
| + G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \ | |||
| + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] | |||
| treelet.append(canonkey_t) | |||
| canonkey_l.update(Counter(treelet)) | |||
| # pattern 11 | |||
| treelet = [] | |||
| for pattern in patterns['11']: | |||
| canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:4] ] | |||
| canonlist.sort() | |||
| canonkey_t = 'b' + G.node[pattern[0]][node_label] + ''.join(canonlist) \ | |||
| + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[0]][edge_label] \ | |||
| + G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label] | |||
| treelet.append(canonkey_t) | |||
| canonkey_l.update(Counter(treelet)) | |||
| # pattern 10 | |||
| treelet = [] | |||
| for pattern in patterns['10']: | |||
| canonkey4 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label] | |||
| canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] | |||
| canonlist.sort() | |||
| canonkey0 = ''.join(canonlist) | |||
| canonkey_t = 'a' + G.node[pattern[3]][node_label] \ | |||
| + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] \ | |||
| + G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \ | |||
| + canonkey4 + canonkey0 | |||
| treelet.append(canonkey_t) | |||
| canonkey_l.update(Counter(treelet)) | |||
| # pattern 12 | |||
| treelet = [] | |||
| for pattern in patterns['12']: | |||
| canonlist0 = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] | |||
| canonlist0.sort() | |||
| canonlist3 = [ G.node[leaf][node_label] + G[leaf][pattern[3]][edge_label] for leaf in pattern[4:6] ] | |||
| canonlist3.sort() | |||
| # 2 possible keys can be generated from the 2 nodes with extended label 3; select the one with the lower lexicographic order. | |||
| canonkey_t1 = 'c' + G.node[pattern[0]][node_label] \ | |||
| + ''.join(canonlist0) \ | |||
| + G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \ | |||
| + ''.join(canonlist3) | |||
| canonkey_t2 = 'c' + G.node[pattern[3]][node_label] \ | |||
| + ''.join(canonlist3) \ | |||
| + G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \ | |||
| + ''.join(canonlist0) | |||
| treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) | |||
| canonkey_l.update(Counter(treelet)) | |||
| # pattern 9 | |||
| treelet = [] | |||
| for pattern in patterns['9']: | |||
| canonkey2 = G.node[pattern[4]][node_label] + G[pattern[4]][pattern[2]][edge_label] | |||
| canonkey3 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[3]][edge_label] | |||
| prekey2 = G.node[pattern[2]][node_label] + G[pattern[2]][pattern[0]][edge_label] | |||
| prekey3 = G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] | |||
| if prekey2 + canonkey2 < prekey3 + canonkey3: | |||
| canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \ | |||
| + prekey2 + prekey3 + canonkey2 + canonkey3 | |||
| else: | |||
| canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \ | |||
| + prekey3 + prekey2 + canonkey3 + canonkey2 | |||
| treelet.append('9' + G.node[pattern[0]][node_label] + canonkey_t) | |||
| canonkey_l.update(Counter(treelet)) | |||
| return canonkey_l | |||
| return canonkey | |||
| def find_paths(G, source_node, length): | |||
| """Find all paths with a certain length those start from a source node. A recursive depth first search is applied. | |||
| Parameters | |||
| ---------- | |||
| G : NetworkX graphs | |||
| The graph in which paths are searched. | |||
| source_node : integer | |||
| The node from which all paths start. | |||
| length : integer | |||
| The length of paths. | |||
| Return | |||
| ------ | |||
| path : list of list | |||
| List of paths retrieved, where each path is represented by a list of nodes. | |||
| """ | |||
| if length == 0: | |||
| return [[source_node]] | |||
| path = [ [source_node] + path for neighbor in G[source_node] \ | |||
| for path in find_paths(G, neighbor, length - 1) if source_node not in path ] | |||
| return path | |||
| def find_all_paths(G, length): | |||
| """Find all paths with a certain length in a graph. A recursive depth first search is applied. | |||
| Parameters | |||
| ---------- | |||
| G : NetworkX graphs | |||
| The graph in which paths are searched. | |||
| length : integer | |||
| The length of paths. | |||
| Return | |||
| ------ | |||
| path : list of list | |||
| List of paths retrieved, where each path is represented by a list of nodes. | |||
| """ | |||
| all_paths = [] | |||
| for node in G: | |||
| all_paths.extend(find_paths(G, node, length)) | |||
| all_paths_r = [ path[::-1] for path in all_paths ] | |||
| # Each path is retrieved twice, once from each of its two extremities; remove one of the two representations. | |||
| for idx, path in enumerate(all_paths[:-1]): | |||
| for path2 in all_paths_r[idx+1::]: | |||
| if path == path2: | |||
| all_paths[idx] = [] | |||
| break | |||
| return list(filter(lambda a: a != [], all_paths)) | |||
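| # A possible simplification (a sketch, not the author's code; assumes node | |||
| # identifiers are mutually comparable): the quadratic reverse-comparison | |||
| # above can be replaced by canonicalising each path's orientation, at the | |||
| # cost of not preserving the original path order: | |||
| # def find_all_paths_canonical(G, length): | |||
| #     canon = {min(tuple(p), tuple(p[::-1])) for node in G | |||
| #              for p in find_paths(G, node, length)} | |||
| #     return [list(p) for p in canon] | |||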
| @@ -60,6 +60,7 @@ def untilhpathkernel(*args, | |||
| # pre-process | |||
| depth = int(depth) | |||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||
| Gn = [g.copy() for g in Gn] | |||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| ds_attrs = get_dataset_attributes( | |||
| Gn, | |||
| @@ -0,0 +1,549 @@ | |||
| """ | |||
| @author: linlin | |||
| @references: | |||
| [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. | |||
| Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. | |||
| 2011;12(Sep):2539-61. | |||
| """ | |||
| import sys | |||
| from collections import Counter | |||
| sys.path.insert(0, "../") | |||
| from functools import partial | |||
| import time | |||
| #from multiprocessing import Pool | |||
| from tqdm import tqdm | |||
| import networkx as nx | |||
| import numpy as np | |||
| #from pygraph.kernels.pathKernel import pathkernel | |||
| from pygraph.utils.graphdataset import get_dataset_attributes | |||
| from pygraph.utils.parallel import parallel_gm | |||
| # @todo: support edge kernel, sp kernel, user-defined kernel. | |||
| def weisfeilerlehmankernel(*args, | |||
| node_label='atom', | |||
| edge_label='bond_type', | |||
| height=0, | |||
| base_kernel='subtree', | |||
| parallel=None, | |||
| n_jobs=None, | |||
| verbose=True): | |||
| """Calculate Weisfeiler-Lehman kernels between graphs. | |||
| Parameters | |||
| ---------- | |||
| Gn : List of NetworkX graph | |||
| List of graphs between which the kernels are calculated. | |||
| / | |||
| G1, G2 : NetworkX graphs | |||
| 2 graphs between which the kernel is calculated. | |||
| node_label : string | |||
| node attribute used as label. The default node label is atom. | |||
| edge_label : string | |||
| edge attribute used as label. The default edge label is bond_type. | |||
| height : int | |||
| subtree height | |||
| base_kernel : string | |||
| base kernel used in each iteration of the WL kernel. The default base kernel is the subtree kernel. For a user-defined kernel, base_kernel is the name of the base kernel function used in each iteration; this function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 graphs. | |||
| Return | |||
| ------ | |||
| Kmatrix : Numpy matrix | |||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs. | |||
| Notes | |||
| ----- | |||
| This function now supports WL subtree kernel only. | |||
| """ | |||
| # pre-process | |||
| base_kernel = base_kernel.lower() | |||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list | |||
| Gn = [g.copy() for g in Gn] | |||
| ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'], | |||
| node_label=node_label) | |||
| if not ds_attrs['node_labeled']: | |||
| for G in Gn: | |||
| nx.set_node_attributes(G, '0', 'atom') | |||
| start_time = time.time() | |||
| # for WL subtree kernel | |||
| if base_kernel == 'subtree': | |||
| Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose) | |||
| # for WL shortest path kernel | |||
| elif base_kernel == 'sp': | |||
| Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height) | |||
| # for WL edge kernel | |||
| elif base_kernel == 'edge': | |||
| Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height) | |||
| # for user defined base kernel | |||
| else: | |||
| Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel) | |||
| run_time = time.time() - start_time | |||
| if verbose: | |||
| print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" | |||
| % (base_kernel, len(Gn), run_time)) # len(Gn) also covers the 2-graph call, where args[0] is a single graph | |||
| return Kmatrix, run_time | |||
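| # A usage sketch (an assumption: the module path below and the toy labels | |||
| # are illustrative, not taken from this diff): | |||
| # import networkx as nx | |||
| # from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||
| # g1, g2 = nx.path_graph(3), nx.cycle_graph(3) | |||
| # for g in (g1, g2): | |||
| #     nx.set_node_attributes(g, {n: {'atom': 'C'} for n in g}) | |||
| # K, t = weisfeilerlehmankernel([g1, g2], height=2)  # 2x2 kernel matrix | |||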
| def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose): | |||
| """Calculate Weisfeiler-Lehman kernels between graphs. | |||
| Parameters | |||
| ---------- | |||
| Gn : List of NetworkX graph | |||
| List of graphs between which the kernels are calculated. | |||
| node_label : string | |||
| node attribute used as label. | |||
| edge_label : string | |||
| edge attribute used as label. | |||
| height : int | |||
| wl height. | |||
| Return | |||
| ------ | |||
| Kmatrix : Numpy matrix | |||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs. | |||
| """ | |||
| height = int(height) | |||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
| # initial for height = 0 | |||
| all_num_of_each_label = [] # number of occurrences of each label in each graph in this iteration | |||
| # for each graph | |||
| for G in Gn: | |||
| # get the set of original labels | |||
| labels_ori = list(nx.get_node_attributes(G, node_label).values()) | |||
| # number of occurrences of each label in G | |||
| all_num_of_each_label.append(dict(Counter(labels_ori))) | |||
| # calculate subtree kernel with the 0th iteration and add it to the final kernel | |||
| compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False) | |||
| # iterate each height | |||
| for h in range(1, height + 1): | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of distinct labels that have occurred as node labels at least once over all graphs so far | |||
| all_num_of_each_label = [] # number of occurrences of each label in each graph | |||
| # @todo: parallel this part. | |||
| for idx, G in enumerate(Gn): | |||
| all_multisets = [] | |||
| for node, attrs in G.nodes(data=True): | |||
| # Multiset-label determination. | |||
| multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]] | |||
| # sorting each multiset | |||
| multiset.sort() | |||
| multiset = [attrs[node_label]] + multiset # add the prefix | |||
| all_multisets.append(tuple(multiset)) | |||
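| # e.g. a node labelled '1' whose neighbours carry labels '3' and '2' | |||
| # yields the multiset label ('1', '2', '3') | |||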
| # label compression | |||
| set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
| # a dictionary mapping original labels to new ones. | |||
| set_compressed = {} | |||
| # if a label occurred before, assign its former compressed label, | |||
| # else assign the number of labels occurred + 1 as the compressed label. | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): | |||
| set_compressed.update({value: all_set_compressed[value]}) | |||
| else: | |||
| set_compressed.update({value: str(num_of_labels_occured + 1)}) | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| # relabel nodes | |||
| for idx, node in enumerate(G.nodes()): | |||
| G.nodes[node][node_label] = set_compressed[all_multisets[idx]] | |||
| # get the set of compressed labels | |||
| labels_comp = list(nx.get_node_attributes(G, node_label).values()) | |||
| all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
| # calculate subtree kernel with h iterations and add it to the final kernel | |||
| compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False) | |||
| return Kmatrix | |||
| def wl_iteration(G, node_label): | |||
| all_multisets = [] | |||
| for node, attrs in G.nodes(data=True): | |||
| # Multiset-label determination. | |||
| multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]] | |||
| # sorting each multiset | |||
| multiset.sort() | |||
| multiset = [attrs[node_label]] + multiset # add the prefix | |||
| all_multisets.append(tuple(multiset)) | |||
| return all_multisets | |||
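| # A toy illustration (an assumption, not part of the module): for a labelled | |||
| # path graph a-b-a, wl_iteration returns one multiset label per node, | |||
| # [('a', 'b'), ('b', 'a', 'a'), ('a', 'b')]; identical multisets are then | |||
| # compressed to the same new label by the caller. | |||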
| def wrapper_wl_iteration(node_label, itr_item): | |||
| g = itr_item[0] | |||
| i = itr_item[1] | |||
| all_multisets = wl_iteration(g, node_label) | |||
| return i, all_multisets | |||
| def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose): | |||
| """Compute kernel matrix using the base kernel. | |||
| """ | |||
| if parallel == 'imap_unordered': | |||
| # compute kernels. | |||
| def init_worker(alllabels_toshare): | |||
| global G_alllabels | |||
| G_alllabels = alllabels_toshare | |||
| do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix) | |||
| parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||
| glbv=(all_num_of_each_label,), n_jobs=n_jobs, verbose=verbose) | |||
| elif parallel is None: | |||
| for i in range(len(Kmatrix)): | |||
| for j in range(i, len(Kmatrix)): | |||
| Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i], | |||
| all_num_of_each_label[j], Kmatrix[i][j]) | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel): | |||
| """Compute the subtree kernel. | |||
| """ | |||
| labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys())) | |||
| vector1 = np.array([(num_of_each_label1[label] | |||
| if (label in num_of_each_label1.keys()) else 0) | |||
| for label in labels]) | |||
| vector2 = np.array([(num_of_each_label2[label] | |||
| if (label in num_of_each_label2.keys()) else 0) | |||
| for label in labels]) | |||
| kernel += np.dot(vector1, vector2) | |||
| return kernel | |||
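| # e.g. label counts {'1': 2, '2': 1} and {'1': 1, '3': 4} share only | |||
| # label '1', so this iteration adds 2 * 1 = 2 to the running kernel value. | |||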
| def wrapper_compute_subtree_kernel(Kmatrix, itr): | |||
| i = itr[0] | |||
| j = itr[1] | |||
| return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j], Kmatrix[i][j]) | |||
| def _wl_spkernel_do(Gn, node_label, edge_label, height): | |||
| """Calculate Weisfeiler-Lehman shortest path kernels between graphs. | |||
| Parameters | |||
| ---------- | |||
| Gn : List of NetworkX graph | |||
| List of graphs between which the kernels are calculated. | |||
| node_label : string | |||
| node attribute used as label. | |||
| edge_label : string | |||
| edge attribute used as label. | |||
| height : int | |||
| subtree height. | |||
| Return | |||
| ------ | |||
| Kmatrix : Numpy matrix | |||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs. | |||
| """ | |||
| from pygraph.utils.utils import getSPGraph | |||
| # init. | |||
| height = int(height) | |||
| Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel | |||
| Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn | |||
| # initial for height = 0 | |||
| for i in range(0, len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| for e1 in Gn[i].edges(data = True): | |||
| for e2 in Gn[j].edges(data = True): | |||
| if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): | |||
| Kmatrix[i][j] += 1 | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| # iterate each height | |||
| for h in range(1, height + 1): | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of distinct labels that have occurred as node labels at least once over all graphs so far | |||
| for G in Gn: # for each graph | |||
| set_multisets = [] | |||
| for node in G.nodes(data = True): | |||
| # Multiset-label determination. | |||
| multiset = [ G.nodes[neighbors][node_label] for neighbors in G[node[0]] ] | |||
| # sorting each multiset | |||
| multiset.sort() | |||
| multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix | |||
| set_multisets.append(multiset) | |||
| # label compression | |||
| set_unique = list(set(set_multisets)) # set of unique multiset labels | |||
| # a dictionary mapping original labels to new ones. | |||
| set_compressed = {} | |||
| # if a label occurred before, assign its former compressed label, else assign the number of labels occurred + 1 as the compressed label | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): | |||
| set_compressed.update({ value : all_set_compressed[value] }) | |||
| else: | |||
| set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| # relabel nodes | |||
| for idx, node in enumerate(G.nodes(data = True)): # relabel in the same iteration order used to build set_multisets | |||
| node[1][node_label] = set_compressed[set_multisets[idx]] | |||
| # calculate subtree kernel with h iterations and add it to the final kernel | |||
| for i in range(0, len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| for e1 in Gn[i].edges(data = True): | |||
| for e2 in Gn[j].edges(data = True): | |||
| if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): | |||
| Kmatrix[i][j] += 1 | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| return Kmatrix | |||
| def _wl_edgekernel_do(Gn, node_label, edge_label, height): | |||
| """Calculate Weisfeiler-Lehman edge kernels between graphs. | |||
| Parameters | |||
| ---------- | |||
| Gn : List of NetworkX graph | |||
| List of graphs between which the kernels are calculated. | |||
| node_label : string | |||
| node attribute used as label. | |||
| edge_label : string | |||
| edge attribute used as label. | |||
| height : int | |||
| subtree height. | |||
| Return | |||
| ------ | |||
| Kmatrix : Numpy matrix | |||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs. | |||
| """ | |||
| # init. | |||
| height = int(height) | |||
| Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel | |||
| # initial for height = 0 | |||
| for i in range(0, len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| for e1 in Gn[i].edges(data = True): | |||
| for e2 in Gn[j].edges(data = True): | |||
| if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): | |||
| Kmatrix[i][j] += 1 | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| # iterate each height | |||
| for h in range(1, height + 1): | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of distinct labels that have occurred as node labels at least once over all graphs so far | |||
| for G in Gn: # for each graph | |||
| set_multisets = [] | |||
| for node in G.nodes(data = True): | |||
| # Multiset-label determination. | |||
| multiset = [ G.nodes[neighbors][node_label] for neighbors in G[node[0]] ] | |||
| # sorting each multiset | |||
| multiset.sort() | |||
| multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix | |||
| set_multisets.append(multiset) | |||
| # label compression | |||
| set_unique = list(set(set_multisets)) # set of unique multiset labels | |||
| # a dictionary mapping original labels to new ones. | |||
| set_compressed = {} | |||
| # if a label occurred before, assign its former compressed label, else assign the number of labels occurred + 1 as the compressed label | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): | |||
| set_compressed.update({ value : all_set_compressed[value] }) | |||
| else: | |||
| set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| # relabel nodes | |||
| for idx, node in enumerate(G.nodes(data = True)): # relabel in the same iteration order used to build set_multisets | |||
| node[1][node_label] = set_compressed[set_multisets[idx]] | |||
| # calculate subtree kernel with h iterations and add it to the final kernel | |||
| for i in range(0, len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| for e1 in Gn[i].edges(data = True): | |||
| for e2 in Gn[j].edges(data = True): | |||
| if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): | |||
| Kmatrix[i][j] += 1 | |||
| Kmatrix[j][i] = Kmatrix[i][j] | |||
| return Kmatrix | |||
| def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel): | |||
| """Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs. | |||
| Parameters | |||
| ---------- | |||
| Gn : List of NetworkX graph | |||
| List of graphs between which the kernels are calculated. | |||
| node_label : string | |||
| node attribute used as label. | |||
| edge_label : string | |||
| edge attribute used as label. | |||
| height : int | |||
| subtree height. | |||
| base_kernel : string | |||
| Name of the base kernel function used in each iteration of the WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 graphs. | |||
| Return | |||
| ------ | |||
| Kmatrix : Numpy matrix | |||
| Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs. | |||
| """ | |||
| # init. | |||
| height = int(height) | |||
| Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel | |||
| # initial for height = 0 | |||
| Kmatrix = base_kernel(Gn, node_label, edge_label) | |||
| # iterate each height | |||
| for h in range(1, height + 1): | |||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
| num_of_labels_occured = 0 # number of distinct labels that have occurred as node labels at least once over all graphs so far | |||
| for G in Gn: # for each graph | |||
| set_multisets = [] | |||
| for node in G.nodes(data = True): | |||
| # Multiset-label determination. | |||
| multiset = [ G.nodes[neighbors][node_label] for neighbors in G[node[0]] ] | |||
| # sorting each multiset | |||
| multiset.sort() | |||
| multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix | |||
| set_multisets.append(multiset) | |||
| # label compression | |||
| set_unique = list(set(set_multisets)) # set of unique multiset labels | |||
| # a dictionary mapping original labels to new ones. | |||
| set_compressed = {} | |||
| # if a label occurred before, assign its former compressed label, else assign the number of labels occurred + 1 as the compressed label | |||
| for value in set_unique: | |||
| if value in all_set_compressed.keys(): | |||
| set_compressed.update({ value : all_set_compressed[value] }) | |||
| else: | |||
| set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||
| num_of_labels_occured += 1 | |||
| all_set_compressed.update(set_compressed) | |||
| # relabel nodes | |||
| for idx, node in enumerate(G.nodes(data = True)): # relabel in the same iteration order used to build set_multisets | |||
| node[1][node_label] = set_compressed[set_multisets[idx]] | |||
| # calculate kernel with h iterations and add it to the final kernel | |||
| Kmatrix += base_kernel(Gn, node_label, edge_label) | |||
| return Kmatrix | |||
| @@ -61,7 +61,7 @@ def polynomialkernel(x, y, d=1, c=0): | |||
| """Polynomial kernel. | |||
| Compute the polynomial kernel between x and y: | |||
| - K(x, y) = (x^Ty)^d + c. | |||
| + K(x, y) = <x, y> ^d + c. | |||
| Parameters | |||
| ---------- | |||
| @@ -78,6 +78,27 @@ def polynomialkernel(x, y, d=1, c=0): | |||
| return np.dot(x, y) ** d + c | |||
| def linearkernel(x, y): | |||
| """Polynomial kernel. | |||
| Compute the polynomial kernel between x and y: | |||
| K(x, y) = <x, y>. | |||
| Parameters | |||
| ---------- | |||
| x, y : array | |||
| d : integer, default 1 | |||
| c : float, default 0 | |||
| Returns | |||
| ------- | |||
| kernel : float | |||
| """ | |||
| return np.dot(x, y) | |||
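| # Note: equivalent to polynomialkernel(x, y, d=1, c=0); e.g. | |||
| # linearkernel(np.array([1., 2.]), np.array([3., 4.])) == 11.0 | |||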
| def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1): | |||
| """Sum of a pair of kernels. | |||
| @@ -241,4 +241,22 @@ def graph_isIdentical(G1, G2): | |||
| return False | |||
| # check graph attributes. | |||
| return True | |||
| def get_node_labels(Gn, node_label): | |||
| """Get node labels of dataset Gn. | |||
| """ | |||
| nl = set() | |||
| for G in Gn: | |||
| nl = nl | set(nx.get_node_attributes(G, node_label).values()) | |||
| return nl | |||
| def get_edge_labels(Gn, edge_label): | |||
| """Get edge labels of dataset Gn. | |||
| """ | |||
| el = set() | |||
| for G in Gn: | |||
| el = el | set(nx.get_edge_attributes(G, edge_label).values()) | |||
| return el | |||
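| # A short usage sketch (toy graph; 'atom' and 'bond_type' as in this repo): | |||
| # G = nx.Graph() | |||
| # G.add_node(0, atom='C'); G.add_node(1, atom='O') | |||
| # G.add_edge(0, 1, bond_type='1') | |||
| # get_node_labels([G], 'atom')       # -> {'C', 'O'} | |||
| # get_edge_labels([G], 'bond_type')  # -> {'1'} | |||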