@@ -44,6 +44,8 @@ Simply clone this repository and voilà! Then check [`notebooks`](https://github
 * The MinMax kernel
 * Non-linear kernels
 * The treelet kernel [10]
+* Weisfeiler-Lehman kernel [11]
+  * Subtree
 ## Computation optimization methods
@@ -92,6 +94,8 @@ Linlin Jia, Benoit Gaüzère, and Paul Honeine. Graph Kernels Based on Linear Pa
 [10] Gaüzere, B., Brun, L., Villemin, D., 2012. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters 33, 2038–2047.
+[11] Shervashidze, N., Schweitzer, P., van Leeuwen, E.J., Mehlhorn, K., Borgwardt, K.M., 2011. Weisfeiler-Lehman graph kernels. Journal of Machine Learning Research 12, 2539–2561.
 ## Authors
 * [Linlin Jia](https://github.com/jajupmochi), LITIS, INSA Rouen Normandie
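For readers new to the kernel just added to the list above, here is a minimal, self-contained sketch of the Weisfeiler-Lehman subtree computation in the spirit of [11]. It is illustrative only; the function names and the string-based label compression are this sketch's own, not this repository's `weisfeilerlehmankernel` API.

```python
# Illustrative WL subtree kernel (not the repository's implementation).
# Each iteration relabels a node with its own label plus the sorted multiset
# of neighbour labels; the kernel counts labels shared by the two graphs.
from collections import Counter

import networkx as nx


def wl_features(G, h):
    """Counter of all labels seen over h WL refinement iterations."""
    labels = {v: str(G.nodes[v].get('label', '')) for v in G.nodes}
    feats = Counter(labels.values())
    for _ in range(h):
        labels = {v: labels[v] + '|' + ','.join(sorted(labels[u] for u in G[v]))
                  for v in G.nodes}
        feats.update(labels.values())
    return feats


def wl_subtree_kernel(G1, G2, h=2):
    """Dot product of the two graphs' WL feature count vectors."""
    f1, f2 = wl_features(G1, h), wl_features(G2, h)
    return sum(f1[k] * f2[k] for k in f1.keys() & f2.keys())


if __name__ == '__main__':
    # toy unlabeled graphs; real datasets carry node labels
    print(wl_subtree_kernel(nx.path_graph(4), nx.cycle_graph(4), h=2))
```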
@@ -0,0 +1,188 @@
+1
+-1
+-1
+1
+-1
+1
+-1
+1
+-1
+1
+1
+1
+1
+-1
+1
+1
+-1
+1
+-1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+-1
+1
+-1
+1
+-1
+-1
+-1
+1
+-1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+-1
+1
+1
+1
+1
+1
+1
+-1
+1
+1
+-1
+-1
+1
+1
+1
+-1
+1
+1
+-1
+1
+1
+-1
+-1
+-1
+1
+1
+1
+1
+1
+-1
+1
+1
+1
+-1
+-1
+1
+1
+1
+1
+1
+1
+1
+1
+-1
+1
+-1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+-1
+-1
+1
+-1
+-1
+1
+-1
+1
+1
+-1
+-1
+1
+1
+-1
+-1
+1
+1
+1
+1
+-1
+-1
+-1
+-1
+-1
+1
+-1
+1
+1
+-1
+-1
+1
+-1
+-1
+-1
+-1
+1
+1
+-1
+1
+1
+-1
+1
+1
+1
+-1
+-1
+-1
+1
+1
+1
+-1
+1
+1
+1
+1
+1
+1
+1
+-1
+1
+1
+1
+1
+1
+1
+-1
+1
+1
+1
+-1
+1
+-1
+-1
+1
+1
+-1
+-1
+1
+-1
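The block above is the graph-label file added for MUTAG: one class label (+1 or -1) per graph, 188 in total. A quick sanity check (the path is assumed, following the DS_graph_labels.txt naming scheme described further below):

```python
import numpy as np

# Assumed path; one +1/-1 class label per graph, as in the file above.
labels = np.loadtxt('../datasets/MUTAG/MUTAG_graph_labels.txt', dtype=int)
values, counts = np.unique(labels, return_counts=True)
print(len(labels), dict(zip(values.tolist(), counts.tolist())))
```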
@@ -1,68 +0,0 @@
-=== Introduction ===
-This folder contains 6 data sets of undirected labeled graphs in Matlab format for graph
-classification: MUTAG, PTC, NCI1, NCI109, ENZYMES, and D&D.
-=== Usage ===
-For each data set X, the Matlab command
-load X
-loads into the memory a struct array containing graphs, and a column vector lx containing
-a class label for each graph.
-X(i).am is the adjacency matrix of the i'th graph,
-X(i).al is the adjacency list of the i'th graph,
-X(i).nl.values is a column vector of node labels for the i'th graph,
-X(i).el (not always available) contains edge labels for the i'th graph.
-Example:
-typing "load MUTAG" in MATLAB
-loads a 188 element array of graph structures, called MUTAG, and a column of 188 numbers,
-each of which indicates the class that the corresponding graph belongs to.
-=== Description ===
-MUTAG (Debnath et al., 1991) is a data set of 188 mutagenic aromatic and heteroaromatic
-nitro compounds labeled according to whether or not they have a mutagenic effect on the
-Gram-negative bacterium Salmonella typhimurium.
-PTC (Toivonen et al., 2003) contains 344 chemical compounds tested for carcinogenicity
-in mice and rats. The classification task is to predict the carcinogenicity of compounds.
-NCI1 and NCI109 represent two balanced subsets of data sets of chemical compounds screened
-for activity against non-small cell lung cancer and ovarian cancer cell lines respectively
-(Wale and Karypis (2006) and http://pubchem.ncbi.nlm.nih.gov).
-ENZYMES is a data set of protein tertiary structures obtained from (Borgwardt et al.,
-2005) consisting of 600 enzymes from the BRENDA enzyme database (Schomburg et al., 2004).
-In this case the task is to correctly assign each enzyme to one of the 6 EC top-level
-classes.
-D&D is a data set of 1178 protein structures (Dobson and Doig, 2003). Each protein is
-represented by a graph, in which the nodes are amino acids and two nodes are connected
-by an edge if they are less than 6 Angstroms apart. The prediction task is to classify
-the protein structures into enzymes and non-enzymes.
-=== References ===
-K. M. Borgwardt, C. S. Ong, S. Schoenauer, S. V. N. Vishwanathan, A. J. Smola, and H. P.
-Kriegel. Protein function prediction via graph kernels. Bioinformatics, 21(Suppl 1):i47–i56,
-Jun 2005.
-A. K. Debnath, R. L. Lopez de Compadre, G. Debnath, A. J. Shusterman, and C. Hansch.
-Structure-activity relationship of mutagenic aromatic and heteroaromatic nitro compounds.
-Correlation with molecular orbital energies and hydrophobicity. J Med Chem, 34: 786–797,
-1991.
-P. D. Dobson and A. J. Doig. Distinguishing enzyme structures from non-enzymes without
-alignments. J Mol Biol, 330(4):771–783, Jul 2003.
-I. Schomburg, A. Chang, C. Ebeling, M. Gremse, C. Heldt, G. Huhn, and D. Schomburg. Brenda,
-the enzyme database: updates and major new developments. Nucleic Acids Research, 32D:431–433,
-2004.
-H. Toivonen, A. Srinivasan, R.D. King, S. Kramer, and C. Helma (2003). Statistical
-evaluation of the predictive toxicology challenge 2000-2001. Bioinformatics, 19(10):1183–1193.
-N. Wale and G. Karypis. Comparison of descriptor spaces for chemical compound retrieval and
-classification. In Proc. of ICDM, pages 678–689, Hong Kong, 2006.
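For reference before this description is removed: a hedged sketch of reading the Matlab layout from Python with `scipy.io.loadmat`. The struct fields (am, al, nl, el) come from the README above; the label-vector name 'lmutag' is an assumption for MUTAG (the README only calls it lx).

```python
# Illustrative only: read the (now removed) Matlab layout from Python.
from scipy.io import loadmat

data = loadmat('MUTAG.mat', squeeze_me=True, struct_as_record=False)
graphs = data['MUTAG']      # struct array of graphs, per the README above
y = data['lmutag']          # assumed name of the class-label vector
print(graphs[0].am.shape)   # adjacency matrix of the first graph
print(int(y[0]))            # its class label
```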
@@ -0,0 +1,85 @@
+README for dataset MUTAG
+=== Usage ===
+This folder contains the following comma separated text files
+(replace DS by the name of the dataset):
+n = total number of nodes
+m = total number of edges
+N = number of graphs
+(1) DS_A.txt (m lines)
+sparse (block diagonal) adjacency matrix for all graphs,
+each line corresponds to (row, col), i.e. (node_id, node_id)
+(2) DS_graph_indicator.txt (n lines)
+column vector of graph identifiers for all nodes of all graphs,
+the value in the i-th line is the graph_id of the node with node_id i
+(3) DS_graph_labels.txt (N lines)
+class labels for all graphs in the dataset,
+the value in the i-th line is the class label of the graph with graph_id i
+(4) DS_node_labels.txt (n lines)
+column vector of node labels,
+the value in the i-th line corresponds to the node with node_id i
+There are OPTIONAL files if the respective information is available:
+(5) DS_edge_labels.txt (m lines; same size as DS_A.txt)
+labels for the edges in DS_A.txt
+(6) DS_edge_attributes.txt (m lines; same size as DS_A.txt)
+attributes for the edges in DS_A.txt
+(7) DS_node_attributes.txt (n lines)
+matrix of node attributes,
+the comma separated values in the i-th line form the attribute vector of the node with node_id i
+(8) DS_graph_attributes.txt (N lines)
+regression values for all graphs in the dataset,
+the value in the i-th line is the attribute of the graph with graph_id i
+=== Description of the dataset ===
+The MUTAG dataset consists of 188 chemical compounds divided into two
+classes according to their mutagenic effect on a bacterium.
+The chemical data was obtained from http://cdb.ics.uci.edu and converted
+to graphs, where vertices represent atoms and edges represent chemical
+bonds. Explicit hydrogen atoms have been removed and vertices are labeled
+by atom type and edges by bond type (single, double, triple or aromatic).
+Chemical data was processed using the Chemistry Development Kit (v1.4).
+Node labels:
+0 C
+1 N
+2 O
+3 F
+4 I
+5 Cl
+6 Br
+Edge labels:
+0 aromatic
+1 single
+2 double
+3 triple
+=== Previous Use of the Dataset ===
+Kriege, N., Mutzel, P.: Subgraph matching kernels for attributed graphs. In: Proceedings
+of the 29th International Conference on Machine Learning (ICML-2012) (2012).
+=== References ===
+Debnath, A.K., Lopez de Compadre, R.L., Debnath, G., Shusterman, A.J., and Hansch, C.
+Structure-activity relationship of mutagenic aromatic and heteroaromatic nitro compounds.
+Correlation with molecular orbital energies and hydrophobicity. J. Med. Chem. 34(2):786-797 (1991).
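To make the file layout above concrete, here is a minimal loader for the four mandatory files, building one networkx graph per graph_id. This is an illustrative sketch, not the repository's own reader:

```python
# Minimal reader for the text format described above (illustrative sketch).
import networkx as nx
import numpy as np


def load_tu_dataset(prefix):
    """prefix is the DS path stem, e.g. '../datasets/MUTAG/MUTAG'."""
    edges = np.loadtxt(prefix + '_A.txt', delimiter=',', dtype=int)
    graph_ids = np.loadtxt(prefix + '_graph_indicator.txt', dtype=int)
    graph_labels = np.loadtxt(prefix + '_graph_labels.txt', dtype=int)
    node_labels = np.loadtxt(prefix + '_node_labels.txt', dtype=int)

    graphs = {gid: nx.Graph() for gid in np.unique(graph_ids)}
    # node_ids are 1-based: line i of the indicator/label files is node i
    for node_id, (gid, lab) in enumerate(zip(graph_ids, node_labels), start=1):
        graphs[gid].add_node(node_id, label=int(lab))
    for u, v in edges:  # each undirected edge appears in both directions
        graphs[graph_ids[u - 1]].add_edge(int(u), int(v))
    return list(graphs.values()), graph_labels


graphs, y = load_tu_dataset('../datasets/MUTAG/MUTAG')
print(len(graphs), len(y))  # 188 and 188 for MUTAG
```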
@@ -73,20 +73,18 @@
 " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
 " 'task': 'regression'}, # node symb\n",
 " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
-" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
+" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n",
 " # contains single node graph, node symb\n",
-" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
-" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
-" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
-" 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
+" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n",
+" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n",
+" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n",
 " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
 " # node nsymb\n",
 " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
 " # node symb/nsymb\n",
 "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
 "# # node/edge symb\n",
-"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
-"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
+"# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n",
 "\n",
 " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
 " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
@@ -15,20 +15,18 @@ dslist = [
 {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
 'task': 'regression'}, # node symb
 {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
+'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
 # contains single node graph, node symb
-{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
-{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
-{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
+{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
 {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
 # node nsymb
 {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
 # node symb/nsymb
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
-# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
@@ -82,4 +80,4 @@ for ds in dslist:
 n_jobs=multiprocessing.cpu_count(),
 read_gm_from_file=False,
 verbose=True)
-print()
+print()
@@ -104,20 +104,18 @@
 " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
 " 'task': 'regression'}, # node symb\n",
 " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
-" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
+" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n",
 " # contains single node graph, node symb\n",
-" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
-" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
-" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
-" 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
+" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n",
+" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n",
+" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n",
 " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
 " # node nsymb\n",
 " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
 " # node symb/nsymb\n",
 "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
 "# # node/edge symb\n",
-"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
-"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
+"# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n",
 "\n",
 " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
 " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
@@ -15,20 +15,18 @@ dslist = [
 {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
 'task': 'regression'}, # node symb
 {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
+'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
 # contains single node graph, node symb
-{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
-{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
-{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
+{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
 {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
 # node nsymb
 {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
 # node symb/nsymb
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
-# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
@@ -219,20 +219,18 @@
 " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
 " 'task': 'regression'}, # node symb\n",
 " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
-" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
+" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n",
 " # contains single node graph, node symb\n",
-" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
-" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
-" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
-" 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
+" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n",
+" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n",
+" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n",
 " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
 " # node nsymb\n",
 " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
 " # node symb/nsymb\n",
 "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
 "# # node/edge symb\n",
-"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
-"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
+"# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n",
 "\n",
 " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
 " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
@@ -20,20 +20,18 @@ dslist = [
 {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
 'task': 'regression'}, # node symb
 {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
+'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
 # contains single node graph, node symb
-{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
-{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
-{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
+{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
 {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
 # node nsymb
 {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
 # node symb/nsymb
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
-# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
@@ -65,6 +63,7 @@ dslist = [
 estimator = randomwalkkernel
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
 {'alpha': np.logspace(-10, 10, num=41, base=10)}]
+gaussiankernel = functools.partial(gaussiankernel, gamma=0.5)
 for ds in dslist:
 print()
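The `functools.partial` line added in this hunk bakes the RBF bandwidth into the node kernel before it is handed to the grid search. Below is a self-contained sketch of the pattern; the Gaussian form k(x, y) = exp(-gamma * ||x - y||^2) is standard, while the exact signature of the repository's `gaussiankernel` is assumed here:

```python
# Sketch of the pattern above: fix gamma of an RBF kernel up front so the
# resulting callable needs only (x, y). The signature is an assumption.
import functools

import numpy as np


def gaussiankernel(x, y, gamma=1.0):
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return float(np.exp(-gamma * np.dot(diff, diff)))


gaussiankernel = functools.partial(gaussiankernel, gamma=0.5)
print(gaussiankernel([0.0, 1.0], [1.0, 1.0]))  # exp(-0.5), about 0.6065
```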
@@ -108,4 +107,4 @@ for ds in dslist:
 n_jobs=multiprocessing.cpu_count(),
 read_gm_from_file=False,
 verbose=True)
-print()
+print()
@@ -171,21 +171,18 @@
 " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
 " 'task': 'regression'}, # node symb\n",
 " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
-" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
+" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n",
 " # contains single node graph, node symb\n",
-" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
-" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
-" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
-" 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
+" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n",
+" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n",
+" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n",
 " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
 " # node nsymb\n",
 " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
 " # node symb/nsymb\n",
-"\n",
 "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
 "# # node/edge symb\n",
-"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
-"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
+"# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n",
 "#\n",
 "# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
 "# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
@@ -11,21 +11,18 @@ dslist = [
 {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
 'task': 'regression'}, # node symb
 {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
+'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
 # contains single node graph, node symb
-{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
-{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
-{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
+{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
 {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
 # node nsymb
 {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
 # node symb/nsymb
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
-# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
 #
 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
@@ -79,4 +76,4 @@ for ds in dslist:
 n_jobs=multiprocessing.cpu_count(),
 read_gm_from_file=False,
 verbose=True)
-print()
+print()
@@ -124,20 +124,18 @@
 " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
 " 'task': 'regression'}, # node symb\n",
 " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
-" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
+" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n",
 " # contains single node graph, node symb\n",
-" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
-" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
-" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
-" 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
+" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n",
+" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n",
+" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n",
 " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
 " # node nsymb\n",
 " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
 " # node symb/nsymb\n",
 "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
 "# # node/edge symb\n",
-"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
-"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
+"# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n",
 "\n",
 " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
 " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
@@ -17,20 +17,18 @@ dslist = [
 {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
 'task': 'regression'}, # node symb
 {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
+'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
 # contains single node graph, node symb
-{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
-{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
-{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
+{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
 {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
 # node nsymb
 {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
 # node symb/nsymb
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
-# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
@@ -86,4 +84,4 @@ for ds in dslist:
 n_jobs=multiprocessing.cpu_count(),
 read_gm_from_file=False,
 verbose=True)
-print()
+print()
@@ -100,20 +100,18 @@
 " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
 " 'task': 'regression'}, # node symb\n",
 " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
-" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
+" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n",
 " # contains single node graph, node symb\n",
-" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
-" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
-" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
-" 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
+" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n",
+" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n",
+" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n",
 "# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
 "# # node nsymb\n",
 " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
 " # node symb/nsymb\n",
 "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
 "# # node/edge symb\n",
-"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
-"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
+"# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n",
 "\n",
 " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
 " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Created on Fri Oct 5 19:19:33 2018
+Created on Thu Mar 21 11:19:33 2019
 @author: ljia
 """
@@ -10,26 +10,24 @@ from libs import *
 import multiprocessing
 from pygraph.kernels.treeletKernel import treeletkernel
-from pygraph.utils.kernels import gaussiankernel, polynomialkernel
+from pygraph.utils.kernels import gaussiankernel, linearkernel, polynomialkernel
 dslist = [
 {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
 'task': 'regression'}, # node symb
 {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
+'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
 # contains single node graph, node symb
-{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
-{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
-{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
+{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
 # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
 # # node nsymb
 {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
 # node symb/nsymb
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
-# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
@@ -59,7 +57,7 @@ dslist = [
 # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
 ]
 estimator = treeletkernel
-param_grid_precomputed = {'sub_kernel': [gaussiankernel, polynomialkernel]}
+param_grid_precomputed = {'sub_kernel': [gaussiankernel, linearkernel, polynomialkernel]}
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
 {'alpha': np.logspace(-10, 10, num=41, base=10)}]
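For context on the widened `sub_kernel` grid above: the treelet kernel [10] compares per-graph treelet count vectors through a base kernel, and the grid now searches a linear kernel alongside the Gaussian and polynomial ones. Self-contained stand-ins are sketched below; the repository's own definitions may differ in signature and defaults:

```python
# Illustrative stand-ins for the three sub-kernels; x and y are treelet
# count vectors. Parameter defaults here are assumptions.
import numpy as np


def linearkernel(x, y):
    return float(np.dot(x, y))


def polynomialkernel(x, y, d=2, c=1.0):
    return float((np.dot(x, y) + c) ** d)


def gaussiankernel(x, y, gamma=1.0):
    diff = np.asarray(x) - np.asarray(y)
    return float(np.exp(-gamma * np.dot(diff, diff)))


x, y = np.array([2.0, 0.0, 1.0]), np.array([1.0, 1.0, 1.0])
print(linearkernel(x, y), polynomialkernel(x, y), gaussiankernel(x, y))
```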
@@ -80,4 +78,4 @@ for ds in dslist:
 n_jobs=multiprocessing.cpu_count(),
 read_gm_from_file=False,
 verbose=True)
-print()
+print()
@@ -227,13 +227,7 @@
 "the gram matrix with parameters {'compute_method': 'trie', 'depth': 1.0, 'k_func': 'tanimoto', 'n_jobs': 8, 'verbose': True} is: \n",
 "\n",
 "\n",
-"getting paths: 150it [00:00, 27568.71it/s]\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
+"getting paths: 150it [00:00, 27568.71it/s]\n",
 "calculating kernels: 11325it [00:00, 780628.98it/s]\n",
 "\n",
 " --- kernel matrix of path kernel up to 2 of size 150 built in 0.2590019702911377 seconds ---\n",
@@ -265,20 +259,18 @@
 " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
 " 'task': 'regression'}, # node symb\n",
 " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
-" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
+" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n",
 " # contains single node graph, node symb\n",
-" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
-" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
-" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
-" 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
+" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n",
+" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n",
+" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n",
 " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
 " # node nsymb\n",
 " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
 " # node symb/nsymb\n",
 "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
 "# # node/edge symb\n",
-"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
-"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
+"# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n",
 "\n",
 " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
 " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
@@ -15,20 +15,18 @@ dslist = [
 {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
 'task': 'regression'}, # node symb
 {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
+'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
 # contains single node graph, node symb
-{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
-{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
-{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
+{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
 {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
 # node nsymb
 {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
 # node symb/nsymb
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
-# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
@@ -81,4 +79,4 @@ for ds in dslist:
 n_jobs=multiprocessing.cpu_count(),
 read_gm_from_file=False,
 verbose=True)
-print()
+print()
@@ -0,0 +1,144 @@
+{
+"cells": [
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\n",
+"MUTAG\n",
+"\n",
+"--- This is a classification problem ---\n",
+"\n",
+"\n",
+"1. Loading dataset from file...\n",
+"\n",
+"2. Calculating gram matrices. This could take a while...\n",
+"\n",
+" --- Weisfeiler-Lehman subtree kernel matrix of size 188 built in 0.14636015892028809 seconds ---\n",
+"\n",
+"the gram matrix with parameters {'base_kernel': 'subtree', 'height': 0.0, 'n_jobs': 8, 'verbose': True} is: \n",
+"\n",
+"\n",
+"\n",
+" --- Weisfeiler-Lehman subtree kernel matrix of size 188 built in 0.2917311191558838 seconds ---\n",
+"\n",
+"the gram matrix with parameters {'base_kernel': 'subtree', 'height': 1.0, 'n_jobs': 8, 'verbose': True} is: \n",
+"\n",
+"\n"
+]
+}
+],
+"source": [
+"#!/usr/bin/env python3\n",
+"# -*- coding: utf-8 -*-\n",
+"\"\"\"\n",
+"Created on Thu Mar 21 11:19:33 2019\n",
+"\n",
+"@author: ljia\n",
+"\"\"\"\n",
+"\n",
+"from libs import *\n",
+"import multiprocessing\n",
+"\n",
+"from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel\n",
+"from pygraph.utils.kernels import gaussiankernel, polynomialkernel\n",
+"\n",
+"\n",
+"dslist = [\n",
+" {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
+" 'task': 'regression'}, # node symb\n",
+" {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
+" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, \n",
+" # contains single node graph, node symb\n",
+" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb\n",
+" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled\n",
+" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb\n",
+" {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
+" # node nsymb\n",
+" {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
+" # node symb/nsymb\n",
+"# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
+"# # node/edge symb\n",
+" {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb\n",
+"\n",
+" # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
+" # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
+" # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
+" # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
+" #\n",
+" # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
+" # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
+" # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
+" # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
+" # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
+"\n",
+" # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
+" # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
+"# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
+" {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
+" 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
+" {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
+" 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
+" # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
+" # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
+"\n",
+" # # not working below\n",
+" # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
+" # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
+" # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
+" # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
+"]\n",
+"estimator = weisfeilerlehmankernel\n",
+"param_grid_precomputed = {'base_kernel': ['subtree'], \n",
+" 'height': np.linspace(0, 10, 11)}\n",
+"param_grid = [{'C': np.logspace(-10, 4, num=29, base=10)},\n",
+" {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n",
+"\n",
+"for ds in dslist:\n",
+" print()\n",
+" print(ds['name'])\n",
+" model_selection_for_precomputed_kernel(\n",
+" ds['dataset'],\n",
+" estimator,\n",
+" param_grid_precomputed,\n",
+" (param_grid[1] if ('task' in ds and ds['task']\n",
+" == 'regression') else param_grid[0]),\n",
+" (ds['task'] if 'task' in ds else 'classification'),\n",
+" NUM_TRIALS=30,\n",
+" datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n",
+" extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n",
+" ds_name=ds['name'],\n",
+" n_jobs=multiprocessing.cpu_count(),\n",
+" read_gm_from_file=False,\n",
+" verbose=True)\n",
+" print()"
+]
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.6.7"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 2
+}
| @@ -0,0 +1,81 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # -*- coding: utf-8 -*- | |||||
| """ | |||||
| Created on Mon Mar 21 11:19:33 2019 | |||||
| @author: ljia | |||||
| """ | |||||
| from libs import * | |||||
| import multiprocessing | |||||
| from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||||
| from pygraph.utils.kernels import gaussiankernel, polynomialkernel | |||||
| dslist = [ | |||||
| {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||||
| 'task': 'regression'}, # node symb | |||||
| {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||||
| 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'}, | |||||
| # contains single node graph, node symb | |||||
| {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb | |||||
| {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled | |||||
| {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb | |||||
| {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||||
| # node nsymb | |||||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||||
| # node symb/nsymb | |||||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||||
| # # node/edge symb | |||||
| {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||||
| # | |||||
| # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||||
| # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||||
| # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||||
| # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||||
| # | |||||
| # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||||
| # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||||
| # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||||
| # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||||
| # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||||
| # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||||
| # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||||
| # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||||
| {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | |||||
| {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | |||||
| # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||||
| # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||||
| # # not working below | |||||
| # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||||
| # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||||
| # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||||
| # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||||
| ] | |||||
| estimator = weisfeilerlehmankernel | |||||
| param_grid_precomputed = {'base_kernel': ['subtree'], | |||||
| 'height': np.linspace(0, 10, 11)} | |||||
| param_grid = [{'C': np.logspace(-10, 4, num=29, base=10)}, | |||||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||||
| for ds in dslist: | |||||
| print() | |||||
| print(ds['name']) | |||||
| model_selection_for_precomputed_kernel( | |||||
| ds['dataset'], | |||||
| estimator, | |||||
| param_grid_precomputed, | |||||
| (param_grid[1] if ('task' in ds and ds['task'] | |||||
| == 'regression') else param_grid[0]), | |||||
| (ds['task'] if 'task' in ds else 'classification'), | |||||
| NUM_TRIALS=30, | |||||
| datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||||
| extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||||
| ds_name=ds['name'], | |||||
| n_jobs=multiprocessing.cpu_count(), | |||||
| read_gm_from_file=False, | |||||
| verbose=True) | |||||
| print() | |||||
| @@ -16,7 +16,7 @@ import librariesImport, script | |||||
| sys.path.insert(0, "../") | sys.path.insert(0, "../") | ||||
| from pygraph.utils.graphfiles import saveDataset | from pygraph.utils.graphfiles import saveDataset | ||||
| from pygraph.utils.graphdataset import get_dataset_attributes | from pygraph.utils.graphdataset import get_dataset_attributes | ||||
| from pygraph.utils.utils import graph_isIdentical | |||||
| from pygraph.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels | |||||
| #from pygraph.utils.utils import graph_deepcopy | #from pygraph.utils.utils import graph_deepcopy | ||||
| @@ -158,9 +158,9 @@ def GED(g1, g2, lib='gedlib'): | |||||
| script.PyRestartEnv() | script.PyRestartEnv() | ||||
| script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml') | script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml') | ||||
| listID = script.PyGetGraphIds() | listID = script.PyGetGraphIds() | ||||
| script.PySetEditCost("CHEM_2") | |||||
| script.PySetEditCost("CHEM_1") | |||||
| script.PyInitEnv() | script.PyInitEnv() | ||||
| script.PySetMethod("BIPARTITE", "") | |||||
| script.PySetMethod("IPFP", "") | |||||
| script.PyInitMethod() | script.PyInitMethod() | ||||
| g = listID[0] | g = listID[0] | ||||
| h = listID[1] | h = listID[1] | ||||
| @@ -173,20 +173,6 @@ def GED(g1, g2, lib='gedlib'): | |||||
| return dis, pi_forward, pi_backward | return dis, pi_forward, pi_backward | ||||
| def get_node_labels(Gn, node_label): | |||||
| nl = set() | |||||
| for G in Gn: | |||||
| nl = nl | set(nx.get_node_attributes(G, node_label).values()) | |||||
| return nl | |||||
| def get_edge_labels(Gn, edge_label): | |||||
| el = set() | |||||
| for G in Gn: | |||||
| el = el | set(nx.get_edge_attributes(G, edge_label).values()) | |||||
| return el | |||||
| # --------------------------- These are tests --------------------------------# | # --------------------------- These are tests --------------------------------# | ||||
| def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1, | def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1, | ||||
| @@ -65,6 +65,7 @@ def marginalizedkernel(*args, | |||||
| # pre-process | # pre-process | ||||
| n_iteration = int(n_iteration) | n_iteration = int(n_iteration) | ||||
| Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()] | Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()] | ||||
| Gn = [g.copy() for g in Gn] | |||||
| ds_attrs = get_dataset_attributes( | ds_attrs = get_dataset_attributes( | ||||
| Gn, | Gn, | ||||
| @@ -215,37 +216,37 @@ def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): | |||||
| R_inf = {} # dict to save all the R_inf for all pairs of nodes | R_inf = {} # dict to save all the R_inf for all pairs of nodes | ||||
| # initial R_inf, the 1st iteration. | # initial R_inf, the 1st iteration. | ||||
| for node1 in g1.nodes(data=True): | |||||
| for node2 in g2.nodes(data=True): | |||||
| for node1 in g1.nodes(): | |||||
| for node2 in g2.nodes(): | |||||
| # R_inf[(node1[0], node2[0])] = r1 | # R_inf[(node1[0], node2[0])] = r1 | ||||
| if len(g1[node1[0]]) > 0: | |||||
| if len(g2[node2[0]]) > 0: | |||||
| R_inf[(node1[0], node2[0])] = r1 | |||||
| if len(g1[node1]) > 0: | |||||
| if len(g2[node2]) > 0: | |||||
| R_inf[(node1, node2)] = r1 | |||||
| else: | else: | ||||
| R_inf[(node1[0], node2[0])] = p_quit | |||||
| R_inf[(node1, node2)] = p_quit | |||||
| else: | else: | ||||
| if len(g2[node2[0]]) > 0: | |||||
| R_inf[(node1[0], node2[0])] = p_quit | |||||
| if len(g2[node2]) > 0: | |||||
| R_inf[(node1, node2)] = p_quit | |||||
| else: | else: | ||||
| R_inf[(node1[0], node2[0])] = 1 | |||||
| R_inf[(node1, node2)] = 1 | |||||
| # compute all transition probability first. | # compute all transition probability first. | ||||
| t_dict = {} | t_dict = {} | ||||
| if n_iteration > 1: | if n_iteration > 1: | ||||
| for node1 in g1.nodes(data=True): | |||||
| neighbor_n1 = g1[node1[0]] | |||||
| for node1 in g1.nodes(): | |||||
| neighbor_n1 = g1[node1] | |||||
| # the transition probability distribution in the random walks | # the transition probability distribution in the random walks | ||||
| # generating step (uniform distribution over the vertices adjacent | # generating step (uniform distribution over the vertices adjacent | ||||
| # to the current vertex) | # to the current vertex) | ||||
| if len(neighbor_n1) > 0: | if len(neighbor_n1) > 0: | ||||
| p_trans_n1 = (1 - p_quit) / len(neighbor_n1) | p_trans_n1 = (1 - p_quit) / len(neighbor_n1) | ||||
| for node2 in g2.nodes(data=True): | |||||
| neighbor_n2 = g2[node2[0]] | |||||
| for node2 in g2.nodes(): | |||||
| neighbor_n2 = g2[node2] | |||||
| if len(neighbor_n2) > 0: | if len(neighbor_n2) > 0: | ||||
| p_trans_n2 = (1 - p_quit) / len(neighbor_n2) | p_trans_n2 = (1 - p_quit) / len(neighbor_n2) | ||||
| for neighbor1 in neighbor_n1: | for neighbor1 in neighbor_n1: | ||||
| for neighbor2 in neighbor_n2: | for neighbor2 in neighbor_n2: | ||||
| t_dict[(node1[0], node2[0], neighbor1, neighbor2)] = \ | |||||
| t_dict[(node1, node2, neighbor1, neighbor2)] = \ | |||||
| p_trans_n1 * p_trans_n2 * \ | p_trans_n1 * p_trans_n2 * \ | ||||
| deltakernel(g1.node[neighbor1][node_label], | deltakernel(g1.node[neighbor1][node_label], | ||||
| g2.node[neighbor2][node_label]) * \ | g2.node[neighbor2][node_label]) * \ | ||||
| @@ -258,20 +259,20 @@ def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): | |||||
| R_inf_old = R_inf.copy() | R_inf_old = R_inf.copy() | ||||
| # calculate R_inf for each pair of nodes | # calculate R_inf for each pair of nodes | ||||
| for node1 in g1.nodes(data=True): | |||||
| neighbor_n1 = g1[node1[0]] | |||||
| for node1 in g1.nodes(): | |||||
| neighbor_n1 = g1[node1] | |||||
| # the transition probability distribution in the random walks | # the transition probability distribution in the random walks | ||||
| # generating step (uniform distribution over the vertices adjacent | # generating step (uniform distribution over the vertices adjacent | ||||
| # to the current vertex) | # to the current vertex) | ||||
| if len(neighbor_n1) > 0: | if len(neighbor_n1) > 0: | ||||
| for node2 in g2.nodes(data=True): | |||||
| neighbor_n2 = g2[node2[0]] | |||||
| for node2 in g2.nodes(): | |||||
| neighbor_n2 = g2[node2] | |||||
| if len(neighbor_n2) > 0: | if len(neighbor_n2) > 0: | ||||
| R_inf[(node1[0], node2[0])] = r1 | |||||
| R_inf[(node1, node2)] = r1 | |||||
| for neighbor1 in neighbor_n1: | for neighbor1 in neighbor_n1: | ||||
| for neighbor2 in neighbor_n2: | for neighbor2 in neighbor_n2: | ||||
| R_inf[(node1[0], node2[0])] += \ | |||||
| (t_dict[(node1[0], node2[0], neighbor1, neighbor2)] * \ | |||||
| R_inf[(node1, node2)] += \ | |||||
| (t_dict[(node1, node2, neighbor1, neighbor2)] * \ | |||||
| R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) | R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) | ||||
| # add elements of R_inf up and calculate kernel | # add elements of R_inf up and calculate kernel | ||||
| @@ -58,6 +58,7 @@ def randomwalkkernel(*args, | |||||
| """ | """ | ||||
| compute_method = compute_method.lower() | compute_method = compute_method.lower() | ||||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | Gn = args[0] if len(args) == 1 else [args[0], args[1]] | ||||
| Gn = [g.copy() for g in Gn] | |||||
| eweight = None | eweight = None | ||||
| if edge_weight == None: | if edge_weight == None: | ||||
| @@ -54,6 +54,7 @@ def spkernel(*args, | |||||
| """ | """ | ||||
| # pre-process | # pre-process | ||||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | Gn = args[0] if len(args) == 1 else [args[0], args[1]] | ||||
| Gn = [g.copy() for g in Gn] | |||||
| weight = None | weight = None | ||||
| if edge_weight is None: | if edge_weight is None: | ||||
| if verbose: | if verbose: | ||||
| @@ -74,6 +74,7 @@ def structuralspkernel(*args, | |||||
| """ | """ | ||||
| # pre-process | # pre-process | ||||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | Gn = args[0] if len(args) == 1 else [args[0], args[1]] | ||||
| Gn = [g.copy() for g in Gn] | |||||
| weight = None | weight = None | ||||
| if edge_weight is None: | if edge_weight is None: | ||||
| if verbose: | if verbose: | ||||
| @@ -1,6 +1,8 @@ | |||||
| """ | """ | ||||
| @author: linlin | @author: linlin | ||||
| @references: Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. | |||||
| @references: | |||||
| [1] Gaüzère B, Brun L, Villemin D. Two new graphs kernels in | |||||
| chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. | |||||
| """ | """ | ||||
| import sys | import sys | ||||
| @@ -50,6 +52,7 @@ def treeletkernel(*args, | |||||
| """ | """ | ||||
| # pre-process | # pre-process | ||||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | Gn = args[0] if len(args) == 1 else [args[0], args[1]] | ||||
| Gn = [g.copy() for g in Gn] | |||||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | Kmatrix = np.zeros((len(Gn), len(Gn))) | ||||
| ds_attrs = get_dataset_attributes(Gn, | ds_attrs = get_dataset_attributes(Gn, | ||||
| attr_names=['node_labeled', 'edge_labeled', 'is_directed'], | attr_names=['node_labeled', 'edge_labeled', 'is_directed'], | ||||
| @@ -76,13 +79,13 @@ def treeletkernel(*args, | |||||
| else: | else: | ||||
| chunksize = 100 | chunksize = 100 | ||||
| canonkeys = [[] for _ in range(len(Gn))] | canonkeys = [[] for _ in range(len(Gn))] | ||||
| getps_partial = partial(wrapper_get_canonkeys, node_label, edge_label, | |||||
| get_partial = partial(wrapper_get_canonkeys, node_label, edge_label, | |||||
| labeled, ds_attrs['is_directed']) | labeled, ds_attrs['is_directed']) | ||||
| if verbose: | if verbose: | ||||
| iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize), | |||||
| iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize), | |||||
| desc='getting canonkeys', file=sys.stdout) | desc='getting canonkeys', file=sys.stdout) | ||||
| else: | else: | ||||
| iterator = pool.imap_unordered(getps_partial, itr, chunksize) | |||||
| iterator = pool.imap_unordered(get_partial, itr, chunksize) | |||||
| for i, ck in iterator: | for i, ck in iterator: | ||||
| canonkeys[i] = ck | canonkeys[i] = ck | ||||
| pool.close() | pool.close() | ||||
| @@ -1,382 +0,0 @@ | |||||
| """ | |||||
| @author: linlin | |||||
| @references: Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. | |||||
| """ | |||||
| import sys | |||||
| import pathlib | |||||
| sys.path.insert(0, "../") | |||||
| import time | |||||
| from collections import Counter | |||||
| from itertools import chain | |||||
| import networkx as nx | |||||
| import numpy as np | |||||
| def treeletkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True): | |||||
| """Calculate treelet graph kernels between graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| Gn : List of NetworkX graph | |||||
| List of graphs between which the kernels are calculated. | |||||
| / | |||||
| G1, G2 : NetworkX graphs | |||||
| 2 graphs between which the kernel is calculated. | |||||
| node_label : string | |||||
| node attribute used as label. The default node label is atom. | |||||
| edge_label : string | |||||
| edge attribute used as label. The default edge label is bond_type. | |||||
| labeled : boolean | |||||
| Whether the graphs are labeled. The default is True. | |||||
| Return | |||||
| ------ | |||||
| Kmatrix/kernel : Numpy matrix/float | |||||
Kernel matrix, each element of which is the treelet kernel between 2 graphs. / Treelet kernel between 2 graphs.
| """ | |||||
| if len(args) == 1: # for a list of graphs | |||||
| Gn = args[0] | |||||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
| start_time = time.time() | |||||
| # get all canonical keys of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset. | |||||
| canonkeys = [ get_canonkeys(Gn[i], node_label = node_label, edge_label = edge_label, labeled = labeled) \ | |||||
| for i in range(0, len(Gn)) ] | |||||
| for i in range(0, len(Gn)): | |||||
| for j in range(i, len(Gn)): | |||||
| Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
| Kmatrix[j][i] = Kmatrix[i][j] | |||||
| run_time = time.time() - start_time | |||||
| print("\n --- treelet kernel matrix of size %d built in %s seconds ---" % (len(Gn), run_time)) | |||||
| return Kmatrix, run_time | |||||
| else: # for only 2 graphs | |||||
| start_time = time.time() | |||||
| canonkey1 = get_canonkeys(args[0], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
| canonkey2 = get_canonkeys(args[1], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
| kernel = _treeletkernel_do(canonkey1, canonkey2, node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
| run_time = time.time() - start_time | |||||
| print("\n --- treelet kernel built in %s seconds ---" % (run_time)) | |||||
| return kernel, run_time | |||||
| def _treeletkernel_do(canonkey1, canonkey2, node_label = 'atom', edge_label = 'bond_type', labeled = True): | |||||
| """Calculate treelet graph kernel between 2 graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| canonkey1, canonkey2 : list | |||||
| List of canonical keys in 2 graphs, where each key is represented by a string. | |||||
| node_label : string | |||||
| Node attribute used as label. The default node label is atom. | |||||
| edge_label : string | |||||
| Edge attribute used as label. The default edge label is bond_type. | |||||
| labeled : boolean | |||||
| Whether the graphs are labeled. The default is True. | |||||
| Return | |||||
| ------ | |||||
| kernel : float | |||||
| Treelet Kernel between 2 graphs. | |||||
| """ | |||||
| keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | |||||
| vector1 = np.array([ (canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys ]) | |||||
| vector2 = np.array([ (canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys ]) | |||||
| kernel = np.sum(np.exp(- np.square(vector1 - vector2) / 2)) | |||||
| return kernel | |||||
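def _demo_treeletkernel_do():
    # Hedged sketch, not part of the original file: the treelet kernel compares
    # only the canonical keys present in *both* graphs, applying a Gaussian to
    # the difference of their counts. The keys below are made up for illustration.
    canonkey1 = {'0C': 4, '1CsC': 3, '6C': 1}
    canonkey2 = {'0C': 4, '1CsC': 2}
    print(_treeletkernel_do(canonkey1, canonkey2))  # exp(0) + exp(-0.5) ≈ 1.61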
| def get_canonkeys(G, node_label = 'atom', edge_label = 'bond_type', labeled = True): | |||||
| """Generate canonical keys of all treelets in a graph. | |||||
| Parameters | |||||
| ---------- | |||||
| G : NetworkX graphs | |||||
| The graph in which keys are generated. | |||||
| node_label : string | |||||
| node attribute used as label. The default node label is atom. | |||||
| edge_label : string | |||||
| edge attribute used as label. The default edge label is bond_type. | |||||
| labeled : boolean | |||||
| Whether the graphs are labeled. The default is True. | |||||
| Return | |||||
| ------ | |||||
| canonkey/canonkey_l : dict | |||||
For unlabeled graphs, canonkey is a dictionary recording the number of occurrences of each tree pattern. For labeled graphs, canonkey_l keeps track of the number of occurrences of each treelet.
| """ | |||||
| patterns = {} # a dictionary which consists of lists of patterns for all graphlet. | |||||
canonkey = {} # canonical key, a dictionary recording the number of occurrences of each tree pattern.
| ### structural analysis ### | |||||
| ### In this section, a list of patterns is generated for each graphlet, where every pattern is represented by nodes ordered by | |||||
| ### Morgan's extended labeling. | |||||
| # linear patterns | |||||
| patterns['0'] = G.nodes() | |||||
| canonkey['0'] = nx.number_of_nodes(G) | |||||
| for i in range(1, 6): # for i in range(1, 6): | |||||
| patterns[str(i)] = find_all_paths(G, i) | |||||
| canonkey[str(i)] = len(patterns[str(i)]) | |||||
| # n-star patterns | |||||
| patterns['3star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3 ] | |||||
| patterns['4star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4 ] | |||||
| patterns['5star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5 ] | |||||
| # n-star patterns | |||||
| canonkey['6'] = len(patterns['3star']) | |||||
| canonkey['8'] = len(patterns['4star']) | |||||
| canonkey['d'] = len(patterns['5star']) | |||||
| # pattern 7 | |||||
| patterns['7'] = [] # the 1st line of Table 1 in Ref [1] | |||||
| for pattern in patterns['3star']: | |||||
| for i in range(1, len(pattern)): # for each neighbor of node 0 | |||||
| if G.degree(pattern[i]) >= 2: | |||||
| pattern_t = pattern[:] | |||||
| pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] # set the node with degree >= 2 as the 4th node | |||||
| for neighborx in G[pattern[i]]: | |||||
| if neighborx != pattern[0]: | |||||
| new_pattern = pattern_t + [ neighborx ] | |||||
| patterns['7'].append(new_pattern) | |||||
| canonkey['7'] = len(patterns['7']) | |||||
| # pattern 11 | |||||
| patterns['11'] = [] # the 4th line of Table 1 in Ref [1] | |||||
| for pattern in patterns['4star']: | |||||
| for i in range(1, len(pattern)): | |||||
| if G.degree(pattern[i]) >= 2: | |||||
| pattern_t = pattern[:] | |||||
| pattern_t[i], pattern_t[4] = pattern_t[4], pattern_t[i] | |||||
| for neighborx in G[pattern[i]]: | |||||
| if neighborx != pattern[0]: | |||||
| new_pattern = pattern_t + [ neighborx ] | |||||
| patterns['11'].append(new_pattern) | |||||
| canonkey['b'] = len(patterns['11']) | |||||
| # pattern 12 | |||||
| patterns['12'] = [] # the 5th line of Table 1 in Ref [1] | |||||
| rootlist = [] # a list of root nodes, whose extended labels are 3 | |||||
| for pattern in patterns['3star']: | |||||
if pattern[0] not in rootlist: # avoid counting the same pattern twice, once from each of the two root nodes
| rootlist.append(pattern[0]) | |||||
| for i in range(1, len(pattern)): | |||||
| if G.degree(pattern[i]) >= 3: | |||||
| rootlist.append(pattern[i]) | |||||
| pattern_t = pattern[:] | |||||
| pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] | |||||
| for neighborx1 in G[pattern[i]]: | |||||
| if neighborx1 != pattern[0]: | |||||
| for neighborx2 in G[pattern[i]]: | |||||
| if neighborx1 > neighborx2 and neighborx2 != pattern[0]: | |||||
| new_pattern = pattern_t + [neighborx1] + [neighborx2] | |||||
| # new_patterns = [ pattern + [neighborx1] + [neighborx2] for neighborx1 in G[pattern[i]] if neighborx1 != pattern[0] for neighborx2 in G[pattern[i]] if (neighborx1 > neighborx2 and neighborx2 != pattern[0]) ] | |||||
| patterns['12'].append(new_pattern) | |||||
| canonkey['c'] = int(len(patterns['12']) / 2) | |||||
| # pattern 9 | |||||
| patterns['9'] = [] # the 2nd line of Table 1 in Ref [1] | |||||
| for pattern in patterns['3star']: | |||||
| for pairs in [ [neighbor1, neighbor2] for neighbor1 in G[pattern[0]] if G.degree(neighbor1) >= 2 \ | |||||
| for neighbor2 in G[pattern[0]] if G.degree(neighbor2) >= 2 if neighbor1 > neighbor2 ]: | |||||
| pattern_t = pattern[:] | |||||
| # move nodes with extended labels 4 to specific position to correspond to their children | |||||
| pattern_t[pattern_t.index(pairs[0])], pattern_t[2] = pattern_t[2], pattern_t[pattern_t.index(pairs[0])] | |||||
| pattern_t[pattern_t.index(pairs[1])], pattern_t[3] = pattern_t[3], pattern_t[pattern_t.index(pairs[1])] | |||||
| for neighborx1 in G[pairs[0]]: | |||||
| if neighborx1 != pattern[0]: | |||||
| for neighborx2 in G[pairs[1]]: | |||||
| if neighborx2 != pattern[0]: | |||||
| new_pattern = pattern_t + [neighborx1] + [neighborx2] | |||||
| patterns['9'].append(new_pattern) | |||||
| canonkey['9'] = len(patterns['9']) | |||||
| # pattern 10 | |||||
| patterns['10'] = [] # the 3rd line of Table 1 in Ref [1] | |||||
| for pattern in patterns['3star']: | |||||
| for i in range(1, len(pattern)): | |||||
| if G.degree(pattern[i]) >= 2: | |||||
| for neighborx in G[pattern[i]]: | |||||
| if neighborx != pattern[0] and G.degree(neighborx) >= 2: | |||||
| pattern_t = pattern[:] | |||||
| pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] | |||||
| new_patterns = [ pattern_t + [neighborx] + [neighborxx] for neighborxx in G[neighborx] if neighborxx != pattern[i] ] | |||||
| patterns['10'].extend(new_patterns) | |||||
| canonkey['a'] = len(patterns['10']) | |||||
| ### labeling information ### | |||||
| ### In this section, a list of canonical keys is generated for every pattern obtained in the structural analysis | |||||
| ### section above, which is a string corresponding to a unique treelet. A dictionary is built to keep track of | |||||
| ### the amount of every treelet. | |||||
| if labeled == True: | |||||
canonkey_l = {} # canonical key, a dictionary recording the number of occurrences of each treelet.
| # linear patterns | |||||
| canonkey_t = Counter(list(nx.get_node_attributes(G, node_label).values())) | |||||
| for key in canonkey_t: | |||||
| canonkey_l['0' + key] = canonkey_t[key] | |||||
| for i in range(1, 6): # for i in range(1, 6): | |||||
| treelet = [] | |||||
| for pattern in patterns[str(i)]: | |||||
| canonlist = list(chain.from_iterable((G.node[node][node_label], \ | |||||
| G[node][pattern[idx+1]][edge_label]) for idx, node in enumerate(pattern[:-1]))) | |||||
| canonlist.append(G.node[pattern[-1]][node_label]) | |||||
| canonkey_t = ''.join(canonlist) | |||||
| canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1] | |||||
| treelet.append(str(i) + canonkey_t) | |||||
| canonkey_l.update(Counter(treelet)) | |||||
| # n-star patterns | |||||
| for i in range(3, 6): | |||||
| treelet = [] | |||||
| for pattern in patterns[str(i) + 'star']: | |||||
| canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:] ] | |||||
| canonlist.sort() | |||||
| canonkey_t = ('d' if i == 5 else str(i * 2)) + G.node[pattern[0]][node_label] + ''.join(canonlist) | |||||
| treelet.append(canonkey_t) | |||||
| canonkey_l.update(Counter(treelet)) | |||||
| # pattern 7 | |||||
| treelet = [] | |||||
| for pattern in patterns['7']: | |||||
| canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] | |||||
| canonlist.sort() | |||||
| canonkey_t = '7' + G.node[pattern[0]][node_label] + ''.join(canonlist) \ | |||||
| + G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \ | |||||
| + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] | |||||
| treelet.append(canonkey_t) | |||||
| canonkey_l.update(Counter(treelet)) | |||||
| # pattern 11 | |||||
| treelet = [] | |||||
| for pattern in patterns['11']: | |||||
| canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:4] ] | |||||
| canonlist.sort() | |||||
| canonkey_t = 'b' + G.node[pattern[0]][node_label] + ''.join(canonlist) \ | |||||
| + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[0]][edge_label] \ | |||||
| + G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label] | |||||
| treelet.append(canonkey_t) | |||||
| canonkey_l.update(Counter(treelet)) | |||||
| # pattern 10 | |||||
| treelet = [] | |||||
| for pattern in patterns['10']: | |||||
| canonkey4 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label] | |||||
| canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] | |||||
| canonlist.sort() | |||||
| canonkey0 = ''.join(canonlist) | |||||
| canonkey_t = 'a' + G.node[pattern[3]][node_label] \ | |||||
| + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] \ | |||||
| + G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \ | |||||
| + canonkey4 + canonkey0 | |||||
| treelet.append(canonkey_t) | |||||
| canonkey_l.update(Counter(treelet)) | |||||
| # pattern 12 | |||||
| treelet = [] | |||||
| for pattern in patterns['12']: | |||||
| canonlist0 = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] | |||||
| canonlist0.sort() | |||||
| canonlist3 = [ G.node[leaf][node_label] + G[leaf][pattern[3]][edge_label] for leaf in pattern[4:6] ] | |||||
| canonlist3.sort() | |||||
# Two possible keys can be generated from the two nodes with extended label 3; select the one with the lower lexicographic order.
| canonkey_t1 = 'c' + G.node[pattern[0]][node_label] \ | |||||
| + ''.join(canonlist0) \ | |||||
| + G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \ | |||||
| + ''.join(canonlist3) | |||||
| canonkey_t2 = 'c' + G.node[pattern[3]][node_label] \ | |||||
| + ''.join(canonlist3) \ | |||||
| + G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \ | |||||
| + ''.join(canonlist0) | |||||
| treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) | |||||
| canonkey_l.update(Counter(treelet)) | |||||
| # pattern 9 | |||||
| treelet = [] | |||||
| for pattern in patterns['9']: | |||||
| canonkey2 = G.node[pattern[4]][node_label] + G[pattern[4]][pattern[2]][edge_label] | |||||
| canonkey3 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[3]][edge_label] | |||||
| prekey2 = G.node[pattern[2]][node_label] + G[pattern[2]][pattern[0]][edge_label] | |||||
| prekey3 = G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] | |||||
| if prekey2 + canonkey2 < prekey3 + canonkey3: | |||||
| canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \ | |||||
| + prekey2 + prekey3 + canonkey2 + canonkey3 | |||||
| else: | |||||
| canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \ | |||||
| + prekey3 + prekey2 + canonkey3 + canonkey2 | |||||
| treelet.append('9' + G.node[pattern[0]][node_label] + canonkey_t) | |||||
| canonkey_l.update(Counter(treelet)) | |||||
| return canonkey_l | |||||
| return canonkey | |||||
| def find_paths(G, source_node, length): | |||||
| """Find all paths with a certain length those start from a source node. A recursive depth first search is applied. | |||||
| Parameters | |||||
| ---------- | |||||
| G : NetworkX graphs | |||||
| The graph in which paths are searched. | |||||
| source_node : integer | |||||
| The number of the node from where all paths start. | |||||
| length : integer | |||||
| The length of paths. | |||||
| Return | |||||
| ------ | |||||
| path : list of list | |||||
| List of paths retrieved, where each path is represented by a list of nodes. | |||||
| """ | |||||
| if length == 0: | |||||
| return [[source_node]] | |||||
| path = [ [source_node] + path for neighbor in G[source_node] \ | |||||
| for path in find_paths(G, neighbor, length - 1) if source_node not in path ] | |||||
| return path | |||||
| def find_all_paths(G, length): | |||||
| """Find all paths with a certain length in a graph. A recursive depth first search is applied. | |||||
| Parameters | |||||
| ---------- | |||||
| G : NetworkX graphs | |||||
| The graph in which paths are searched. | |||||
| length : integer | |||||
| The length of paths. | |||||
| Return | |||||
| ------ | |||||
| path : list of list | |||||
| List of paths retrieved, where each path is represented by a list of nodes. | |||||
| """ | |||||
| all_paths = [] | |||||
| for node in G: | |||||
| all_paths.extend(find_paths(G, node, length)) | |||||
| all_paths_r = [ path[::-1] for path in all_paths ] | |||||
# Each path is found twice, once from each of its extremities; remove one of the two representations.
| for idx, path in enumerate(all_paths[:-1]): | |||||
| for path2 in all_paths_r[idx+1::]: | |||||
| if path == path2: | |||||
| all_paths[idx] = [] | |||||
| break | |||||
| return list(filter(lambda a: a != [], all_paths)) | |||||
| @@ -60,6 +60,7 @@ def untilhpathkernel(*args, | |||||
| # pre-process | # pre-process | ||||
| depth = int(depth) | depth = int(depth) | ||||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] | Gn = args[0] if len(args) == 1 else [args[0], args[1]] | ||||
| Gn = [g.copy() for g in Gn] | |||||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | Kmatrix = np.zeros((len(Gn), len(Gn))) | ||||
| ds_attrs = get_dataset_attributes( | ds_attrs = get_dataset_attributes( | ||||
| Gn, | Gn, | ||||
| @@ -0,0 +1,549 @@ | |||||
| """ | |||||
| @author: linlin | |||||
| @references: | |||||
| [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. | |||||
| Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. | |||||
| 2011;12(Sep):2539-61. | |||||
| """ | |||||
| import sys | |||||
| from collections import Counter | |||||
| sys.path.insert(0, "../") | |||||
| from functools import partial | |||||
| import time | |||||
| #from multiprocessing import Pool | |||||
| from tqdm import tqdm | |||||
| import networkx as nx | |||||
| import numpy as np | |||||
| #from pygraph.kernels.pathKernel import pathkernel | |||||
| from pygraph.utils.graphdataset import get_dataset_attributes | |||||
| from pygraph.utils.parallel import parallel_gm | |||||
| # @todo: support edge kernel, sp kernel, user-defined kernel. | |||||
| def weisfeilerlehmankernel(*args, | |||||
| node_label='atom', | |||||
| edge_label='bond_type', | |||||
| height=0, | |||||
| base_kernel='subtree', | |||||
| parallel=None, | |||||
| n_jobs=None, | |||||
| verbose=True): | |||||
| """Calculate Weisfeiler-Lehman kernels between graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| Gn : List of NetworkX graph | |||||
| List of graphs between which the kernels are calculated. | |||||
| / | |||||
| G1, G2 : NetworkX graphs | |||||
| 2 graphs between which the kernel is calculated. | |||||
| node_label : string | |||||
| node attribute used as label. The default node label is atom. | |||||
| edge_label : string | |||||
| edge attribute used as label. The default edge label is bond_type. | |||||
| height : int | |||||
| subtree height | |||||
| base_kernel : string | |||||
base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel. For user-defined kernel, base_kernel is the name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 graphs.
| Return | |||||
| ------ | |||||
| Kmatrix : Numpy matrix | |||||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
| Notes | |||||
| ----- | |||||
This function currently supports the WL subtree kernel only.
| """ | |||||
| # pre-process | |||||
| base_kernel = base_kernel.lower() | |||||
| Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list | |||||
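# Work on copies: the WL relabelling below overwrites node labels in place.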
| Gn = [g.copy() for g in Gn] | |||||
| ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'], | |||||
| node_label=node_label) | |||||
| if not ds_attrs['node_labeled']: | |||||
| for G in Gn: | |||||
| nx.set_node_attributes(G, '0', 'atom') | |||||
| start_time = time.time() | |||||
| # for WL subtree kernel | |||||
| if base_kernel == 'subtree': | |||||
| Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose) | |||||
| # for WL shortest path kernel | |||||
| elif base_kernel == 'sp': | |||||
| Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height) | |||||
| # for WL edge kernel | |||||
| elif base_kernel == 'edge': | |||||
| Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height) | |||||
| # for user defined base kernel | |||||
| else: | |||||
| Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel) | |||||
| run_time = time.time() - start_time | |||||
| if verbose: | |||||
| print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" | |||||
% (base_kernel, len(Gn), run_time))
| return Kmatrix, run_time | |||||
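def _demo_weisfeilerlehmankernel():
    # Hedged usage sketch, not part of the original file: toy graphs with an
    # assumed 'atom' node label; the kernel returns both the kernel matrix
    # and the wall-clock run time.
    import networkx as nx
    g1 = nx.path_graph(3)
    nx.set_node_attributes(g1, 'C', 'atom')
    g2 = nx.cycle_graph(3)
    nx.set_node_attributes(g2, 'C', 'atom')
    K, t = weisfeilerlehmankernel([g1, g2], height=2)
    print(K)  # a 2x2 symmetric matrix of WL subtree kernel values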
| def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose): | |||||
| """Calculate Weisfeiler-Lehman kernels between graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| Gn : List of NetworkX graph | |||||
| List of graphs between which the kernels are calculated. | |||||
| node_label : string | |||||
| node attribute used as label. | |||||
| edge_label : string | |||||
| edge attribute used as label. | |||||
| height : int | |||||
| wl height. | |||||
| Return | |||||
| ------ | |||||
| Kmatrix : Numpy matrix | |||||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
| """ | |||||
| height = int(height) | |||||
| Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
| # initial for height = 0 | |||||
all_num_of_each_label = [] # number of occurrences of each label in each graph in this iteration
| # for each graph | |||||
| for G in Gn: | |||||
| # get the set of original labels | |||||
| labels_ori = list(nx.get_node_attributes(G, node_label).values()) | |||||
# number of occurrences of each label in G
| all_num_of_each_label.append(dict(Counter(labels_ori))) | |||||
| # calculate subtree kernel with the 0th iteration and add it to the final kernel | |||||
| compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False) | |||||
| # iterate each height | |||||
| for h in range(1, height + 1): | |||||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
num_of_labels_occured = 0 # running count of distinct compressed labels seen so far across all graphs
all_num_of_each_label = [] # number of occurrences of each label in each graph
| # @todo: parallel this part. | |||||
| for idx, G in enumerate(Gn): | |||||
| all_multisets = [] | |||||
| for node, attrs in G.nodes(data=True): | |||||
| # Multiset-label determination. | |||||
| multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]] | |||||
| # sorting each multiset | |||||
| multiset.sort() | |||||
| multiset = [attrs[node_label]] + multiset # add the prefix | |||||
| all_multisets.append(tuple(multiset)) | |||||
| # label compression | |||||
| set_unique = list(set(all_multisets)) # set of unique multiset labels | |||||
| # a dictionary mapping original labels to new ones. | |||||
| set_compressed = {} | |||||
# if a label occurred before, assign its former compressed label;
# otherwise assign (number of labels occurred so far + 1) as the compressed label.
| for value in set_unique: | |||||
| if value in all_set_compressed.keys(): | |||||
| set_compressed.update({value: all_set_compressed[value]}) | |||||
| else: | |||||
| set_compressed.update({value: str(num_of_labels_occured + 1)}) | |||||
| num_of_labels_occured += 1 | |||||
| all_set_compressed.update(set_compressed) | |||||
| # relabel nodes | |||||
| for idx, node in enumerate(G.nodes()): | |||||
| G.nodes[node][node_label] = set_compressed[all_multisets[idx]] | |||||
| # get the set of compressed labels | |||||
| labels_comp = list(nx.get_node_attributes(G, node_label).values()) | |||||
| # all_labels_ori.update(labels_comp) | |||||
| all_num_of_each_label.append(dict(Counter(labels_comp))) | |||||
| # calculate subtree kernel with h iterations and add it to the final kernel | |||||
| compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False) | |||||
| return Kmatrix | |||||
| def wl_iteration(G, node_label): | |||||
| all_multisets = [] | |||||
| for node, attrs in G.nodes(data=True): | |||||
| # Multiset-label determination. | |||||
| multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]] | |||||
| # sorting each multiset | |||||
| multiset.sort() | |||||
| multiset = [attrs[node_label]] + multiset # add the prefix | |||||
| all_multisets.append(tuple(multiset)) | |||||
| return all_multisets | |||||
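def _demo_wl_iteration():
    # Hedged sketch, not part of the original file: one WL relabelling step on
    # a toy path graph with an assumed 'atom' label. Each node's multiset label
    # is its own label followed by the sorted labels of its neighbours.
    import networkx as nx
    g = nx.path_graph(3)  # 0 - 1 - 2
    nx.set_node_attributes(g, {0: 'C', 1: 'O', 2: 'C'}, 'atom')
    print(wl_iteration(g, 'atom'))
    # [('C', 'O'), ('O', 'C', 'C'), ('C', 'O')]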
| def wrapper_wl_iteration(node_label, itr_item): | |||||
| g = itr_item[0] | |||||
| i = itr_item[1] | |||||
| all_multisets = wl_iteration(g, node_label) | |||||
| return i, all_multisets | |||||
| def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose): | |||||
| """Compute kernel matrix using the base kernel. | |||||
| """ | |||||
| if parallel == 'imap_unordered': | |||||
| # compute kernels. | |||||
| def init_worker(alllabels_toshare): | |||||
| global G_alllabels | |||||
| G_alllabels = alllabels_toshare | |||||
| do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix) | |||||
| parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
| glbv=(all_num_of_each_label,), n_jobs=n_jobs, verbose=verbose) | |||||
elif parallel is None:
| for i in range(len(Kmatrix)): | |||||
| for j in range(i, len(Kmatrix)): | |||||
| Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i], | |||||
| all_num_of_each_label[j], Kmatrix[i][j]) | |||||
| Kmatrix[j][i] = Kmatrix[i][j] | |||||
| def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel): | |||||
| """Compute the subtree kernel. | |||||
| """ | |||||
| labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys())) | |||||
| vector1 = np.array([(num_of_each_label1[label] | |||||
| if (label in num_of_each_label1.keys()) else 0) | |||||
| for label in labels]) | |||||
| vector2 = np.array([(num_of_each_label2[label] | |||||
| if (label in num_of_each_label2.keys()) else 0) | |||||
| for label in labels]) | |||||
| kernel += np.dot(vector1, vector2) | |||||
| return kernel | |||||
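def _demo_compute_subtree_kernel():
    # Hedged sketch, not part of the original file: the WL subtree base kernel
    # is a dot product between histograms of (compressed) node labels. The
    # label counts below are made up for illustration.
    h1 = {'1': 2, '2': 1}  # label '1' occurs twice, '2' once
    h2 = {'1': 1, '3': 4}
    print(compute_subtree_kernel(h1, h2, 0))  # 2*1 + 1*0 + 0*4 = 2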
| def wrapper_compute_subtree_kernel(Kmatrix, itr): | |||||
| i = itr[0] | |||||
| j = itr[1] | |||||
| return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j], Kmatrix[i][j]) | |||||
| def _wl_spkernel_do(Gn, node_label, edge_label, height): | |||||
| """Calculate Weisfeiler-Lehman shortest path kernels between graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| Gn : List of NetworkX graph | |||||
| List of graphs between which the kernels are calculated. | |||||
| node_label : string | |||||
| node attribute used as label. | |||||
| edge_label : string | |||||
| edge attribute used as label. | |||||
| height : int | |||||
| subtree height. | |||||
| Return | |||||
| ------ | |||||
| Kmatrix : Numpy matrix | |||||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
| """ | |||||
| from pygraph.utils.utils import getSPGraph | |||||
| # init. | |||||
| height = int(height) | |||||
| Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel | |||||
| Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn | |||||
| # initial for height = 0 | |||||
| for i in range(0, len(Gn)): | |||||
| for j in range(i, len(Gn)): | |||||
| for e1 in Gn[i].edges(data = True): | |||||
| for e2 in Gn[j].edges(data = True): | |||||
| if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): | |||||
| Kmatrix[i][j] += 1 | |||||
| Kmatrix[j][i] = Kmatrix[i][j] | |||||
| # iterate each height | |||||
| for h in range(1, height + 1): | |||||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
num_of_labels_occured = 0 # running count of distinct compressed labels seen so far across all graphs
| for G in Gn: # for each graph | |||||
| set_multisets = [] | |||||
| for node in G.nodes(data = True): | |||||
| # Multiset-label determination. | |||||
| multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ] | |||||
| # sorting each multiset | |||||
| multiset.sort() | |||||
| multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix | |||||
| set_multisets.append(multiset) | |||||
| # label compression | |||||
| set_unique = list(set(set_multisets)) # set of unique multiset labels | |||||
| # a dictionary mapping original labels to new ones. | |||||
| set_compressed = {} | |||||
# if a label occurred before, assign its former compressed label; otherwise assign (number of labels occurred so far + 1) as the compressed label
| for value in set_unique: | |||||
| if value in all_set_compressed.keys(): | |||||
| set_compressed.update({ value : all_set_compressed[value] }) | |||||
| else: | |||||
| set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||||
| num_of_labels_occured += 1 | |||||
| all_set_compressed.update(set_compressed) | |||||
| # relabel nodes | |||||
| for node in G.nodes(data = True): | |||||
| node[1][node_label] = set_compressed[set_multisets[node[0]]] | |||||
| # calculate subtree kernel with h iterations and add it to the final kernel | |||||
| for i in range(0, len(Gn)): | |||||
| for j in range(i, len(Gn)): | |||||
| for e1 in Gn[i].edges(data = True): | |||||
| for e2 in Gn[j].edges(data = True): | |||||
| if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): | |||||
| Kmatrix[i][j] += 1 | |||||
| Kmatrix[j][i] = Kmatrix[i][j] | |||||
| return Kmatrix | |||||
| def _wl_edgekernel_do(Gn, node_label, edge_label, height): | |||||
| """Calculate Weisfeiler-Lehman edge kernels between graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| Gn : List of NetworkX graph | |||||
| List of graphs between which the kernels are calculated. | |||||
| node_label : string | |||||
| node attribute used as label. | |||||
| edge_label : string | |||||
| edge attribute used as label. | |||||
| height : int | |||||
| subtree height. | |||||
| Return | |||||
| ------ | |||||
| Kmatrix : Numpy matrix | |||||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
| """ | |||||
| # init. | |||||
| height = int(height) | |||||
| Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel | |||||
| # initial for height = 0 | |||||
| for i in range(0, len(Gn)): | |||||
| for j in range(i, len(Gn)): | |||||
| for e1 in Gn[i].edges(data = True): | |||||
| for e2 in Gn[j].edges(data = True): | |||||
| if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): | |||||
| Kmatrix[i][j] += 1 | |||||
| Kmatrix[j][i] = Kmatrix[i][j] | |||||
| # iterate each height | |||||
| for h in range(1, height + 1): | |||||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
num_of_labels_occured = 0 # running count of distinct compressed labels seen so far across all graphs
| for G in Gn: # for each graph | |||||
| set_multisets = [] | |||||
| for node in G.nodes(data = True): | |||||
| # Multiset-label determination. | |||||
| multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ] | |||||
| # sorting each multiset | |||||
| multiset.sort() | |||||
| multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix | |||||
| set_multisets.append(multiset) | |||||
| # label compression | |||||
| set_unique = list(set(set_multisets)) # set of unique multiset labels | |||||
| # a dictionary mapping original labels to new ones. | |||||
| set_compressed = {} | |||||
# if a label occurred before, assign its former compressed label; otherwise assign (number of labels occurred so far + 1) as the compressed label
| for value in set_unique: | |||||
| if value in all_set_compressed.keys(): | |||||
| set_compressed.update({ value : all_set_compressed[value] }) | |||||
| else: | |||||
| set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||||
| num_of_labels_occured += 1 | |||||
| all_set_compressed.update(set_compressed) | |||||
| # relabel nodes | |||||
| for node in G.nodes(data = True): | |||||
| node[1][node_label] = set_compressed[set_multisets[node[0]]] | |||||
| # calculate subtree kernel with h iterations and add it to the final kernel | |||||
| for i in range(0, len(Gn)): | |||||
| for j in range(i, len(Gn)): | |||||
| for e1 in Gn[i].edges(data = True): | |||||
| for e2 in Gn[j].edges(data = True): | |||||
| if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): | |||||
| Kmatrix[i][j] += 1 | |||||
| Kmatrix[j][i] = Kmatrix[i][j] | |||||
| return Kmatrix | |||||
| def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel): | |||||
| """Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs. | |||||
| Parameters | |||||
| ---------- | |||||
| Gn : List of NetworkX graph | |||||
| List of graphs between which the kernels are calculated. | |||||
| node_label : string | |||||
| node attribute used as label. | |||||
| edge_label : string | |||||
| edge attribute used as label. | |||||
| height : int | |||||
| subtree height. | |||||
| base_kernel : string | |||||
Name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 graphs.
| Return | |||||
| ------ | |||||
| Kmatrix : Numpy matrix | |||||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
| """ | |||||
| # init. | |||||
| height = int(height) | |||||
| Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel | |||||
| # initial for height = 0 | |||||
| Kmatrix = base_kernel(Gn, node_label, edge_label) | |||||
| # iterate each height | |||||
| for h in range(1, height + 1): | |||||
| all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
num_of_labels_occured = 0 # running count of distinct compressed labels seen so far across all graphs
| for G in Gn: # for each graph | |||||
| set_multisets = [] | |||||
| for node in G.nodes(data = True): | |||||
| # Multiset-label determination. | |||||
| multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ] | |||||
| # sorting each multiset | |||||
| multiset.sort() | |||||
| multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix | |||||
| set_multisets.append(multiset) | |||||
| # label compression | |||||
| set_unique = list(set(set_multisets)) # set of unique multiset labels | |||||
| # a dictionary mapping original labels to new ones. | |||||
| set_compressed = {} | |||||
# if a label occurred before, assign its former compressed label; otherwise assign (number of labels occurred so far + 1) as the compressed label
| for value in set_unique: | |||||
| if value in all_set_compressed.keys(): | |||||
| set_compressed.update({ value : all_set_compressed[value] }) | |||||
| else: | |||||
| set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||||
| num_of_labels_occured += 1 | |||||
| all_set_compressed.update(set_compressed) | |||||
| # relabel nodes | |||||
| for node in G.nodes(data = True): | |||||
| node[1][node_label] = set_compressed[set_multisets[node[0]]] | |||||
| # calculate kernel with h iterations and add it to the final kernel | |||||
| Kmatrix += base_kernel(Gn, node_label, edge_label) | |||||
| return Kmatrix | |||||
| @@ -61,7 +61,7 @@ def polynomialkernel(x, y, d=1, c=0): | |||||
| """Polynomial kernel. | """Polynomial kernel. | ||||
| Compute the polynomial kernel between x and y: | Compute the polynomial kernel between x and y: | ||||
| K(x, y) = (x^Ty)^d + c. | |||||
| K(x, y) = <x, y> ^d + c. | |||||
| Parameters | Parameters | ||||
| ---------- | ---------- | ||||
| @@ -78,6 +78,27 @@ def polynomialkernel(x, y, d=1, c=0): | |||||
| return np.dot(x, y) ** d + c | return np.dot(x, y) ** d + c | ||||
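# Quick numeric check of the formula above (toy vectors, illustration only):
# polynomialkernel([1, 2], [3, 4], d=2, c=1) == (1*3 + 2*4)**2 + 1 == 122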
| def linearkernel(x, y): | |||||
| """Polynomial kernel. | |||||
| Compute the polynomial kernel between x and y: | |||||
| K(x, y) = <x, y>. | |||||
| Parameters | |||||
| ---------- | |||||
| x, y : array | |||||
| d : integer, default 1 | |||||
| c : float, default 0 | |||||
| Returns | |||||
| ------- | |||||
| kernel : float | |||||
| """ | |||||
| return np.dot(x, y) | |||||
| def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1): | def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1): | ||||
| """Sum of a pair of kernels. | """Sum of a pair of kernels. | ||||
| @@ -241,4 +241,22 @@ def graph_isIdentical(G1, G2): | |||||
| return False | return False | ||||
| # check graph attributes. | # check graph attributes. | ||||
| return True | |||||
| return True | |||||
| def get_node_labels(Gn, node_label): | |||||
| """Get node labels of dataset Gn. | |||||
| """ | |||||
| nl = set() | |||||
| for G in Gn: | |||||
| nl = nl | set(nx.get_node_attributes(G, node_label).values()) | |||||
| return nl | |||||
| def get_edge_labels(Gn, edge_label): | |||||
| """Get edge labels of dataset Gn. | |||||
| """ | |||||
| el = set() | |||||
| for G in Gn: | |||||
| el = el | set(nx.get_edge_attributes(G, edge_label).values()) | |||||
| return el | |||||
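def _demo_label_helpers():
    # Hedged usage sketch, not part of the original file: collect the sets of
    # node and edge labels over a toy dataset with assumed label names.
    import networkx as nx
    g1 = nx.Graph()
    g1.add_node(0, atom='C')
    g1.add_node(1, atom='O')
    g1.add_edge(0, 1, bond_type='1')
    g2 = nx.Graph()
    g2.add_node(0, atom='N')
    print(get_node_labels([g1, g2], 'atom'))       # {'C', 'O', 'N'} (order may vary)
    print(get_edge_labels([g1, g2], 'bond_type'))  # {'1'}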