New translations weisfeilerLehmanKernel.py (Chinese Simplified)

5 years ago · 83ad1949b0
--- a/lang/zh/gklearn/kernels/weisfeilerLehmanKernel.py
+++ b/lang/zh/gklearn/kernels/weisfeilerLehmanKernel.py
@@ -0,0 +1,570 @@
 """
@author: linlin

@references:

 	[1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. 
 	Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 
 	2011;12(Sep):2539-61.
 """

 import sys
 from collections import Counter
 from functools import partial
 import time
 #from multiprocessing import Pool
 from tqdm import tqdm

 import networkx as nx
 import numpy as np

 #from gklearn.kernels.pathKernel import pathkernel
 from gklearn.utils.graphdataset import get_dataset_attributes
 from gklearn.utils.parallel import parallel_gm

 # @todo: support edge kernel, sp kernel, user-defined kernel.
 def weisfeilerlehmankernel(*args,
 						   node_label='atom',
 						   edge_label='bond_type',
 						   height=0,
 						   base_kernel='subtree',
 						   parallel=None,
 						   n_jobs=None,
 						   chunksize=None,
 						   verbose=True):
 	"""Calculate Weisfeiler-Lehman kernels between graphs.

 	Parameters
 	----------
 	Gn : List of NetworkX graph
 		List of graphs between which the kernels are calculated. Passed as
 		the single positional argument.

 	G1, G2 : NetworkX graphs
 		Two graphs between which the kernel is calculated. Passed as two
 		positional arguments instead of Gn.

 	node_label : string
 		Node attribute used as label. The default node label is atom. If
 		the graphs are reported as not node-labeled, every node receives
 		the label '0' under this attribute.

 	edge_label : string
 		Edge attribute used as label. The default edge label is bond_type.

 	height : int
 		Subtree height, i.e. the number of relabeling iterations.

 	base_kernel : string or callable
 		Base kernel used in each iteration of WL kernel. The strings
 		'subtree' (default), 'sp' and 'edge' select the built-in kernels
 		and are matched case-insensitively. Any other value is handed to
 		the user-defined branch, which calls it as
 		base_kernel(Gn, node_label, edge_label), so it must be callable.

 	parallel : None or string
 		Parallelization method, forwarded to the subtree kernel only.
 		None computes the matrix serially; 'imap_unordered' delegates to
 		parallel_gm.

 	n_jobs : int
 		Number of jobs for parallelization, forwarded to the subtree
 		kernel only.

 	chunksize : int
 		Chunk size for parallelization, forwarded to the subtree kernel
 		only.

 	verbose : bool
 		Whether to print the size of the kernel matrix and the run time.

 	Return
 	------
 	Kmatrix : Numpy matrix
 		Kernel matrix, each element of which is the Weisfeiler-Lehman
 		kernel between 2 graphs.

 	run_time : float
 		Wall-clock time in seconds spent computing the matrix.

 	Notes
 	-----
 	The input graphs are copied before processing, so the node
 	relabeling performed by the kernels is applied to the copies.
 	"""
 	# pre-process
 	# Only named kernels are case-normalised; a callable base kernel has no
 	# lower() and must reach the user-defined branch untouched.
 	if isinstance(base_kernel, str):
 		base_kernel = base_kernel.lower()
 	# arrange all graphs in a list
 	Gn = args[0] if len(args) == 1 else [args[0], args[1]]
 	Gn = [g.copy() for g in Gn]
 	ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
 									  node_label=node_label)
 	if not ds_attrs['node_labeled']:
 		# The dummy label has to live under the attribute the kernels read,
 		# otherwise a non-default node_label would not be found later.
 		for G in Gn:
 			nx.set_node_attributes(G, '0', node_label)

 	start_time = time.time()

 	# for WL subtree kernel
 	if base_kernel == 'subtree':
 		Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose)

 	# for WL shortest path kernel
 	elif base_kernel == 'sp':
 		Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)

 	# for WL edge kernel
 	elif base_kernel == 'edge':
 		Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)

 	# for user defined base kernel
 	else:
 		Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel)

 	run_time = time.time() - start_time
 	if verbose:
 		# len(Gn) rather than len(args[0]): with two positional graphs
 		# args[0] is a single graph and its length is a node count.
 		print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---"
 			  % (base_kernel, len(Gn), run_time))

 	return Kmatrix, run_time


 def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose):
 	"""Calculate Weisfeiler-Lehman subtree kernels between graphs.

 	Parameters
 	----------
 	Gn : List of NetworkX graph
 		List of graphs between which the kernels are calculated. Their
 		node labels are overwritten in place at every iteration.
 	node_label : string
 		node attribute used as label.
 	edge_label : string
 		edge attribute used as label (not used by the subtree kernel).
 	height : int
 		wl height, i.e. number of relabeling iterations.
 	parallel, n_jobs, chunksize :
 		Forwarded unchanged to compute_kernel_matrix.
 	verbose : bool
 		Accepted for interface symmetry; compute_kernel_matrix is always
 		called with verbose set to False.

 	Return
 	------
 	Kmatrix : Numpy matrix
 		Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
 	"""
 	height = int(height)
 	Kmatrix = np.zeros((len(Gn), len(Gn)))

 	# Iteration 0: per-graph histogram of the original node labels.
 	label_histograms = [
 		dict(Counter(list(nx.get_node_attributes(G, node_label).values())))
 		for G in Gn
 	]
 	compute_kernel_matrix(Kmatrix, label_histograms, Gn, parallel, n_jobs, chunksize, False)

 	for _ in range(height):
 		# Mapping multiset label -> compressed label, shared by all graphs
 		# within one iteration so equal multisets get equal new labels.
 		compressed_of = {}
 		labels_issued = 0
 		label_histograms = []

 		# @todo: parallel this part.
 		for G in Gn:
 			# Multiset-label determination: own label followed by the
 			# sorted labels of the neighbours.
 			signatures = []
 			for node, attrs in G.nodes(data=True):
 				neighbour_labels = sorted(G.nodes[nb][node_label] for nb in G[node])
 				signatures.append(tuple([attrs[node_label]] + neighbour_labels))

 			# Label compression: reuse a label issued earlier in this
 			# iteration, otherwise issue the next running number.
 			for signature in list(set(signatures)):
 				if signature not in compressed_of:
 					labels_issued += 1
 					compressed_of[signature] = str(labels_issued)

 			# Relabel nodes; signatures follows the node iteration order.
 			for node, signature in zip(G.nodes(), signatures):
 				G.nodes[node][node_label] = compressed_of[signature]

 			new_labels = list(nx.get_node_attributes(G, node_label).values())
 			label_histograms.append(dict(Counter(new_labels)))

 		# Add the subtree kernel of this iteration to the running total.
 		compute_kernel_matrix(Kmatrix, label_histograms, Gn, parallel, n_jobs, chunksize, False)

 	return Kmatrix


 def wl_iteration(G, node_label):
 	"""Build the Weisfeiler-Lehman multiset label of every node of G.

 	Parameters
 	----------
 	G : NetworkX graph
 		Graph whose nodes carry the attribute node_label.
 	node_label : string
 		node attribute used as label.

 	Return
 	------
 	all_multisets : list of tuple
 		One tuple per node, in the iteration order of G.nodes: the node's
 		own label first, then the sorted labels of its neighbours.
 	"""
 	signatures = []
 	for node, attrs in G.nodes(data=True):
 		# Sorting makes the signature independent of neighbour order.
 		neighbour_labels = sorted(G.nodes[nb][node_label] for nb in G[node])
 		signatures.append((attrs[node_label], *neighbour_labels))
 	return signatures
 	
 #	# a dictionary mapping original labels to new ones. 
 #	set_compressed = {}
 #	# if a label occured before, assign its former compressed label, 
 #	# else assign the number of labels occured + 1 as the compressed label. 
 #	for value in set_unique:
 #		if value in all_set_compressed.keys():
 #			set_compressed.update({value: all_set_compressed[value]})
 #		else:
 #			set_compressed.update({value: str(num_of_labels_occured + 1)})
 #			num_of_labels_occured += 1
 #
 #	all_set_compressed.update(set_compressed)
 #
 #	# relabel nodes
 #	for idx, node in enumerate(G.nodes()):
 #		G.nodes[node][node_label] = set_compressed[all_multisets[idx]]
 #
 #	# get the set of compressed labels
 #	labels_comp = list(nx.get_node_attributes(G, node_label).values())
 #	all_labels_ori.update(labels_comp)
 #	all_num_of_each_label.append(dict(Counter(labels_comp)))
 #	return


 def wrapper_wl_iteration(node_label, itr_item):
 	"""Run wl_iteration on one (graph, index) work item.

 	itr_item[0] is the graph and itr_item[1] its index; the index is
 	returned together with the multiset labels so that results can be
 	matched back to their graph.
 	"""
 	graph, index = itr_item[0], itr_item[1]
 	return index, wl_iteration(graph, node_label)


 def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, verbose):
 	"""Add the subtree base kernel of one iteration to Kmatrix in place.

 	Parameters
 	----------
 	Kmatrix : Numpy matrix
 		Running kernel matrix; its entries are updated in place.
 	all_num_of_each_label : list of dict
 		For each graph, a mapping from label to its number of occurrences.
 	Gn : List of NetworkX graph
 		The graphs; only forwarded to parallel_gm.
 	parallel : None or string
 		None computes the upper triangle serially and mirrors it;
 		'imap_unordered' delegates the computation to parallel_gm.
 	n_jobs, chunksize, verbose :
 		Forwarded to parallel_gm in the parallel case.

 	Raises
 	------
 	ValueError
 		If parallel is neither None nor 'imap_unordered'. Previously such
 		a value silently left Kmatrix unchanged.
 	"""
 	if parallel == 'imap_unordered':
 		# compute kernels.
 		def init_worker(alllabels_toshare):
 			# Publishes the label counts as a module-level global, which is
 			# where wrapper_compute_subtree_kernel reads them from.
 			global G_alllabels
 			G_alllabels = alllabels_toshare
 		do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
 		parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
 					glbv=(all_num_of_each_label,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
 	elif parallel is None:
 		size = len(Kmatrix)
 		for i in range(size):
 			for j in range(i, size):
 				Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i],
 					   all_num_of_each_label[j], Kmatrix[i][j])
 				# The kernel is symmetric, so mirror the upper triangle.
 				Kmatrix[j][i] = Kmatrix[i][j]
 	else:
 		raise ValueError(
 			"unsupported parallel method %r; expected None or 'imap_unordered'"
 			% (parallel,))


 def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):
 	"""Compute the subtree kernel of two label histograms.

 	Both arguments map a label to its number of occurrences in one graph.
 	The histograms are expanded to count vectors over the union of their
 	labels (0 where a label is absent); the dot product of the two
 	vectors is added to kernel and the sum is returned.
 	"""
 	all_labels = set(num_of_each_label1) | set(num_of_each_label2)
 	counts1 = np.array([num_of_each_label1.get(label, 0) for label in all_labels])
 	counts2 = np.array([num_of_each_label2.get(label, 0) for label in all_labels])
 	kernel += np.dot(counts1, counts2)
 	return kernel


 def wrapper_compute_subtree_kernel(Kmatrix, itr):
 	"""Compute the subtree kernel for one (i, j) index pair.

 	The label histograms are read from the module-level global
 	G_alllabels. Returns i, j and the updated kernel value for that pair.
 	"""
 	i, j = itr[0], itr[1]
 	updated = compute_subtree_kernel(G_alllabels[i], G_alllabels[j], Kmatrix[i][j])
 	return i, j, updated
 				

 def _wl_spkernel_do(Gn, node_label, edge_label, height):
 	"""Calculate Weisfeiler-Lehman shortest path kernels between graphs.

 	Parameters
 	----------
 	Gn : List of NetworkX graph
 		List of graphs between which the kernels are calculated.
 	node_label : string
 		node attribute used as label.
 	edge_label : string
 		edge attribute used as label; passed to getSPGraph as edge_weight.
 	height : int
 		subtree height, i.e. number of relabeling iterations.

 	Return
 	------
 	Kmatrix : Numpy matrix
 		Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
 	"""
 	from gklearn.utils.utils import getSPGraph

 	# init.
 	height = int(height)
 	Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel

 	Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn] # get shortest path graphs of Gn

 	def add_matching_edges():
 		"""Add the number of matching edge pairs of every graph pair to Kmatrix."""
 		for i in range(len(Gn)):
 			for j in range(i, len(Gn)):
 				for e1 in Gn[i].edges(data=True):
 					for e2 in Gn[j].edges(data=True):
 						# NOTE(review): the endpoints compared here are node
 						# identifiers, not node labels, so the relabeling
 						# below does not influence the counts -- confirm
 						# whether labels were intended.
 						same_ends = ((e1[0] == e2[0] and e1[1] == e2[1])
 									 or (e1[0] == e2[1] and e1[1] == e2[0]))
 						# 'cost' is read from the graphs returned by
 						# getSPGraph; presumably the path length -- confirm.
 						if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and same_ends:
 							Kmatrix[i][j] += 1
 				Kmatrix[j][i] = Kmatrix[i][j]

 	# initial for height = 0
 	add_matching_edges()

 	# iterate each height
 	for _ in range(height):
 		all_set_compressed = {} # maps multiset labels to compressed labels for all graphs in this iteration
 		num_of_labels_occured = 0 # number of compressed labels issued so far in this iteration
 		for G in Gn: # for each graph
 			# Multiset labels as tuples: concatenating label strings is
 			# ambiguous ('1' + '12' equals '11' + '2').
 			all_multisets = wl_iteration(G, node_label)

 			# label compression: reuse a label issued before, else issue
 			# the next running number.
 			for value in set(all_multisets):
 				if value not in all_set_compressed:
 					num_of_labels_occured += 1
 					all_set_compressed[value] = str(num_of_labels_occured)

 			# relabel nodes; pair by position because node identifiers
 			# need not be the integers 0..n-1.
 			for node, multiset in zip(G.nodes(), all_multisets):
 				G.nodes[node][node_label] = all_set_compressed[multiset]

 		# calculate the kernel with this iteration and add it to the final kernel
 		add_matching_edges()

 	return Kmatrix



 def _wl_edgekernel_do(Gn, node_label, edge_label, height):
 	"""Calculate Weisfeiler-Lehman edge kernels between graphs.

 	Parameters
 	----------
 	Gn : List of NetworkX graph
 		List of graphs between which the kernels are calculated. Their
 		node labels are overwritten in place at every iteration.
 	node_label : string
 		node attribute used as label.
 	edge_label : string
 		edge attribute used as label.
 	height : int
 		subtree height, i.e. number of relabeling iterations.

 	Return
 	------
 	Kmatrix : Numpy matrix
 		Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
 	"""
 	# init.
 	height = int(height)
 	Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel

 	def add_matching_edges():
 		"""Add the number of matching edge pairs of every graph pair to Kmatrix."""
 		for i in range(len(Gn)):
 			for j in range(i, len(Gn)):
 				for e1 in Gn[i].edges(data=True):
 					for e2 in Gn[j].edges(data=True):
 						# NOTE(review): the endpoints compared here are node
 						# identifiers, not node labels, so the relabeling
 						# below does not influence the counts -- confirm
 						# whether labels were intended.
 						same_ends = ((e1[0] == e2[0] and e1[1] == e2[1])
 									 or (e1[0] == e2[1] and e1[1] == e2[0]))
 						if e1[2][edge_label] == e2[2][edge_label] and same_ends:
 							Kmatrix[i][j] += 1
 				Kmatrix[j][i] = Kmatrix[i][j]

 	# initial for height = 0
 	add_matching_edges()

 	# iterate each height
 	for _ in range(height):
 		all_set_compressed = {} # maps multiset labels to compressed labels for all graphs in this iteration
 		num_of_labels_occured = 0 # number of compressed labels issued so far in this iteration
 		for G in Gn: # for each graph
 			# Multiset labels as tuples: concatenating label strings is
 			# ambiguous ('1' + '12' equals '11' + '2').
 			all_multisets = wl_iteration(G, node_label)

 			# label compression: reuse a label issued before, else issue
 			# the next running number.
 			for value in set(all_multisets):
 				if value not in all_set_compressed:
 					num_of_labels_occured += 1
 					all_set_compressed[value] = str(num_of_labels_occured)

 			# relabel nodes; pair by position because node identifiers
 			# need not be the integers 0..n-1.
 			for node, multiset in zip(G.nodes(), all_multisets):
 				G.nodes[node][node_label] = all_set_compressed[multiset]

 		# calculate the kernel with this iteration and add it to the final kernel
 		add_matching_edges()

 	return Kmatrix


 def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
 	"""Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs.

 	Parameters
 	----------
 	Gn : List of NetworkX graph
 		List of graphs between which the kernels are calculated. Their
 		node labels are overwritten in place at every iteration.
 	node_label : string
 		node attribute used as label.
 	edge_label : string
 		edge attribute used as label.
 	height : int
 		subtree height, i.e. number of relabeling iterations.
 	base_kernel : callable
 		Base kernel used in each iteration of WL kernel. It is called as
 		base_kernel(Gn, node_label, edge_label) and its results are summed
 		with +=, so it is expected to return a Numpy matrix holding the
 		base kernel between every 2 graphs.

 	Return
 	------
 	Kmatrix : Numpy matrix
 		Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.

 	Raises
 	------
 	TypeError
 		If base_kernel is not callable.
 	"""
 	if not callable(base_kernel):
 		raise TypeError(
 			'base_kernel must be a callable taking (Gn, node_label, edge_label), got %r'
 			% (base_kernel,))

 	# init.
 	height = int(height)

 	# initial for height = 0
 	Kmatrix = base_kernel(Gn, node_label, edge_label)

 	# iterate each height
 	for _ in range(height):
 		all_set_compressed = {} # maps multiset labels to compressed labels for all graphs in this iteration
 		num_of_labels_occured = 0 # number of compressed labels issued so far in this iteration
 		for G in Gn: # for each graph
 			# Multiset labels as tuples: concatenating label strings is
 			# ambiguous ('1' + '12' equals '11' + '2').
 			all_multisets = wl_iteration(G, node_label)

 			# label compression: reuse a label issued before, else issue
 			# the next running number.
 			for value in set(all_multisets):
 				if value not in all_set_compressed:
 					num_of_labels_occured += 1
 					all_set_compressed[value] = str(num_of_labels_occured)

 			# relabel nodes; pair by position because node identifiers
 			# need not be the integers 0..n-1.
 			for node, multiset in zip(G.nodes(), all_multisets):
 				G.nodes[node][node_label] = all_set_compressed[multiset]

 		# calculate kernel with this iteration and add it to the final kernel
 		Kmatrix += base_kernel(Gn, node_label, edge_label)

 	return Kmatrix