You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

tensor_handler.py 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Define the tensor stream handler."""
  16. from collections import namedtuple
  17. import numpy as np
  18. from mindinsight.datavisual.data_transform.graph.node import NodeTypeEnum
  19. from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError
  20. from mindinsight.debugger.common.log import LOGGER as log
  21. from mindinsight.debugger.proto.ms_graph_pb2 import DataType
  22. from mindinsight.debugger.stream_cache.tensor import OpTensor, ConstTensor
  23. from mindinsight.debugger.stream_handler.base_handler import StreamHandlerBase
  24. from mindinsight.utils.tensor import TensorUtils, TensorComparison
# Basic identification of a tensor: its full name, the type of its node, and an
# `iter` marker ('' for the current step, 'prev' for the previous step).
TensorBasicInfo = namedtuple('tensor_basic_info', ['full_name', 'node_type', 'iter'])
  26. class TensorHandler(StreamHandlerBase):
  27. """Metadata Handler."""
  28. def __init__(self):
  29. self._const_vals = {}
  30. self._tensors = {}
  31. self._cur_step = 0
  32. @property
  33. def cur_step(self):
  34. """The property of current step."""
  35. return self._cur_step
  36. @property
  37. def prev_step(self):
  38. """The property of previous step."""
  39. return self._cur_step - 1
  40. def put(self, value):
  41. """
  42. Put value into tensor cache. Called by grpc server.
  43. Args:
  44. value (dict): The Tensor proto message.
  45. - step (int): The current step of tensor.
  46. - tensor_protos (list[TensorProto]): The tensor proto.
  47. Returns:
  48. bool, the tensor has updated successfully.
  49. """
  50. tensor_protos = value.get('tensor_protos')
  51. merged_tensor = self._get_merged_tensor(tensor_protos)
  52. step = value.get('step', 0)
  53. if merged_tensor.iter and step > 0:
  54. log.debug("Received previous tensor.")
  55. step -= 1
  56. tensor = OpTensor(merged_tensor, step)
  57. flag = self._put_tensor_into_cache(tensor, step)
  58. log.info("Put tensor %s of step: %d, into cache. Flag: %s", tensor.name, step, flag)
  59. return flag
  60. @staticmethod
  61. def _get_merged_tensor(tensor_protos):
  62. """
  63. Merged list of parsed tensor value into one.
  64. Args:
  65. tensor_protos (list[TensorProto]): List of tensor proto.
  66. Returns:
  67. TensorProto, merged tensor proto.
  68. """
  69. merged_tensor = tensor_protos[-1]
  70. if len(tensor_protos) > 1:
  71. tensor_value = bytes()
  72. for tensor_proto in tensor_protos:
  73. if not tensor_proto.tensor_content:
  74. log.warning("Doesn't find tensor value for %s:%s",
  75. tensor_proto.node_name, tensor_proto.slot)
  76. break
  77. tensor_value += tensor_proto.tensor_content
  78. merged_tensor.tensor_content = tensor_value
  79. log.debug("Merge multi tensor values into one.")
  80. return merged_tensor
  81. def _put_tensor_into_cache(self, tensor, step):
  82. """
  83. Put tensor into cache.
  84. Args:
  85. tensor (OpTensor): The tensor value.
  86. step (int): The step of tensor.
  87. Returns:
  88. bool, the tensor has updated successfully.
  89. """
  90. cache_tensor = self._tensors.get(tensor.name)
  91. if cache_tensor is None:
  92. cache_tensor = {}
  93. self._tensors[tensor.name] = cache_tensor
  94. old_tensor = cache_tensor.get(step)
  95. if old_tensor and not self._is_value_diff(old_tensor.value, tensor.value):
  96. log.debug("Tensor %s of step %s has no change. Ignore it.", tensor.name, step)
  97. return False
  98. cache_tensor[step] = tensor
  99. log.debug("Put updated tensor value for %s of step %s.", tensor.name, step)
  100. return True
  101. @staticmethod
  102. def _is_value_diff(old_value, new_value):
  103. """Check tensor value if there are equal."""
  104. log.debug("old value type: %s, new_value type: %s", type(old_value), type(new_value))
  105. if old_value is None and new_value is None:
  106. return False
  107. flag = old_value != new_value
  108. if isinstance(flag, np.ndarray):
  109. return flag.any()
  110. return flag
  111. def put_const_vals(self, const_vals):
  112. """
  113. Put const value into tensor cache.
  114. Args:
  115. const_vals (list[NamedValueProto]): List of const values.
  116. """
  117. for const_val in const_vals:
  118. if not (const_val.value and const_val.key):
  119. continue
  120. if DataType.Name(const_val.value.dtype) == "DT_TENSOR":
  121. tensor_proto = const_val.value.tensor_val
  122. tensor_proto.node_name = const_val.key
  123. tensor_proto.slot = '0'
  124. const_tensor = OpTensor(tensor_proto)
  125. else:
  126. const_tensor = ConstTensor(const_val)
  127. self._const_vals[const_tensor.name] = const_tensor
  128. def get(self, filter_condition=None):
  129. """
  130. Get full tensor value.
  131. Args:
  132. filter_condition (dict): Filter condition.
  133. - name (str): The full name of tensor.
  134. - node_type (str): The type of the node.
  135. - prev (bool): Whether to get previous tensor.
  136. Returns:
  137. dict, the tensor_value.
  138. """
  139. name = filter_condition.get('name')
  140. node_type = filter_condition.get('node_type')
  141. shape = filter_condition.get('shape')
  142. if filter_condition.get('prev'):
  143. step = self.prev_step
  144. else:
  145. step = self.cur_step
  146. tensor = self._get_tensor(name, node_type, step)
  147. if not tensor:
  148. log.error("No tensor named %s at the step %s", name, step)
  149. raise DebuggerParamValueError("No tensor named {}".format(name))
  150. tensor_info = tensor.get_full_info(shape)
  151. self._update_has_prev_step_field(tensor_info, name, node_type)
  152. return {'tensor_value': tensor_info}
  153. def _get_tensor(self, tensor_name, node_type=None, step=None):
  154. """
  155. Get tensor according to tensor name and node_type.
  156. Args:
  157. tensor_name (str): Tensor name, format like `node_name:slot`.
  158. node_type (str): Node type.
  159. step (int): The step of tensor info. Default: None.
  160. Returns:
  161. Union[OPTensor, ConstTensor], the tensor object.
  162. """
  163. if step is None:
  164. step = self._cur_step
  165. tensor = self._tensors.get(tensor_name, {}).get(step)
  166. if not tensor and node_type == NodeTypeEnum.CONST.value:
  167. const_name = tensor_name.rsplit('/', 1)[-1]
  168. tensor = self._const_vals.get(const_name)
  169. if tensor:
  170. self._tensors[tensor_name] = {step: tensor}
  171. return tensor
  172. def _get_basic_info(self, tensor_name, node_type=None):
  173. """Get the latest basic tensor info by tensor name."""
  174. tensor = self._get_tensor(tensor_name, node_type)
  175. if tensor:
  176. return tensor.get_basic_info()
  177. return None
  178. def update_tensor_history(self, tensor_history):
  179. """
  180. Add tensor basic info in tensor_history.
  181. Args:
  182. tensor_history (dict): Tensor history, including a list of tensor name and type.
  183. Returns:
  184. list[dict], the list of tensor basic info cache.
  185. """
  186. missed_tensors = []
  187. for tensor_info in tensor_history.get('tensor_history'):
  188. tensor_name = tensor_info.get('full_name')
  189. node_type = tensor_info.get('node_type')
  190. basic_info = self._get_basic_info(tensor_name, node_type)
  191. # add `has_prev_step` field to tensor basic info.
  192. missing_tensors_info = self._update_has_prev_step_field(basic_info, tensor_name, node_type)
  193. if basic_info:
  194. tensor_info.update(basic_info)
  195. if missing_tensors_info:
  196. missed_tensors.extend(missing_tensors_info)
  197. return missed_tensors
  198. def _update_has_prev_step_field(self, tensor_info, tensor_name, node_type):
  199. """Update has_prev_step field in tensor info."""
  200. missing_tensors_info = self._get_missing_tensor_info(tensor_name, node_type)
  201. if not missing_tensors_info and node_type == NodeTypeEnum.PARAMETER.value and self.cur_step > 0:
  202. tensor_info['has_prev_step'] = True
  203. return missing_tensors_info
  204. def _get_missing_tensor_info(self, tensor_name, node_type):
  205. """
  206. Get missing tensor infos.
  207. Args:
  208. tensor_name (str): The full name of Tensor.
  209. node_type (str): The type of the relative node.
  210. Returns:
  211. list, list of missing tensor basic information.
  212. """
  213. step = self.cur_step
  214. missing_tensors_info = []
  215. # check the current step value is missing
  216. if self._is_tensor_value_missing(tensor_name, step):
  217. missing_tensors_info.append(TensorBasicInfo(full_name=tensor_name, node_type=node_type, iter=''))
  218. log.debug("Add current step view cmd for %s", tensor_name)
  219. # check the previous step value is missing
  220. if node_type == NodeTypeEnum.PARAMETER.value and self._is_tensor_value_missing(tensor_name, step - 1):
  221. missing_tensors_info.append(TensorBasicInfo(full_name=tensor_name, node_type=node_type, iter='prev'))
  222. log.debug("Add previous view cmd for %s", tensor_name)
  223. return missing_tensors_info
  224. def _is_tensor_value_missing(self, tensor_name, step):
  225. """
  226. Get the status of tensor value of previous step.
  227. Args:
  228. tensor_name (str): Tensor name.
  229. step (int): The step of the tensor.
  230. Returns:
  231. Union[None, bool], the status of tensor value. If False, there is valid
  232. tensor value. If True, the tensor value should be queried from client.
  233. If None, ignore.
  234. """
  235. if step < 0:
  236. return None
  237. tensor = self._get_tensor(tensor_name, step=step)
  238. return bool(not tensor or tensor.empty)
  239. def get_valid_tensor_by_name(self, tensor_name, prev=False):
  240. """Get tensor value by name in numpy type."""
  241. step = self.prev_step if prev else self.cur_step
  242. if step < 0:
  243. log.warning("%d step has no previous value for tensor: %s", self.cur_step, tensor_name)
  244. return None
  245. tensor = self._get_tensor(tensor_name, step=step)
  246. if tensor and tensor.empty:
  247. log.warning("%s has empty value.", tensor_name)
  248. return None
  249. return tensor
  250. def clean_tensors(self, cur_step):
  251. """Clean the tensor cache."""
  252. self._cur_step = cur_step
  253. expired_tensor = []
  254. for tensor_name, tensor in self._tensors.items():
  255. expired_step = [step for step in tensor.keys() if step <= cur_step - 2]
  256. for step in expired_step:
  257. tensor.pop(step)
  258. if not tensor:
  259. expired_tensor.append(tensor_name)
  260. for tensor_name in expired_tensor:
  261. self._tensors.pop(tensor_name)
  262. def get_tensors_diff(self, tensor_name, shape, tolerance=0):
  263. """
  264. Get tensor comparisons data for given name, detail, shape and tolerance.
  265. Args:
  266. tensor_name (str): The name of tensor for cache.
  267. shape (tuple): Specify concrete dimensions of shape.
  268. tolerance (str): Specify tolerance of difference between current step tensor and previous
  269. step tensor. Default value is 0. Its is a percentage. The boundary value is equal to
  270. max(abs(min),abs(max)) * tolerance. The function of min and max is being used to
  271. calculate the min value and max value of the result of the current step tensor subtract
  272. the previous step tensor. If the absolute value of result is less than or equal to
  273. boundary value, the result will set to be zero.
  274. Raises:
  275. DebuggerParamValueError, If get current step node and previous step node failed or
  276. the type of tensor value is not numpy.ndarray."
  277. Returns:
  278. dict, the retrieved data.
  279. """
  280. curr_tensor = self.get_valid_tensor_by_name(tensor_name)
  281. prev_tensor = self.get_valid_tensor_by_name(tensor_name, prev=True)
  282. if not (curr_tensor and prev_tensor):
  283. log.error("Get current step and previous step for this tensor name %s failed.", tensor_name)
  284. raise DebuggerParamValueError(f"Get current step and previous step for this tensor name "
  285. f"{tensor_name} failed.")
  286. curr_tensor_slice = curr_tensor.get_tensor_value_by_shape(shape)
  287. prev_tensor_slice = prev_tensor.get_tensor_value_by_shape(shape)
  288. # get tensor comparison basic info
  289. tensor_info = curr_tensor.get_basic_info()
  290. tensor_info.pop('has_prev_step')
  291. tensor_info.pop('value')
  292. # calculate tensor comparision object
  293. tensor_comparison = curr_tensor.tensor_comparison
  294. if not tensor_comparison or tensor_comparison.tolerance != tolerance:
  295. if curr_tensor.value.shape != prev_tensor.value.shape:
  296. raise DebuggerParamValueError("The shape of these two step tensors is not the same.")
  297. tensor_diff = TensorUtils.calc_diff_between_two_tensor(curr_tensor.value, prev_tensor.value, tolerance)
  298. stats = TensorUtils.get_statistics_from_tensor(tensor_diff)
  299. tensor_comparison = TensorComparison(tolerance, stats, tensor_diff)
  300. curr_tensor.update_tensor_comparisons(tensor_comparison)
  301. # calculate diff value
  302. # the type of curr_tensor_slice is one of np.ndarray or str
  303. if isinstance(curr_tensor_slice, np.ndarray) and isinstance(prev_tensor_slice, np.ndarray):
  304. if not shape:
  305. tensor_diff_slice = tensor_comparison.value
  306. else:
  307. tensor_diff_slice = tensor_comparison.value[shape]
  308. result = np.stack([prev_tensor_slice, curr_tensor_slice, tensor_diff_slice], axis=-1)
  309. tensor_info['diff'] = result.tolist()
  310. elif isinstance(curr_tensor_slice, str):
  311. tensor_info['diff'] = curr_tensor_slice
  312. # add comparision statistics
  313. tensor_info.update(self._get_comparison_statistics(curr_tensor, prev_tensor))
  314. reply = {'tensor_value': tensor_info}
  315. return reply
  316. @staticmethod
  317. def _get_comparison_statistics(curr_tensor, prev_tensor):
  318. """Get comparison statistics."""
  319. stats_info = {}
  320. diff_tensor_stats = curr_tensor.tensor_comparison.stats
  321. curr_tensor_stats = TensorUtils.get_statistics_from_tensor(curr_tensor.value)
  322. prev_tensor_stats = TensorUtils.get_statistics_from_tensor(prev_tensor.value)
  323. stats_info['curr_step_statistics'] = TensorUtils.get_overall_statistic_dict(overall_stats=curr_tensor_stats)
  324. stats_info['prev_step_statistics'] = TensorUtils.get_overall_statistic_dict(overall_stats=prev_tensor_stats)
  325. stats_info['statistics'] = TensorUtils.get_overall_statistic_dict(overall_stats=diff_tensor_stats)
  326. return stats_info
  327. def get_tensor_info_for_tensor_graph(self, tensor_name, node_type):
  328. """
  329. Get Tensor info for tensor graphs.
  330. Args:
  331. tensor_name (str): Tensor name, format like `node_name:slot`.
  332. node_type (str): Node type.
  333. Returns:
  334. dict, tensor infos, including overall statistics, tensor shape and has_prev_step info.
  335. list, list of missing tensor basic information.
  336. """
  337. res = {}
  338. tensor = self._get_tensor(tensor_name, node_type)
  339. if tensor and not tensor.empty:
  340. res['statistics'] = tensor.get_tensor_statistics()
  341. res['shape'] = tensor.shape
  342. missing_tensors = self._update_has_prev_step_field(res, tensor_name, node_type)
  343. return res, missing_tensors