You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

graph_handler.py 24 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Define the graph stream handler."""
  16. from mindinsight.debugger.conditionmgr.common.utils import NodeBasicInfo
  17. from mindinsight.debugger.conditionmgr.condition import TargetTypeEnum as CategoryTypeEnum
  18. from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError, \
  19. DebuggerNodeNotInGraphError, DebuggerGraphNotExistError
  20. from mindinsight.debugger.common.log import LOGGER as log
  21. from mindinsight.debugger.common.utils import is_scope_type
  22. from mindinsight.debugger.stream_cache.debugger_graph import DebuggerGraph
  23. from mindinsight.debugger.stream_cache.debugger_multigraph import DebuggerMultiGraph
  24. from mindinsight.debugger.stream_handler.base_handler import StreamHandlerBase
  25. class GraphHandler(StreamHandlerBase):
  26. """Metadata Handler."""
  27. def __init__(self):
  28. # dict of <graph_name, GraphProto object>
  29. self._graph_proto = {}
  30. # dict of <graph_name, DebuggerGraph object>
  31. self._graph = {}
  32. self._searched_node_list = {}
  33. # list of node names in bfs order
  34. self.bfs_order = []
  35. # dict of <node full name, graph_name>
  36. self.graph_node_map = {}
  37. # dict of <node ui name, Node object> for all graphs
  38. self._all_leaf_nodes = {}
  39. # the whole graph
  40. self._whole_graph = None
  41. @property
  42. def whole_graph(self):
  43. """The property of whole_graph."""
  44. return self._whole_graph
  45. @property
  46. def graph(self):
  47. """The property of graph."""
  48. return self._graph_proto
  49. @property
  50. def graph_names(self):
  51. """The property of graph names."""
  52. return list(self._graph)
  53. @property
  54. def debugger_graph_obj(self):
  55. """The property of graph object."""
  56. return self._graph
  57. def put(self, value):
  58. """
  59. Put value into graph cache. Called by grpc server.
  60. Args:
  61. value (GraphProto): The Graph proto message.
  62. """
  63. log.info("Put graph into cache.")
  64. sorted_value_list = self._sort_graph(value)
  65. for graph_name, graph_value in sorted_value_list:
  66. self._graph_proto[graph_name] = graph_value
  67. # build sub graph
  68. graph = DebuggerGraph()
  69. graph.build_graph(graph_value)
  70. self._graph[graph_name] = graph
  71. self.bfs_order.extend(graph.get_bfs_order())
  72. leaf_nodes = graph.leaf_nodes
  73. self._all_leaf_nodes.update(leaf_nodes)
  74. for _, node in leaf_nodes.items():
  75. self.graph_node_map[node.full_name] = graph_name
  76. # build whole graph
  77. graph = DebuggerMultiGraph()
  78. graph.add_graph(self._graph)
  79. self._whole_graph = graph
  80. def get(self, filter_condition=None):
  81. """
  82. Get the graph of specific node.
  83. Args:
  84. filter_condition (dict):
  85. - name (str): The full debug node name.
  86. - graph_name (str): The relative graph_name of the node.
  87. - single_node (bool): If True, return the graph from root
  88. to the specific node; else, return the sublayer of the
  89. graph. Default: False.
  90. Returns:
  91. dict, the metadata.
  92. """
  93. try:
  94. self._graph_exists()
  95. except DebuggerGraphNotExistError:
  96. log.warning('The graph is empty. To view a graph, '
  97. 'please start the training script first.')
  98. return {'graph': {}}
  99. graph = {}
  100. if filter_condition is None:
  101. filter_condition = {}
  102. graph = {'graph_names': self.graph_names}
  103. single_node = filter_condition.get('single_node', False)
  104. name = filter_condition.get('name')
  105. graph_name = filter_condition.get('graph_name')
  106. if single_node is True:
  107. nodes = self._get_single_node(name, graph_name)
  108. else:
  109. nodes = self._list_nodes(name, graph_name)
  110. graph.update(nodes)
  111. return {'graph': graph}
  112. def _get_single_node(self, name, graph_name=None):
  113. """
  114. Search node, and return every layer nodes until this node.
  115. Args:
  116. graph_name(str): The graph_name.
  117. name (str): The name of node.
  118. Returns:
  119. dict, every layer nodes until this node.
  120. """
  121. if graph_name:
  122. graph = self._get_graph(graph_name=graph_name)
  123. searched_graph = graph.search_single_node(name)
  124. else:
  125. searched_graph = self._whole_graph.search_single_node(name)
  126. return searched_graph
  127. def _list_nodes(self, scope, graph_name):
  128. """
  129. Get the nodes of every layer in graph.
  130. Args:
  131. scope (str): The name of a scope.
  132. graph_name(str): The graph name.
  133. Returns:
  134. TypedDict{'nodes': ['Node_1', ...], 'graph_names': ['graph_name_1', ...]},
  135. format is {'nodes': [<NodeObject>], 'graph_names': [<str>]}.
  136. example:
  137. {
  138. "nodes" : [
  139. {
  140. "attr" :
  141. {
  142. "index" : "i: 0\n"
  143. },
  144. "input" : {},
  145. "name" : "input_tensor",
  146. "output" :
  147. {
  148. "Default/TensorAdd-op17" :
  149. {
  150. "edge_type" : "data",
  151. "scope" : "name_scope",
  152. "shape" : [1, 16, 128, 128]
  153. }
  154. },
  155. "output_i" : -1,
  156. "proxy_input" : {},
  157. "proxy_output" : {},
  158. "independent_layout" : False,
  159. "subnode_count" : 0,
  160. "type" : "Data"
  161. }
  162. ]
  163. }
  164. """
  165. if graph_name:
  166. graph = self._get_graph(graph_name, scope)
  167. nodes = graph.list_node_by_scope(scope=scope)
  168. res = {'nodes': nodes}
  169. else:
  170. nodes = self._whole_graph.list_node_by_scope(scope=scope)
  171. res = {'nodes': nodes}
  172. return res
  173. def get_tensor_history(self, node_name, graph_name=None, depth=0):
  174. """
  175. Get the tensor history of a specified node.
  176. Args:
  177. node_name (str): The debug name of the node.
  178. graph_name (str): The graph_name. Default: None.
  179. depth (int): The number of layers the user
  180. wants to trace. Default is 0.
  181. Returns:
  182. dict, basic tensor history, only including tensor name and tensor type and node type.
  183. """
  184. graph_name, node_name = self._parse_node_name(node_name, graph_name)
  185. graph = self._get_graph(graph_name=graph_name, node_name=node_name)
  186. # validate node type, scope node has no tensor history
  187. node_type = graph.get_node_type(node_name)
  188. if is_scope_type(node_type):
  189. log.error("Scope type node has no tensor history.")
  190. raise DebuggerParamValueError("Invalid leaf node name.")
  191. # get tensor history
  192. tensor_history, cur_outputs_nums = graph.get_tensor_history(node_name, depth)
  193. # add the tensor type for tensor history
  194. self._update_tensor_history(tensor_history[0:cur_outputs_nums], 'output', graph_name)
  195. self._update_tensor_history(tensor_history[cur_outputs_nums:], 'input', graph_name)
  196. log.debug("Get %d tensors in tensor history for node <%s>.", len(tensor_history), node_name)
  197. return {'tensor_history': tensor_history}
  198. @staticmethod
  199. def _update_tensor_history(tensor_history, tensor_type, graph_name):
  200. """
  201. Add tensor source type for tensor history.
  202. Args:
  203. tensor_history (list[dict]): Tensor history from Graph stream. Each element has two
  204. keys: `node_type` and `name`. `node_type` refers to the type of the node which
  205. the tensor come from. `name` refers to the tensor name.
  206. tensor_type (str): The source type of the tensor. `input` or `output`.
  207. graph_name (str): The graph name.
  208. """
  209. for single_tensor_info in tensor_history:
  210. single_tensor_info['type'] = tensor_type
  211. single_tensor_info['graph_name'] = graph_name
  212. def search_nodes(self, pattern):
  213. """
  214. Search nodes by given pattern.
  215. Args:
  216. pattern (dict): Filter condition.
  217. - name (str): The name pattern.
  218. - graph_name (str): The graph name.
  219. - node_category (str): The node_category. Default: None
  220. - condition (dict): The additional filter condition.
  221. Returns:
  222. dict, the searched node.
  223. """
  224. graph_name = pattern.pop('graph_name', None)
  225. search_nodes = self.search_in_graph(pattern, graph_name)
  226. # construct to search tree
  227. graph = self._get_graph(graph_name=graph_name)
  228. format_nodes = graph.get_nodes(search_nodes)
  229. return {'nodes': format_nodes}
  230. def search_in_graph(self, pattern, graph_name=None):
  231. """
  232. Search nodes by given pattern.
  233. Args:
  234. pattern (dict): Filter condition.
  235. - name (str): The name pattern.
  236. - node_category (str): The node_category. Default: None.
  237. - condition (dict): The additional filter condition.
  238. graph_name (str): The graph name.
  239. Returns:
  240. list, the searched node list.
  241. """
  242. temp_node_list = []
  243. node_category = pattern.get('node_category')
  244. graph = self._get_graph(graph_name=graph_name)
  245. # filter nodes by name
  246. if pattern.get('name'):
  247. if node_category:
  248. # get leaf nodes for forward filter
  249. temp_node_list = graph.search_leaf_nodes_by_pattern(pattern.get('name'))
  250. else:
  251. # optimize search nodes
  252. temp_node_list = graph.search_nodes_by_pattern(pattern.get('name'))
  253. if not temp_node_list:
  254. log.debug("No node named %s", pattern.get('name'))
  255. return []
  256. # filter nodes by category
  257. if node_category:
  258. node_category = self._get_inner_node_category(node_category)
  259. condition = pattern['condition'].copy() if pattern.get('condition') else {}
  260. condition['search_range'] = temp_node_list
  261. temp_node_list = graph.search_nodes_by_category(node_category, condition=condition)
  262. return temp_node_list
  263. @staticmethod
  264. def _get_inner_node_category(node_category):
  265. """
  266. Get inner node category.
  267. Args:
  268. node_category (str): The node category supported in
  269. mindinsight.conditionmgr.condition.TargetTypeEnum.
  270. Returns:
  271. CategoryTypeEnum, the translated value.
  272. """
  273. try:
  274. res = CategoryTypeEnum(node_category)
  275. except ValueError as err:
  276. log.error("Invalid node category. %s", err)
  277. raise DebuggerParamValueError("Invalid node_category.")
  278. return res
  279. def get_nodes_by_scope(self, scope_name, graph_name):
  280. """
  281. Get node by a given scope name.
  282. Args:
  283. scope_name (str): The name of scope.
  284. graph_name (str): The relative graph_name of the watched node. Default: None.
  285. Returns:
  286. list[Node], a list of node.
  287. """
  288. if graph_name:
  289. graph = self._get_graph(graph_name)
  290. else:
  291. graph = self._whole_graph
  292. return graph.search_leaf_nodes_by_pattern(scope_name)
  293. def get_graph_id_by_name(self, node_name):
  294. """
  295. Get graph id by full name.
  296. Args:
  297. node_name (str): The name of the node.
  298. Returns:
  299. str, the graph name of the node.
  300. Raises:
  301. DebuggerNodeNotInGraphError: If can not find the node in all graphs.
  302. """
  303. if node_name:
  304. for graph_name, sub_graph in self._graph.items():
  305. if sub_graph.exist_node(name=node_name):
  306. return graph_name
  307. log.error('Failed to find node %s in graph. Please make sure the graph has been sent and '
  308. 'the node name is correct, and try again.', node_name)
  309. raise DebuggerGraphNotExistError
  310. def get_graph_id_by_full_name(self, node_name):
  311. """
  312. Get graph id by full name.
  313. Args:
  314. node_name (str): The full name of the node.
  315. Returns:
  316. str, the graph name of the node.
  317. Raises:
  318. DebuggerNodeNotInGraphError: If can not find the node in all graphs.
  319. """
  320. graph_id = self.graph_node_map.get(node_name) if node_name else None
  321. if not graph_id:
  322. log.warning("Failed to get graph id by full name: %s", node_name)
  323. return graph_id
  324. def get_node_type(self, node_name, graph_name=None):
  325. """
  326. Get the type of the specified node.
  327. Args:
  328. node_name (str): The debug name of the node.
  329. graph_name (str): The relative graph_name of the node. Default: None.
  330. Returns:
  331. A string of the node type, name_scope or leaf.
  332. """
  333. if graph_name:
  334. graph = self._get_graph(node_name=node_name, graph_name=graph_name)
  335. else:
  336. graph = self._whole_graph
  337. node_type = graph.get_node_type(node_name)
  338. return node_type
  339. def get_full_name(self, node_name, graph_name=None):
  340. """Get full name according to ui node name."""
  341. full_name = ''
  342. if node_name:
  343. graph = self._get_graph(node_name=node_name, graph_name=graph_name)
  344. full_name = graph.get_full_name_by_node_name(node_name)
  345. return full_name
  346. def get_node_basic_info(self, node_name, graph_name):
  347. """Get node basic info with graph scope."""
  348. graph_name, node_name = self._parse_node_name(node_name=node_name, graph_name=graph_name)
  349. graph = self._get_graph(graph_name, node_name)
  350. full_name = graph.get_full_name_by_node_name(node_name)
  351. node_type = graph.get_node_type(node_name)
  352. return self.construct_node_basic_info(full_name, graph_name, node_name, node_type)
  353. def get_tensor_graph(self, tensor_name, graph_name):
  354. """
  355. Get tensor graph according to node name.
  356. Args:
  357. tensor_name (str): Tensor name from UI, format is "node_name:slot".
  358. graph_name (str): The relative graph_name of the node. Default: None.
  359. Returns:
  360. dict, relative node.
  361. """
  362. node_name, _ = tensor_name.rsplit(':', 1)
  363. graph = self._get_graph(graph_name=graph_name, node_name=node_name)
  364. tensor_graph = graph.get_tensor_graph(node_name)
  365. return {'graph': tensor_graph}
  366. @staticmethod
  367. def construct_node_basic_info(full_name, graph_name, node_name, node_type):
  368. """Construct node basic info."""
  369. node_name_with_graph_scope = '/'.join([graph_name, node_name]) if node_name else graph_name
  370. return NodeBasicInfo(name=node_name_with_graph_scope, full_name=full_name, type=node_type)
  371. def get_node_basic_info_by_scope(self, scope_name, graph_name):
  372. """
  373. Get node by a given scope name.
  374. Args:
  375. scope_name (str): The name of scope.
  376. graph_name (str): The relative graph_name of the watched node. Default: None.
  377. Returns:
  378. list[NodeBasicInfo], a list of node.
  379. """
  380. graph_name, node_name = self._parse_node_name(scope_name, graph_name)
  381. graph = self._get_graph(graph_name)
  382. nodes = graph.search_leaf_nodes_by_pattern(node_name)
  383. res = [self.construct_node_basic_info(full_name=node.full_name,
  384. graph_name=graph_name,
  385. node_name=node.name,
  386. node_type=node.type) for node in nodes]
  387. return res
  388. def get_node_name_by_full_name(self, full_name, graph_name):
  389. """Get UI node name by full name and graph name."""
  390. if graph_name and full_name:
  391. graph = self._get_graph(graph_name)
  392. node_name = graph.get_node_name_by_full_name(full_name)
  393. else:
  394. node_name = ''
  395. log.debug("Get empty full name.")
  396. return node_name
  397. def get_node_by_bfs_order(self, node_name=None, ascend=True):
  398. """
  399. Traverse the graph in order of breath-first search by given node.
  400. Args:
  401. node_name (str): The name of current chosen leaf node.
  402. ascend (bool): If True, traverse the input nodes;
  403. If False, traverse the output nodes. Default is True.
  404. Returns:
  405. Union[None, dict], the next node object in dict type or None.
  406. """
  407. bfs_order = self.bfs_order
  408. length = len(bfs_order)
  409. if not bfs_order:
  410. log.error('Cannot get the BFS order of the graph!')
  411. msg = 'Cannot get the BFS order of the graph!'
  412. raise DebuggerParamValueError(msg)
  413. if node_name is None:
  414. if ascend is False:
  415. next_node = None
  416. else:
  417. next_node = bfs_order[0]
  418. else:
  419. try:
  420. index = bfs_order.index(node_name)
  421. log.debug("The index of the node in BFS list is: %d", index)
  422. except ValueError as err:
  423. log.error('Cannot find the node: %s. Please check '
  424. 'the node name: %s', node_name, err)
  425. msg = f'Cannot find the node: {node_name}. ' \
  426. f'Please check the node name {err}.'
  427. raise DebuggerParamValueError(msg)
  428. next_node = self._get_next_node_in_bfs(index, length, ascend)
  429. return next_node
  430. def _get_next_node_in_bfs(self, index, length, ascend):
  431. """
  432. Get the next node in bfs order.
  433. Args:
  434. index (int): The current index.
  435. length (int): The number of all leaf nodes.
  436. ascend (bool): Whether get the node in ascend order or not.
  437. Returns:
  438. Union[None, dict], the next node object in dict type or None.
  439. """
  440. next_node = None
  441. if 0 <= index < length:
  442. if ascend is True and index < length - 1:
  443. next_node = self.bfs_order[index + 1]
  444. elif ascend is False and index > 0:
  445. next_node = self.bfs_order[index - 1]
  446. return next_node
  447. def _graph_exists(self):
  448. """
  449. Check if the graph has been loaded in the debugger cache.
  450. Raises:
  451. DebuggerGraphNotExistError: If the graph does not exist.
  452. """
  453. if not self._graph:
  454. log.error('The graph does not exist. Please start the '
  455. 'training script and try again.')
  456. raise DebuggerGraphNotExistError
  457. def _get_graph(self, graph_name=None, node_name=None):
  458. """
  459. Get the graph object according to graph name and node name.
  460. Args:
  461. graph_name (str): The graph name.
  462. node_name (str): The node name.
  463. Returns:
  464. DebuggerGraph, the graph object.
  465. Raises:
  466. DebuggerGraphNotExistError: If the graph does not exist.
  467. """
  468. graph = self._graph.get(graph_name) if graph_name else self._whole_graph
  469. # get graph according to graph name and check the node
  470. if graph and (not node_name or graph.exist_node(name=node_name)):
  471. return graph
  472. log.error('The graph %s does not exist node %s.', graph_name, node_name)
  473. raise DebuggerGraphNotExistError
  474. def _has_graph_scope(self, graph_name):
  475. """Check if query with graph_scope."""
  476. return bool(graph_name is None and len(self._graph) > 1)
  477. def validate_graph_name(self, graph_name):
  478. """Validate graph_name."""
  479. if graph_name and self._graph.get(graph_name) is None:
  480. log.error("No graph named %s in debugger cache.", graph_name)
  481. raise DebuggerGraphNotExistError
  482. if not graph_name and len(self._graph) == 1:
  483. graph_name = self.graph_names[0]
  484. return graph_name
  485. def _add_graph_scope_for_nodes(self, nodes, graph_name):
  486. """
  487. Add graph scope for nodes.
  488. Args:
  489. nodes (list[Node]): List of nodes object.
  490. graph_name (str): The graph name.
  491. """
  492. def _get_updated_node_info(cur_node, node_type):
  493. """Add graph scope in key."""
  494. old_node = cur_node.get(node_type)
  495. if not old_node:
  496. return
  497. new_values = {}
  498. for old_name, node_info in old_node.items():
  499. new_name = '/'.join([graph_name, old_name]) if old_name else graph_name
  500. new_values[new_name] = node_info
  501. cur_node[node_type] = new_values
  502. for node in nodes:
  503. node['name'] = '/'.join([graph_name, node['name']]) if node['name'] else graph_name
  504. _get_updated_node_info(node, 'input')
  505. _get_updated_node_info(node, 'output')
  506. if node.get('nodes'):
  507. self._add_graph_scope_for_nodes(node.get('nodes'), graph_name)
  508. def _parse_node_name(self, node_name, graph_name):
  509. """
  510. Check if the node name should have graph scope.
  511. Args:
  512. node_name (str): The ui node name.
  513. graph_name (str): The graph name.
  514. Returns:
  515. str, parsed graph name.
  516. str, parsed node name.
  517. """
  518. node_name = '' if node_name is None else node_name
  519. if self._has_graph_scope(graph_name):
  520. names = node_name.split("/", 1)
  521. graph_name = names[0]
  522. node_name = names[1] if len(names) == 2 else ''
  523. if graph_name is None and len(self._graph) == 1:
  524. graph_name = self.graph_names[0]
  525. return graph_name, node_name
  526. def validate_node_name(self, node_name, graph_name):
  527. """
  528. Validate the graph exist the specified node.
  529. Args:
  530. node_name (str): The ui node name.
  531. graph_name (str): The graph name.
  532. Raises:
  533. DebuggerNodeNotInGraphError: If can not find the node in all graphs.
  534. """
  535. graph = self._get_graph(graph_name=graph_name)
  536. if not graph.exist_node(name=node_name):
  537. log.error("graph %s doesn't find node: %s.", graph_name, node_name)
  538. raise DebuggerNodeNotInGraphError(node_name)
  539. def _sort_graph(self, graphs):
  540. """
  541. Sort graph by graph_name.
  542. Args:
  543. graphs(dict): <graph_name, GraphProto object>.
  544. """
  545. if len(graphs) == 1:
  546. return graphs.items()
  547. sorted_graphs = sorted(graphs.items(), key=lambda x: get_graph_number(x[0]))
  548. return sorted_graphs
  549. def get_graph_number(graph_name):
  550. number = graph_name.split("_")[-1]
  551. return int(number)